import os

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import uvicorn

# The Hugging Face token must be provided as an environment variable.
token = os.getenv("HUGGINGFACE_TOKEN")
assert token is not None, "Hugging Face token is missing. Please set the 'HUGGINGFACE_TOKEN' environment variable."
# Load the pre-trained tokenizer and model.
model_name = "microsoft/Phi-4-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
# The model has no dedicated pad token, so reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    token=token,
)
# Example usage: run one generation at startup as a quick smoke test.
prompt = "<|system|>You are a helpful assistant<|end|><|user|>What is the capital of France?<|end|><|assistant|>"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, return_attention_mask=True).to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,  # Set explicitly to suppress the padding warning.
)
resp = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(resp)
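
# Alternative (a sketch, assuming the model repo ships a chat template): the prompt
# string above could be built with the tokenizer's chat template instead of
# hand-writing the special tokens:
#   messages = [
#       {"role": "system", "content": "You are a helpful assistant"},
#       {"role": "user", "content": "What is the capital of France?"},
#   ]
#   prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)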
app = FastAPI()

class EchoMessage(BaseModel):
    message: str

class Item(BaseModel):
    prompt: str
@app.post("/generate/")
async def generate_text(item: Item):
    # Wrap the user prompt in the Phi-4 chat-format markers.
    inp = f"<|system|>You are a helpful assistant<|end|><|user|> {item.prompt} <|end|><|assistant|>"
    inputs = tokenizer(inp, return_tensors="pt", padding=True, return_attention_mask=True).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,  # Set explicitly to suppress the padding warning.
    )
    # Decode only the newly generated tokens so the prompt is not echoed back to the client.
    resp = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return {"response": resp}
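
# Example request (a hypothetical client call; the URL assumes the server is
# reachable locally on port 7860, the Spaces default):
#   curl -X POST http://localhost:7860/generate/ \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "What is the capital of France?"}'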
@app.get("/")
async def home():
    return {"msg": "hey"}

@app.post("/echo/")
async def echo(echo_msg: EchoMessage):
    return {"msg": echo_msg.message}
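
# Entry point for running the server directly (a sketch; assumes this file is
# executed as the main module and that port 7860, the Spaces default, is free).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)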