import os

import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

token = os.getenv("HUGGINGFACE_TOKEN")
assert token is not None, "Hugging Face token is missing. Please set the 'HUGGINGFACE_TOKEN' environment variable."


# Load the pre-trained tokenizer and model.
model_name = "microsoft/Phi-4-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
# Use the EOS token as the padding token (needed for the padded tokenization below).
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    token=token
)


# Example usage: generate text once at startup as a quick sanity check.
prompt = "<|system|>You are a helpful assistant<|end|><|user|>What is the capital of France?<|end|><|assistant|>"

inputs = tokenizer(prompt, return_tensors="pt", padding=True, return_attention_mask=True).to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id,  # set explicitly to suppress the missing-pad-token warning
)

resp = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(resp)


app = FastAPI()

class EchoMessage(BaseModel):
    message: str

class Item(BaseModel):
    prompt: str


@app.post("/generate/")
async def generate_text(item: Item):
    # Wrap the user prompt in the same chat format used in the startup example.
    inp = f"<|system|>You are a helpful assistant<|end|><|user|>{item.prompt}<|end|><|assistant|>"
    inputs = tokenizer(inp, return_tensors="pt", padding=True, return_attention_mask=True).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,  # set explicitly to suppress the missing-pad-token warning
    )

    # Decode only the newly generated tokens so the response does not echo the prompt.
    resp = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)


    return {"response": resp}
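
# Example request against the /generate/ endpoint (assumes the server is
# running locally on port 8000; adjust the host/port for your setup):
#
#   curl -X POST http://localhost:8000/generate/ \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "What is the capital of France?"}'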

@app.get("/")
async def home():
    return {"msg": "hey"}

@app.post("/echo/")
async def echo(echo_msg: EchoMessage):
    return {"msg": echo_msg.message}
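

# Optional entry point: run the API directly with `python <this file>`.
# The host/port values below are assumptions; adjust them for your deployment,
# or launch the app with `uvicorn <module>:app` instead.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)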