from fastapi import FastAPI, HTTPException
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

app = FastAPI()
# Load the model once at startup and keep it on app.state
@app.on_event("startup")
async def load_model():
    try:
        # Configuration
        model_name = "unsloth/deepseek-r1-distill-llama-8b-unsloth-bnb-4bit"
        adapter_name = "LAWSA07/medical_fine_tuned_deepseekR1"

        # Load base model with 4-bit quantization
        # (quantization_config replaces the deprecated load_in_4bit kwarg)
        app.state.base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

        # Attach the PEFT adapter on top of the base model
        app.state.model = PeftModel.from_pretrained(
            app.state.base_model,
            adapter_name,
        )

        # Load tokenizer; Llama-family tokenizers may ship without a pad
        # token, which padding=True in /generate requires
        app.state.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if app.state.tokenizer.pad_token is None:
            app.state.tokenizer.pad_token = app.state.tokenizer.eos_token
    except Exception as e:
        # HTTPException only makes sense inside a request handler;
        # fail startup loudly with a RuntimeError instead
        raise RuntimeError(f"Model loading failed: {e}") from e
@app.get("/")
def health_check():
    return {"status": "OK"}
@app.post("/generate")
async def generate_text(prompt: str, max_length: int = 200):
    try:
        # Tokenize the prompt and move tensors to the model's device
        # (device_map="auto" may place the model anywhere, so avoid
        # hard-coding "cuda")
        inputs = app.state.tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
        ).to(app.state.model.device)

        outputs = app.state.model.generate(
            **inputs,
            max_new_tokens=max_length,  # cap on generated tokens, not prompt + output
            temperature=0.7,
            do_sample=True,
        )

        decoded = app.state.tokenizer.decode(
            outputs[0],
            skip_special_tokens=True,
        )
        return {"response": decoded}
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Generation failed: {e}",
        )
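
A quick way to exercise the service once it is running: because prompt and max_length are declared as scalar parameters, FastAPI reads them from the query string rather than a JSON body. A minimal client sketch, assuming the app is served locally on port 8000 (the URL and the example prompt are placeholders, not part of the original file):

import requests

# Placeholder URL; substitute the actual Space endpoint
resp = requests.post(
    "http://localhost:8000/generate",
    params={"prompt": "What are the symptoms of anemia?", "max_length": 200},
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["response"])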