Major problem, couldn't find a fix
#40 by AladinBroDev - opened
ValueError: .to is not supported for 4-bit or 8-bit bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct dtype.
And this is my code, please help me fix it:
from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import json
import re
app = FastAPI()
# --- Configuration ----------------------------------------------------------
MODEL_NAME = "HuggingFaceH4/zephyr-7b-alpha"
SYSTEM_PROMPT = """You are an expert Agile product coach. When asked to plan a project:
- Provide 2-3 line summary
- List Epics with descriptions
- Include 3-5 User Stories per Epic
- Recommend tech stack
- Add estimates and prioritization
Format your reply with markdown sections followed by JSON:
```json
{"project": {"summary": "...", "epics": [...]}}```"""
# --- Pydantic Models --------------------------------------------------------
class ChatRequest(BaseModel):
    message: str
    history: list = []

class ChatResponse(BaseModel):
    reply: str
    history: list
    structured_data: dict = None
# --- Model Initialization ---------------------------------------------------
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quant_config
)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)
# --- Helper Functions -------------------------------------------------------
def extract_json(text: str) -> dict:
    match = re.search(r'```json\n(.*?)\n```', text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            return None
    return None
def format_messages(history: list, new_message: str) -> list:
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for i, content in enumerate(history):
        messages.append({
            "role": "user" if i % 2 == 0 else "assistant",
            "content": content
        })
    messages.append({"role": "user", "content": new_message})
    return messages
# --- API Endpoint -----------------------------------------------------------
@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    try:
        # Format conversation history
        messages = format_messages(request.history, request.message)
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # Generate response
        outputs = pipe(
            prompt,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )
        full_response = outputs[0]["generated_text"][len(prompt):]
        structured_data = extract_json(full_response)
        # Clean response text
        clean_response = re.sub(r'<\|.*?\|>', '', full_response).strip()
        return ChatResponse(
            reply=clean_response,
            history=request.history + [request.message, clean_response],
            structured_data=structured_data
        )
    except Exception as e:
        return {"error": str(e)}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
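From what I can tell, the device=0 argument in the pipeline(...) call is the likely culprit: when a device is given, the pipeline tries to move the model with model.to(device), and bitsandbytes-quantized models reject that, which matches the ValueError above. Below is a minimal sketch of how the model could be loaded and the pipeline built instead, keeping the same MODEL_NAME and quantization settings; the only changes are device_map="auto" at load time and no device argument for the pipeline. This is a guess at the fix, not a confirmed solution, and the rest of the FastAPI code should not need to change.

# Minimal sketch of the suspected fix (same model and quantization settings as above).
# Idea: let accelerate place the 4-bit weights via device_map="auto" and do NOT pass
# device= to pipeline(), so nothing calls model.to(...) on the quantized model.
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

MODEL_NAME = "HuggingFaceH4/zephyr-7b-alpha"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quant_config,
    device_map="auto"  # accelerate decides where the quantized weights live
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
    # no device= here: the quantized model is already on the right device,
    # and passing one would trigger the ".to is not supported" ValueError
)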