from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gradio as gr
import torch

# Quantization config (4-bit to cut memory use and speed up loading)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # 4-bit precision
    bnb_4bit_quant_type="nf4",             # NF4 for better accuracy
    bnb_4bit_compute_dtype=torch.float16,  # use float16 for computation
)

# Load Phi-2 (smaller model with high-quality responses)
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,  # actually apply the 4-bit config defined above
    device_map="auto",                 # device placement belongs here, not in BitsAndBytesConfig
)

# Speed up inference with torch.compile (optional; gains can vary with quantized models)
model = torch.compile(model)

def respond(message, history):
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,                             # passes input_ids and attention_mask
        max_new_tokens=50,
        do_sample=True,                       # required for temperature/top_p to take effect
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,  # Phi-2 has no pad token; reuse EOS
    )
    # Decode only the newly generated tokens, not the echoed prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
    return response

# Gradio Chat Interface
gr.ChatInterface(
    respond,
    title="🤖 Phi-2 Chatbot",
    description="Ask me anything! Powered by Phi-2.",
    examples=["What's your favorite book?", "Tell me a fun fact about space!"],
    theme="soft",
).launch()
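
# --- Optional: multi-turn prompting (a sketch, commented out so it doesn't run) ---
# gr.ChatInterface passes the running conversation as `history`, but the
# `respond` above ignores it, so the bot has no memory between turns. Phi-2
# has no official chat template, so the "User:/Assistant:" framing below is
# an assumed convention, not the model's documented format; it also assumes
# Gradio's tuple-style history of (user, bot) pairs. To try it, swap this in
# for `respond` above before calling launch():
#
# def respond(message, history):
#     prompt = ""
#     for user_turn, bot_turn in history:
#         prompt += f"User: {user_turn}\nAssistant: {bot_turn}\n"
#     prompt += f"User: {message}\nAssistant:"
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=50,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.9,
#         pad_token_id=tokenizer.eos_token_id,
#     )
#     return tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:],
#                             skip_special_tokens=True)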