from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gradio as gr
import torch

# Quantization config (4-bit to cut memory use and speed up loading)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # 4-bit precision
    bnb_4bit_quant_type="nf4",             # NF4 for better accuracy
    bnb_4bit_compute_dtype=torch.float16,  # use float16 for computation
)

# Load Phi-2 (smaller model with high-quality responses)
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,  # actually apply the 4-bit config defined above
    device_map="auto",                 # device placement belongs here, not in BitsAndBytesConfig
)

# Speed up inference with torch.compile (optional; gains can vary with quantized models)
model = torch.compile(model)

def respond(message, history):
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,                             # passes input_ids and attention_mask
        max_new_tokens=50,
        do_sample=True,                       # required for temperature/top_p to take effect
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,  # Phi-2 has no pad token; reuse EOS
    )
    # Decode only the newly generated tokens, not the echoed prompt
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
    return response

# Gradio Chat Interface
gr.ChatInterface(
    respond,
    title="🤖 Phi-2 Chatbot",
    description="Ask me anything! Powered by Phi-2.",
    examples=["What's your favorite book?", "Tell me a fun fact about space!"],
    theme="soft",
).launch()
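
# --- Optional: multi-turn prompting (a sketch, commented out so it doesn't run) ---
# gr.ChatInterface passes the running conversation as `history`, but the
# `respond` above ignores it, so the bot has no memory between turns. Phi-2
# has no official chat template, so the "User:/Assistant:" framing below is
# an assumed convention, not the model's documented format; it also assumes
# Gradio's tuple-style history of (user, bot) pairs. To try it, swap this in
# for `respond` above before calling launch():
#
# def respond(message, history):
#     prompt = ""
#     for user_turn, bot_turn in history:
#         prompt += f"User: {user_turn}\nAssistant: {bot_turn}\n"
#     prompt += f"User: {message}\nAssistant:"
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=50,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.9,
#         pad_token_id=tokenizer.eos_token_id,
#     )
#     return tokenizer.decode(outputs[0][inputs.input_ids.shape[-1]:],
#                             skip_special_tokens=True)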