import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Your adapter (LoRA fine-tuned model on Hugging Face)
ADAPTER_ID = "Anabury/My_Finetuned_Phi-4"

# Detect device
USE_GPU = torch.cuda.is_available()

# Pick base model depending on device
if USE_GPU:
    BASE_MODEL = "unsloth/phi-4-unsloth-bnb-4bit"  # fast + 4-bit quantized
else:
    BASE_MODEL = "unsloth/phi-4"  # full precision for CPU

print(f"Loading base model: {BASE_MODEL} on {'GPU' if USE_GPU else 'CPU'}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

# Load base model
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto" if USE_GPU else None,
    torch_dtype=torch.float16 if USE_GPU else torch.float32,
    trust_remote_code=True,
)

# Attach your LoRA adapter
model = PeftModel.from_pretrained(base, ADAPTER_ID)
model.eval()


# Chat function
def chat(message, history):
    # Simple prompt for now; you can swap in the chat template later
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens so the reply doesn't echo the prompt
    reply = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )
    history.append((message, reply))
    # Return an empty string to clear the textbox, plus the updated chat history
    return "", history


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Phi-4 Chatbot (Fine-tuned)")
    chatbot = gr.Chatbot(height=420)
    msg = gr.Textbox(placeholder="Ask me anything…")
    clear = gr.Button("Clear")

    msg.submit(chat, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: [], None, chatbot, queue=False)

demo.launch()
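
# --- Optional: chat-template prompting (a minimal sketch, not wired into the app above) ---
# chat() currently feeds the raw message straight to the tokenizer. If the tokenizer
# ships a chat template (an assumption to verify for the Phi-4 checkpoints you use),
# you could build the prompt with apply_chat_template instead; `messages` and
# `prompt` below are illustrative names, not part of the original script.
#
#     messages = [{"role": "user", "content": message}]
#     prompt = tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)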