import threading

import gradio as gr
import torch
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# --- Load Model & Tokenizer ---
base_model_name = "unsloth/llama-3.2-3b-bnb-4bit"
adapter_model_name = "aismaanly/ai_synthetic"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("Loading PEFT adapter...")
model = PeftModel.from_pretrained(model, adapter_model_name)
model = model.merge_and_unload()
print("Model ready!")


# --- Gradio Streaming Function ---
def chat_fn(message, history, max_tokens):
    # The raw user message is used as the prompt; the chat history is not
    # folded into the input.
    prompt = message
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # skip_prompt=True keeps the echoed prompt out of the streamed output.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        do_sample=True,  # sampling must be enabled for temperature to take effect
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Run generation in a background thread so tokens can be streamed as they arrive.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    thread.join()


# --- Create Dropdown Component for max tokens ---
dropdown = gr.Dropdown(
    choices=["100", "200", "300"],
    value="100",
    label="Max New Tokens",
)

# --- Launch Gradio Chat Interface ---
gr.ChatInterface(
    fn=chat_fn,
    additional_inputs=[dropdown],
    title="LLM Finetuned Comment Generator",
    description="Chat with the model.",
).launch(share=False)
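
# --- Optional: quick sanity check without the UI (illustrative sketch) ---
# A minimal, hypothetical snippet showing how chat_fn yields growing partial
# outputs. The prompt below is only an example; run this in place of the
# .launch() call above (launch() blocks), so it is left commented out here.
#
# for partial in chat_fn("Write a short product comment.", history=[], max_tokens="100"):
#     print(partial)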