# app.py
import torch
import gradio as gr
import spaces  # 👈 Required for ZeroGPU
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# -------------------------------------------------
# MODEL LOADING
# -------------------------------------------------
BASE_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ADAPTER_MODEL = "GilbertAkham/deepseek-R1-multitask-lora"

print("🔄 Loading base model and LoRA adapter...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization for GPU memory efficiency
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
model.eval()

print("✅ Model and tokenizer loaded successfully!")

# -------------------------------------------------
# GPU INFERENCE FUNCTION
# -------------------------------------------------
@spaces.GPU  # 👈 Required for ZeroGPU runtime
def generate_response(message, history, system_message, max_tokens, temperature, top_p):
    """
    Generates text using the multitask LoRA model.
    Supports reasoning, chat, summarization, story continuation, etc.
    """
    prompt = f"{system_message}\n\n"
    # With type="messages", history is a list of {"role", "content"} dicts,
    # one entry per message (not one dict per user/assistant pair).
    for turn in history:
        speaker = "User" if turn["role"] == "user" else "Assistant"
        prompt += f"{speaker}: {turn['content']}\n"
    prompt += f"User: {message}\nAssistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )

    text = tokenizer.decode(output[0], skip_special_tokens=True)
    # Keep only the text generated after the final "Assistant:" marker.
    answer = text.split("Assistant:")[-1].strip()
    return answer

# -------------------------------------------------
# GRADIO CHAT INTERFACE
# -------------------------------------------------
chatbot = gr.ChatInterface(
    fn=generate_response,
    type="messages",
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are Chat-Bot, a helpful and logical assistant trained for "
                "reasoning, email, chatting, summarization, story continuation, and report writing."
            ),
            label="🧠 System Message",
        ),
        gr.Slider(64, 2048, value=512, step=16, label="📝 Max New Tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="🌡️ Temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="🎯 Top-p"),
    ],
)

# -------------------------------------------------
# UI LAYOUT
# -------------------------------------------------
with gr.Blocks(title="Gilbert Multitask Reasoning AI") as demo:
    with gr.Sidebar():
        gr.Markdown("## 💡 About This App")
        gr.Markdown(
            """
- **Model:** `GilbertAkham/deepseek-R1-multitask-lora`
- **Base:** `deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B`
- **Capabilities:** 🧩 Reasoning, 🗣️ Chat, 📧 Email writing,
  📚 Summarization, ✍️ Story continuation, 🧾 Report generation
- **ZeroGPU Enabled:** GPU spins up only when generating responses.
"""
        )
    chatbot.render()

# -------------------------------------------------
# LAUNCH
# -------------------------------------------------
if __name__ == "__main__":
    demo.launch()
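
# -------------------------------------------------
# OPTIONAL: DIRECT SMOKE TEST (sketch)
# -------------------------------------------------
# A minimal sketch of calling generate_response without the Gradio UI,
# handy for debugging prompt formatting. This block is an illustrative
# assumption, not part of the original app: uncomment it and disable
# demo.launch() above before running, since launch() blocks the main thread.
#
# if __name__ == "__main__":
#     reply = generate_response(
#         message="Summarize the water cycle in two sentences.",
#         history=[],  # messages-format history: list of {"role", "content"} dicts
#         system_message="You are a helpful assistant.",
#         max_tokens=128,
#         temperature=0.7,
#         top_p=0.9,
#     )
#     print(reply)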