Spaces:

AI-RESEARCHER-2024
/

Creighton

Runtime error

App Files Files Community

AI-RESEARCHER-2024 committed on 22 days ago

Commit

fab8f7a

verified ·

1 Parent(s): b1b786a

Create app.py

Browse files

Files changed (1) hide show

app.py +261 -0

app.py ADDED Viewed

	@@ -0,0 +1,261 @@

+import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import spaces
+import os
+# --- Model configuration ---
+# Hub repository id of the causal language model served by this app.
+# Swap in any other repo id; a larger option is "meta-llama/Llama-3.1-8B-Instruct".
+# Note: some repositories require access approval on Hugging Face.
+MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"  # Example with Llama 3.2 3B
+# Prefer CUDA when torch reports it as available, otherwise run on CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Tokenizer and weights are loaded once, at import time.
+print(f"Loading model: {MODEL_ID}")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+# Half precision on GPU, full precision on CPU.
+if device == "cuda":
+    _weights_dtype = torch.float16
+else:
+    _weights_dtype = torch.float32
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=_weights_dtype,
+    device_map="auto",
+    trust_remote_code=True,
+    low_cpu_mem_usage=True,
+)
+# For MoE (Mixture of Experts) models like Mixtral, you would use:
+# MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+# This is an example of a model with multiple experts
+@spaces.GPU(duration=60)  # Request GPU for 60 seconds per inference
+def generate_response(
+    message,
+    history,
+    max_tokens=512,
+    temperature=0.7,
+    top_p=0.95,
+    repetition_penalty=1.1,
+):
+    """Generate the assistant's reply to ``message`` with the loaded model.
+
+    Args:
+        message: The newest user message.
+        history: Earlier turns as ``(user_msg, assistant_msg)`` pairs; a
+            falsy ``assistant_msg`` is skipped.
+        max_tokens: Upper bound on newly generated tokens.
+        temperature: Sampling temperature passed to ``model.generate``.
+        top_p: Nucleus-sampling threshold passed to ``model.generate``.
+        repetition_penalty: Repetition penalty passed to ``model.generate``.
+
+    Returns:
+        The decoded continuation only (prompt tokens are sliced off),
+        with special tokens removed.
+    """
+    # Rebuild the conversation as role/content dicts.
+    messages = []
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            messages.append({"role": "assistant", "content": assistant_msg})
+    messages.append({"role": "user", "content": message})
+    # Every tokenizer has the apply_chat_template *method*; what matters is
+    # whether a template is actually configured, so test that instead.
+    if getattr(tokenizer, "chat_template", None):
+        prompt = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        # The rendered template already carries its own special tokens
+        # (e.g. BOS); adding them again would duplicate them.
+        add_special_tokens = False
+    else:
+        # Fallback formatting
+        prompt = "\n".join(f"{msg['role']}: {msg['content']}" for msg in messages)
+        prompt += "\nassistant: "
+        add_special_tokens = True
+    encoded = tokenizer(
+        prompt,
+        return_tensors="pt",
+        add_special_tokens=add_special_tokens,
+    )
+    # Cap the prompt at 2048 tokens by dropping the OLDEST tokens, so the
+    # newest user message and the generation prompt at the end survive.
+    max_prompt_tokens = 2048
+    # Send inputs to wherever device_map="auto" placed the model.
+    inputs = {
+        name: tensor[:, -max_prompt_tokens:].to(model.device)
+        for name, tensor in encoded.items()
+    }
+    # Generate response
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=int(max_tokens),  # UI sliders may deliver a float
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True,
+            repetition_penalty=repetition_penalty,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+    # Decode only the tokens produced after the prompt.
+    prompt_length = inputs["input_ids"].shape[1]
+    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
+    return response
+# Alternative: Using pipeline (simpler but less control)
+def generate_with_pipeline(message, history, max_tokens=512, temperature=0.7):
+    """Generate a reply through a transformers ``text-generation`` pipeline.
+
+    Args:
+        message: The newest user message.
+        history: Earlier turns as ``(user_msg, assistant_msg)`` pairs; a
+            falsy ``assistant_msg`` is skipped.
+        max_tokens: Upper bound on newly generated tokens.
+        temperature: Sampling temperature.
+
+    Returns:
+        The ``generated_text`` field of the first pipeline result.
+    """
+    # No ``device`` argument here: the model was loaded with
+    # device_map="auto", so it is already placed, and the pipeline must not
+    # be asked to move it to a specific device.
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+    )
+    # Rebuild the conversation as role/content dicts.
+    messages = []
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            messages.append({"role": "assistant", "content": assistant_msg})
+    messages.append({"role": "user", "content": message})
+    response = pipe(
+        messages,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        do_sample=True,
+        return_full_text=False
+    )
+    return response[0]['generated_text']
+# Create Gradio interface
+# Components are declared in the order they should appear on the page.
+with gr.Blocks(title="Open Source LLM Chat") as demo:
+    # Page header showing which model is being served.
+    gr.Markdown(f"""
+    # 🤖 Open Source LLM Chat Interface
+    **Model**: {MODEL_ID}
+    This interface allows you to chat with open-source language models from Hugging Face.
+    """)
+    # Conversation display; its value is the chat history passed to the handlers.
+    chatbot = gr.Chatbot(
+        height=500,
+        show_label=False,
+        elem_id="chatbot"
+    )
+    # Input row: message box (scale 4) next to the send button (scale 1).
+    with gr.Row():
+        msg = gr.Textbox(
+            label="Message",
+            placeholder="Type your message here...",
+            lines=2,
+            scale=4
+        )
+        submit_btn = gr.Button("Send", variant="primary", scale=1)
+    # Sampling controls, collapsed by default; their values are forwarded
+    # to the generation function on every request.
+    with gr.Accordion("⚙️ Generation Settings", open=False):
+        max_tokens = gr.Slider(
+            minimum=50,
+            maximum=2048,
+            value=512,
+            step=50,
+            label="Max Tokens"
+        )
+        temperature = gr.Slider(
+            minimum=0.1,
+            maximum=2.0,
+            value=0.7,
+            step=0.1,
+            label="Temperature (higher = more creative)"
+        )
+        top_p = gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top P (nucleus sampling)"
+        )
+        repetition_penalty = gr.Slider(
+            minimum=1.0,
+            maximum=2.0,
+            value=1.1,
+            step=0.1,
+            label="Repetition Penalty"
+        )
+    # Conversation management buttons.
+    with gr.Row():
+        clear_btn = gr.Button("🗑️ Clear Chat")
+        retry_btn = gr.Button("🔄 Retry Last")
+        undo_btn = gr.Button("↩️ Undo")
+    # Example prompts; selecting one fills the message box (inputs=msg).
+    gr.Examples(
+        examples=[
+            "Explain quantum computing in simple terms",
+            "Write a Python function to find prime numbers",
+            "What are the key differences between supervised and unsupervised learning?",
+            "Create a healthy meal plan for a week",
+            "Explain the concept of blockchain technology"
+        ],
+        inputs=msg,
+        label="Example Prompts"
+    )
+    # Event handlers
+    def user_submit(message, history):
+        """Queue ``message`` as a new turn awaiting a reply.
+
+        Returns a pair: an empty string (clears the message box) and a new
+        history list with ``[message, None]`` appended. ``history`` may be
+        ``None`` (the value ``clear_chat`` writes), which is treated as an
+        empty conversation instead of raising ``TypeError``.
+        """
+        return "", (history or []) + [[message, None]]
+    def bot_response(history, max_tokens, temperature, top_p, repetition_penalty):
+        """Fill in the reply slot of the last turn in ``history``.
+
+        The last entry's first element is sent to ``generate_response``
+        together with all earlier turns and the sampling settings; the
+        result is stored as that entry's second element. An empty history
+        is returned untouched.
+        """
+        if history:
+            *earlier_turns, pending_turn = history
+            pending_turn[1] = generate_response(
+                pending_turn[0],
+                earlier_turns,
+                max_tokens,
+                temperature,
+                top_p,
+                repetition_penalty,
+            )
+        return history
+    def clear_chat():
+        """Return ``None``, the value written to the chatbot to reset it."""
+        return None
+    def retry_last(history):
+        """Blank the last turn's reply so it can be generated again.
+
+        The history is modified in place and returned; it is returned
+        unchanged when it is empty or the last turn has no reply.
+        """
+        has_reply = bool(history) and bool(history[-1][1])
+        if has_reply:
+            history[-1][1] = None
+        return history
+    def undo_last(history):
+        """Return the history without its last turn (unchanged if empty)."""
+        return history[:-1] if history else history
+    # Connect events
+    # Inputs shared by every call to bot_response.
+    generation_inputs = [chatbot, max_tokens, temperature, top_p, repetition_penalty]
+    # Pressing Enter in the textbox and clicking "Send" do the same thing:
+    # record the user turn, then generate the reply.
+    for register_trigger in (msg.submit, submit_btn.click):
+        register_trigger(
+            user_submit,
+            [msg, chatbot],
+            [msg, chatbot]
+        ).then(
+            bot_response,
+            generation_inputs,
+            chatbot
+        )
+    clear_btn.click(clear_chat, outputs=chatbot)
+    # Retry: blank the last reply, then generate it again.
+    retry_btn.click(retry_last, chatbot, chatbot).then(
+        bot_response,
+        generation_inputs,
+        chatbot
+    )
+    undo_btn.click(undo_last, chatbot, chatbot)
+    # Footer: static usage tips rendered below the controls.
+    gr.Markdown("""
+    ---
+    💡 **Tips**:
+    - Adjust temperature for more/less creative responses
+    - Use repetition penalty to reduce repetitive text
+    - Some models require Hugging Face access tokens
+    """)
+# Launch the app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    demo.launch()