AI-RESEARCHER-2024 committed
Commit fab8f7a · verified · 1 Parent(s): b1b786a

Create app.py

Files changed (1)
app.py +261 -0
app.py ADDED
@@ -0,0 +1,261 @@
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import spaces
import os

# Model configuration
# Replace with your desired model from the Hugging Face Hub
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"  # Example with Llama 3.2 3B
# For larger models, you might use: "meta-llama/Llama-3.1-8B-Instruct"
# Note: some models require access approval on Hugging Face

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize model and tokenizer
print(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

# For MoE (Mixture of Experts) models such as Mixtral, you would use:
# MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# This is an example of a model with multiple experts

@spaces.GPU(duration=60)  # Request a GPU for 60 seconds per inference
def generate_response(
    message,
    history,
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
):
    """Generate a response using the loaded model."""

    # Format the conversation history
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Apply the chat template if available
    if hasattr(tokenizer, "apply_chat_template"):
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    else:
        # Fallback formatting
        prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
        prompt += "\nassistant: "

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    # Move inputs to the model's device (more robust than `device` when device_map="auto" is used)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate the response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            repetition_penalty=repetition_penalty,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

    return response

# Alternative: using a pipeline (simpler, but offers less control)
def generate_with_pipeline(message, history, max_tokens=512, temperature=0.7):
    """Alternative generation using the transformers pipeline."""
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=device
    )

    messages = []
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    response = pipe(
        messages,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=True,
        return_full_text=False
    )

    return response[0]["generated_text"]

# Create the Gradio interface
with gr.Blocks(title="Open Source LLM Chat") as demo:
    gr.Markdown(f"""
    # 🤖 Open Source LLM Chat Interface

    **Model**: {MODEL_ID}

    This interface allows you to chat with open-source language models from Hugging Face.
    """)

    chatbot = gr.Chatbot(
        height=500,
        show_label=False,
        elem_id="chatbot"
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Message",
            placeholder="Type your message here...",
            lines=2,
            scale=4
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    with gr.Accordion("⚙️ Generation Settings", open=False):
        max_tokens = gr.Slider(
            minimum=50,
            maximum=2048,
            value=512,
            step=50,
            label="Max Tokens"
        )
        temperature = gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.7,
            step=0.1,
            label="Temperature (higher = more creative)"
        )
        top_p = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top P (nucleus sampling)"
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition Penalty"
        )

    with gr.Row():
        clear_btn = gr.Button("🗑️ Clear Chat")
        retry_btn = gr.Button("🔄 Retry Last")
        undo_btn = gr.Button("↩️ Undo")

    # Example prompts
    gr.Examples(
        examples=[
            "Explain quantum computing in simple terms",
            "Write a Python function to find prime numbers",
            "What are the key differences between supervised and unsupervised learning?",
            "Create a healthy meal plan for a week",
            "Explain the concept of blockchain technology"
        ],
        inputs=msg,
        label="Example Prompts"
    )

    # Event handlers
    def user_submit(message, history):
        # Append the user's message and clear the textbox
        return "", history + [[message, None]]

    def bot_response(history, max_tokens, temperature, top_p, repetition_penalty):
        if not history:
            return history

        message = history[-1][0]
        bot_message = generate_response(
            message,
            history[:-1],
            max_tokens,
            temperature,
            top_p,
            repetition_penalty
        )
        history[-1][1] = bot_message
        return history

    def clear_chat():
        return None

    def retry_last(history):
        # Drop the last assistant reply so it can be regenerated
        if history and history[-1][1]:
            history[-1][1] = None
            return history
        return history

    def undo_last(history):
        if history:
            return history[:-1]
        return history

    # Connect events
    msg.submit(
        user_submit,
        [msg, chatbot],
        [msg, chatbot]
    ).then(
        bot_response,
        [chatbot, max_tokens, temperature, top_p, repetition_penalty],
        chatbot
    )

    submit_btn.click(
        user_submit,
        [msg, chatbot],
        [msg, chatbot]
    ).then(
        bot_response,
        [chatbot, max_tokens, temperature, top_p, repetition_penalty],
        chatbot
    )

    clear_btn.click(clear_chat, outputs=chatbot)
    retry_btn.click(retry_last, chatbot, chatbot).then(
        bot_response,
        [chatbot, max_tokens, temperature, top_p, repetition_penalty],
        chatbot
    )
    undo_btn.click(undo_last, chatbot, chatbot)

    # Footer
    gr.Markdown("""
    ---
    💡 **Tips**:
    - Adjust temperature for more/less creative responses
    - Use repetition penalty to reduce repetitive text
    - Some models require Hugging Face access tokens
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()