Molchevsky committed
Commit 423e539 · 1 Parent(s): 630e7aa

many updates

Files changed (1):
  app.py  +57 -103
app.py CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-from threading import Thread
 import traceback
 
 # Fixed system prompt (your "persona")
@@ -32,8 +31,8 @@ def load_model():
         # CPU-optimized model loading
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch.float32,  # Use float32 for CPU
-            device_map=None,  # Don't use device_map for CPU
+            torch_dtype=torch.float32,
+            device_map=None,
             low_cpu_mem_usage=True,
             trust_remote_code=True,
             use_cache=True,
@@ -41,11 +40,9 @@ def load_model():
 
         # Explicitly move to CPU
         model = model.to('cpu')
+        model.eval()  # Set to evaluation mode
 
         print(f"Model loaded successfully on CPU!")
-        print(f"Model device: {next(model.parameters()).device}")
-        print(f"Model dtype: {next(model.parameters()).dtype}")
-
         return True
 
     except Exception as e:
@@ -56,121 +53,78 @@ def load_model():
 # Load model at startup
 model_loaded = load_model()
 
-def respond(
-    message,
-    history: list[dict[str, str]],
-    max_tokens,
-    temperature,
-    top_p,
-):
+def simple_respond(message, history, max_tokens, temperature, top_p):
     """
-    Generate response using CPU inference.
+    Simple non-streaming generation for debugging.
     """
     if not model_loaded or model is None or tokenizer is None:
-        yield "Error: Model not loaded properly. Please check the logs."
-        return
+        return "Error: Model not loaded properly."
 
     try:
-        print(f"Processing message: {message}")
-
-        # Keep conversation history manageable for CPU
-        recent_history = history[-3:] if len(history) > 3 else history
-
-        # Build simple conversation format
-        conversation_text = f"{SYSTEM_PROMPT}\n\n"
-
-        # Add recent history
-        for msg in recent_history:
-            if msg.get("role") == "user":
-                conversation_text += f"User: {msg['content']}\n"
-            elif msg.get("role") == "assistant":
-                conversation_text += f"Assistant: {msg['content']}\n"
+        print(f"Processing: {message}")
 
-        conversation_text += f"User: {message}\nAssistant:"
+        # Very simple prompt
+        prompt = f"User: {message}\nAssistant:"
+        print(f"Prompt: {repr(prompt)}")
 
-        print(f"Prompt length: {len(conversation_text)}")
+        # Tokenize
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+        print(f"Input shape: {inputs.input_ids.shape}")
+        print(f"Input tokens: {inputs.input_ids[0][:10]}")  # First 10 tokens
 
-        # Tokenize - keep it simple for CPU
-        inputs = tokenizer(
-            conversation_text,
-            return_tensors="pt",
-            truncation=True,
-            max_length=1024,  # Shorter for CPU
-            padding=False
-        )
-
-        print(f"Input tokens shape: {inputs.input_ids.shape}")
-
-        # CPU-optimized generation with streaming
-        streamer = TextIteratorStreamer(
-            tokenizer,
-            timeout=120,  # Longer timeout for CPU
-            skip_prompt=True,
-            skip_special_tokens=True
-        )
+        # Simple generation - no streaming
+        print("Starting generation...")
+        with torch.no_grad():
+            outputs = model.generate(
+                inputs.input_ids,
+                attention_mask=inputs.attention_mask,
+                max_new_tokens=20,  # Very small for testing
+                temperature=0.7,
+                do_sample=True,
+                pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+            )
 
-        generation_kwargs = {
-            "input_ids": inputs.input_ids,
-            "attention_mask": inputs.attention_mask,
-            "streamer": streamer,
-            "max_new_tokens": min(max_tokens, 200),  # Limit for CPU
-            "temperature": temperature,
-            "top_p": top_p,
-            "do_sample": True,
-            "pad_token_id": tokenizer.eos_token_id,
-            "eos_token_id": tokenizer.eos_token_id,
-            "use_cache": True,
-            # CPU-specific optimizations
-            "num_beams": 1,  # No beam search for speed
-        }
+        print("Generation completed!")
+        print(f"Output shape: {outputs.shape}")
 
-        print("Starting CPU generation...")
+        # Decode only the new tokens
+        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
+        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
 
-        # Start generation in thread
-        generation_thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        generation_thread.start()
+        print(f"Response: {repr(response)}")
 
-        # Stream response
-        response = ""
-        token_count = 0
+        if not response.strip():
+            return "Model generated empty response. This might be a model configuration issue."
 
-        for token in streamer:
-            response += token
-            token_count += 1
-
-            # Yield periodically for better UX
-            if token_count % 5 == 0 or len(response) > len(response.split()[-1]):
-                yield response
-
-        # Final yield
-        yield response
-        print(f"Generation completed. Response length: {len(response)}")
+        return response.strip()
 
     except Exception as e:
-        error_msg = f"Error in generation: {str(e)}"
+        error_msg = f"Error: {str(e)}"
         print(error_msg)
         print(traceback.format_exc())
-        yield error_msg
-
-# Create the chat interface
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
+        return error_msg
 
+# Create simple interface for testing
 with gr.Blocks() as demo:
-    chatbot.render()
+    gr.Markdown("# Debug Version - Simple Generation Test")
+
+    with gr.Row():
+        msg_input = gr.Textbox(label="Message", placeholder="Type your message...")
+        send_btn = gr.Button("Send")
+
+    output = gr.Textbox(label="Response", lines=5)
+
+    # Simple controls
+    max_tokens = gr.Slider(1, 100, value=20, label="Max Tokens")
+    temperature = gr.Slider(0.1, 2.0, value=0.7, label="Temperature")
+    top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top-p")
+
+    send_btn.click(
        simple_respond,
+        inputs=[msg_input, gr.State([]), max_tokens, temperature, top_p],
+        outputs=output
+    )
 
 if __name__ == "__main__":
     demo.launch(debug=True)
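
For reference, the commit replaces the threaded TextIteratorStreamer pipeline with a plain tokenize → generate → decode path run under torch.no_grad(). The standalone sketch below reproduces that path outside Gradio to sanity-check the model; the checkpoint name is a placeholder (the real model_name is defined earlier in app.py and is not part of this diff), and it assumes the same CPU/float32 setup shown above.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "your-model-id"  # assumption: replace with the checkpoint used in app.py

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,   # float32 on CPU, as in the commit
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
model.to("cpu")
model.eval()

prompt = "User: Hello!\nAssistant:"
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=20,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode only the newly generated tokens, mirroring simple_respond
new_tokens = outputs[0][inputs.input_ids.shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))

If this prints a sensible continuation, the empty-response and hanging-generation symptoms are in the streaming/UI layer rather than in the model itself, which is what the debug version in this commit is meant to isolate.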