Frankie-walsh4 committed on
Commit cddec45 · 1 Parent(s): 387c509
Files changed (1)
  1. app.py +89 -44
app.py CHANGED
@@ -29,10 +29,11 @@ headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
 # Global variable to track if model is warmed up
 model_warmed_up = False
 warming_up = False
+estimated_time = None
 
 def warm_up_model():
     """Send a warmup request to get the model loaded before user interaction"""
-    global warming_up, model_warmed_up
+    global warming_up, model_warmed_up, estimated_time
 
     if warming_up:
         return  # Already warming up
@@ -40,38 +41,49 @@
     warming_up = True
     print("Starting model warm-up...")
 
-    # Simple warmup message
-    warmup_messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "Hello"}
-    ]
-
-    # Try direct API approach first
-    try:
-        payload = {
-            "inputs": warmup_messages,
-            "parameters": {
-                "max_new_tokens": 5,  # Just need a short response
-                "temperature": 0.1,
-                "top_p": 0.95,
-            },
-            "stream": False,
-        }
-
-        print("Sending warmup request...")
-        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
+    # Try up to 10 times with increasing delays
+    for attempt in range(1, 11):
+        try:
+            # Simple check if model is loaded
+            print(f"Warmup attempt {attempt}/10...")
+            response = requests.get(
+                API_URL,
+                headers=headers
+            )
+
+            print(f"Status: {response.status_code}")
+            response_json = response.json() if response.text else {}
+            print(f"Response: {response_json}")
+
+            # If we get a 200, the model is loaded
+            if response.status_code == 200:
+                print("Warmup successful! Model is ready.")
+                model_warmed_up = True
+                return
+
+            # If model is loading, get the estimated time
+            if response.status_code == 503 and "estimated_time" in response_json:
+                est_time = response_json["estimated_time"]
+                estimated_time = est_time
+                print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
+
+                # If estimated time is long, wait longer
+                wait_time = min(30, max(10, est_time / 4))  # Cap at 30 seconds, minimum 10
+                print(f"Waiting {wait_time:.2f} seconds before next check...")
+                time.sleep(wait_time)
+            else:
+                # Other error, wait and retry
+                wait_time = 10 * attempt  # Increase wait time with each attempt
+                print(f"Waiting {wait_time} seconds before retry...")
+                time.sleep(wait_time)
 
-        if response.status_code == 200:
-            print("Warmup successful!")
-            model_warmed_up = True
-        else:
-            print(f"Warmup API call failed with status {response.status_code}")
-            print(f"Response: {response.text}")
-    except Exception as e:
-        print(f"Warmup exception: {str(e)}")
+        except Exception as e:
+            print(f"Warmup exception: {str(e)}")
+            time.sleep(15)  # Wait before retry on exception
 
     # Even if it failed, mark as no longer warming up
     warming_up = False
+    print("Warmup process completed (or gave up after max attempts)")
 
 # Start warmup in background thread
 threading.Thread(target=warm_up_model, daemon=True).start()
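The rewritten warm-up above leans on one behaviour of the hosted Inference API: while a model is still cold, the endpoint answers 503 with a JSON body carrying an `estimated_time` hint in seconds, which is the field this hunk reads. Below is a minimal standalone sketch of that polling pattern; the endpoint URL and token handling are placeholders, and the backoff bounds simply mirror the ones this commit picks:

```python
import os
import time

import requests

# Placeholder endpoint; the Space builds its own API_URL from the model id.
API_URL = "https://api-inference.huggingface.co/models/<your-model-id>"
headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"} if os.environ.get("HF_TOKEN") else {}


def wait_until_loaded(max_attempts: int = 10) -> bool:
    """Poll the endpoint until it answers 200, backing off on 503s and errors."""
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(API_URL, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print(f"Attempt {attempt}: request failed ({exc}); retrying in 15s")
            time.sleep(15)
            continue

        if response.status_code == 200:
            return True  # Model is loaded and serving

        try:
            body = response.json() if response.text else {}
        except ValueError:
            body = {}

        if response.status_code == 503 and "estimated_time" in body:
            # Sleep for a fraction of the server's own estimate, within bounds
            wait = min(30, max(10, body["estimated_time"] / 4))
        else:
            wait = 10 * attempt  # Linear backoff for anything else
        print(f"Attempt {attempt}: status {response.status_code}; waiting {wait:.0f}s")
        time.sleep(wait)
    return False
```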
@@ -84,11 +96,14 @@ def respond(
     temperature,
     top_p,
 ):
-    global model_warmed_up
+    global model_warmed_up, estimated_time
 
-    # If model isn't warmed up yet, give a message
+    # If model isn't warmed up yet, give a message with estimated time if available
     if not model_warmed_up:
-        yield "⌛ Model is being loaded for the first time, this may take up to a minute. Please be patient..."
+        if estimated_time:
+            yield f"⌛ Model is being loaded for the first time, estimated wait time: {estimated_time:.0f} seconds. Please be patient or try again later."
+        else:
+            yield "⌛ Model is being loaded for the first time, this may take 2-3 minutes. Please be patient or try again later."
 
     messages = [{"role": "system", "content": system_message}]
 
@@ -106,7 +121,7 @@
     print(f"Sending messages: {json.dumps(messages, indent=2)}")
 
     # Try to initialize the model with retries
-    max_retries = 5  # Increased from 3 to 5
+    max_retries = 8  # Even more retries
     retry_count = 0
 
     # Try both methods: InferenceClient and direct API call
@@ -124,7 +139,6 @@
                 stream=True,
                 temperature=temperature,
                 top_p=top_p,
-                timeout=30,  # Increased timeout
             ):
                 token = message.choices[0].delta.content
                 if token:
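The deleted `timeout=30` keyword is the likely source of the "timeout parameter error" that a later hunk starts catching: on `huggingface_hub.InferenceClient`, the request timeout is a constructor argument rather than a per-call keyword on `chat_completion()`. A hedged sketch of that placement, assuming the Space streams through `InferenceClient` (the model id and token are placeholders):

```python
from huggingface_hub import InferenceClient

# Timeout is configured once on the client, not per chat_completion() call.
client = InferenceClient(model="<your-model-id>", token="<hf-token>", timeout=30)

# Streaming chunks expose .choices[0].delta.content, as the code above expects.
for chunk in client.chat_completion(
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=16,
    stream=True,
    temperature=0.7,
    top_p=0.95,
):
    token = chunk.choices[0].delta.content
    if token:
        print(token, end="")
```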
@@ -147,7 +161,7 @@
             }
 
             print(f"Making direct API call to {API_URL}")
-            api_response = requests.post(API_URL, headers=headers, json=payload, timeout=60)  # Increased timeout
+            api_response = requests.post(API_URL, headers=headers, json=payload, timeout=180)  # 3 minute timeout
             print(f"API response status: {api_response.status_code}")
 
             if api_response.status_code == 200:
@@ -161,11 +175,22 @@
                 else:
                     print(f"Unexpected API response format: {result}")
                     retry_count += 1
+            elif api_response.status_code == 503 and "estimated_time" in api_response.json():
+                # Model is loading, get estimated time
+                est_time = api_response.json()["estimated_time"]
+                estimated_time = est_time
+                retry_count += 1
+                print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
+
+                # Wait a portion of the estimated time
+                wait_time = min(45, max(15, est_time / 3))  # Cap at 45 seconds
+                yield f"⌛ Model is loading... Estimated time remaining: {est_time:.0f} seconds (Attempt {retry_count}/{max_retries})"
+                time.sleep(wait_time)
             else:
                 print(f"API error: {api_response.text}")
                 if api_response.status_code == 504 and retry_count < max_retries - 1:
                     retry_count += 1
-                    wait_time = 15  # Increased wait time
+                    wait_time = 20  # Increased wait time
                     yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
                     time.sleep(wait_time)
                 else:
@@ -177,18 +202,25 @@
             error_message = str(e)
             print(f"Error: {error_message}")
 
-            if "504 Server Error: Gateway Timeout" in error_message:
+            if "503 Service Unavailable" in error_message or "504 Server Error: Gateway Timeout" in error_message:
                 if retry_count < max_retries - 1:
-                    wait_time = 15  # Increased wait time
-                    print(f"Model timed out. Waiting {wait_time} seconds before retry {retry_count}/{max_retries}...")
+                    wait_time = 20  # Increased wait time
+                    print(f"Model not ready. Waiting {wait_time} seconds before retry {retry_count}/{max_retries}...")
                     yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
                     time.sleep(wait_time)
-                    # Try direct API on next attempt if we've tried InferenceClient twice
+                    # Switch to direct API after 2 attempts
                     if retry_count >= 2:
                         use_direct_api = True
                 else:
                     print("All retries failed.")
-                    yield "❌ The model timed out after multiple attempts. Your model is probably too large for the free tier. Try again in a few minutes or consider using a smaller model."
+                    yield """❌ The model couldn't be loaded after multiple attempts.
+
+This is common with larger models like Mistral-7B on free-tier hosting.
+
+Please try:
+1. Waiting a few minutes and trying again
+2. Creating a quantized (4-bit) version of your model which loads faster
+3. Using a smaller model for better performance"""
                     break
             else:
                 print(f"Non-timeout error: {error_message}")
@@ -198,8 +230,21 @@
 
         except Exception as e:
             print(f"Unexpected error: {str(e)}")
-            yield f"❌ Unexpected error: {str(e)}"
-            break
+            retry_count += 1
+
+            # For the specific timeout error we saw, switch to direct API
+            if "timeout" in str(e).lower():
+                use_direct_api = True
+                print("Switching to direct API method due to timeout parameter error")
+                if retry_count < max_retries - 1:
+                    yield "⌛ Trying alternative API method..."
+                    time.sleep(2)  # Short delay before retry
+                else:
+                    yield f"❌ Unexpected error: {str(e)}"
+                    break
+            else:
+                yield f"❌ Unexpected error: {str(e)}"
+                break
 
 
 """
@@ -220,7 +265,7 @@ demo = gr.ChatInterface(
         ),
     ],
     description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
-    ⚠️ Note: This model needs time to load when first used. You may experience a delay of up to 60 seconds on your first message."""
+    ⚠️ IMPORTANT: This model requires 2-3 minutes to load when first used. Please be patient with your first message."""
 )
 
 
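One caveat the commit leaves in place: `model_warmed_up`, `warming_up`, and `estimated_time` are plain module globals written by the daemon thread and read by `respond`. Simple flag flips like these are generally safe under CPython, but a `threading.Event` makes the hand-off explicit and lets the responder block briefly instead of guessing. A hypothetical refactor sketch, not part of this commit:

```python
import threading

# Hypothetical alternative to the module-level boolean flags in app.py.
model_ready = threading.Event()


def warm_up_model():
    # ... poll the endpoint exactly as the commit's loop does ...
    model_ready.set()  # Signal readiness once the endpoint answers 200


def respond(message, history, system_message, max_tokens, temperature, top_p):
    if not model_ready.is_set():
        yield "⌛ Model is being loaded for the first time, please wait..."
        model_ready.wait(timeout=180)  # Block up to 3 minutes for warm-up
    # ... continue with the normal streaming request path ...
```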