Frankie-walsh4 committed on
Commit ee12cf3 · Parent(s): cddec45
Files changed (1):
  1. app.py +178 -179
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
 import os
 import time
 import json
@@ -14,76 +13,138 @@ For more information on `huggingface_hub` Inference API support, please check th
 HF_TOKEN = os.environ.get("HF_TOKEN")
 print(f"HF_TOKEN is {'available' if HF_TOKEN else 'not available'}")
 
-# Try direct client with and without token
-if HF_TOKEN:
-    client = InferenceClient("Trinoid/Data_Management_Mistral", token=HF_TOKEN)
-    print("Created client with token")
-else:
-    client = InferenceClient("Trinoid/Data_Management_Mistral")
-    print("Created client without token")
-
-# Alternative API endpoint setup
+# Setup API for the Hugging Face Inference API
 API_URL = "https://api-inference.huggingface.co/models/Trinoid/Data_Management_Mistral"
 headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
 
+print("Trying to access model directly via API")
+response = requests.get(API_URL, headers=headers)
+print(f"Status: {response.status_code}")
+print(f"Response: {response.text[:200]}...")  # Print first 200 chars of response
+
 # Global variable to track if model is warmed up
 model_warmed_up = False
-warming_up = False
+model_loading = False
 estimated_time = None
 
-def warm_up_model():
-    """Send a warmup request to get the model loaded before user interaction"""
-    global warming_up, model_warmed_up, estimated_time
-
-    if warming_up:
-        return  # Already warming up
-
-    warming_up = True
-    print("Starting model warm-up...")
-
-    # Try up to 10 times with increasing delays
-    for attempt in range(1, 11):
+def query_model(inputs, parameters=None):
+    """Send a query to the model via the Inference API"""
+    payload = {
+        "inputs": inputs,
+    }
+
+    if parameters:
+        payload["parameters"] = parameters
+
+    print(f"Sending query to API: {json.dumps(payload, indent=2)[:200]}...")
+
+    # Try multiple times with backoff
+    max_attempts = 5
+    for attempt in range(max_attempts):
         try:
-            # Simple check if model is loaded
-            print(f"Warmup attempt {attempt}/10...")
-            response = requests.get(
+            response = requests.post(
                 API_URL,
-                headers=headers
+                headers=headers,
+                json=payload,
+                timeout=180  # 3 minute timeout
             )
 
-            print(f"Status: {response.status_code}")
-            response_json = response.json() if response.text else {}
-            print(f"Response: {response_json}")
+            print(f"API response status: {response.status_code}")
 
-            # If we get a 200, the model is loaded
+            # If successful, return the result
             if response.status_code == 200:
-                print("Warmup successful! Model is ready.")
-                model_warmed_up = True
-                return
-
-            # If model is loading, get the estimated time
-            if response.status_code == 503 and "estimated_time" in response_json:
-                est_time = response_json["estimated_time"]
-                estimated_time = est_time
+                return response.json()
+
+            # If model is loading, handle the error
+            elif response.status_code == 503 and "estimated_time" in response.json():
+                est_time = response.json()["estimated_time"]
                 print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
 
-                # If estimated time is long, wait longer
-                wait_time = min(30, max(10, est_time / 4))  # Cap at 30 seconds, minimum 10
-                print(f"Waiting {wait_time:.2f} seconds before next check...")
+                # Wait a portion of the estimated time
+                wait_time = min(30, max(10, est_time / 4))
+                print(f"Waiting {wait_time:.2f} seconds before retry...")
                 time.sleep(wait_time)
+
+            # For other errors, wait and retry
             else:
-                # Other error, wait and retry
-                wait_time = 10 * attempt  # Increase wait time with each attempt
+                print(f"API error: {response.text}")
+                wait_time = 10 * (attempt + 1)
                 print(f"Waiting {wait_time} seconds before retry...")
                 time.sleep(wait_time)
-
+
         except Exception as e:
-            print(f"Warmup exception: {str(e)}")
-            time.sleep(15)  # Wait before retry on exception
+            print(f"Request exception: {str(e)}")
+            wait_time = 15 * (attempt + 1)
+            print(f"Waiting {wait_time} seconds before retry...")
+            time.sleep(wait_time)
+
+    # If we've tried all attempts and still failed, return None
+    return None
+
+def is_model_loaded():
+    """Check if the model is loaded and ready for inference"""
+    try:
+        # Send a simple query to check model status
+        response = requests.get(API_URL, headers=headers)
+
+        # If we get a 200, the model is ready
+        if response.status_code == 200:
+            return True
+
+        # If we get a 503 with estimated_time, it's loading
+        if response.status_code == 503 and "estimated_time" in response.json():
+            global estimated_time
+            estimated_time = response.json()["estimated_time"]
+            return False
+
+        # Other response indicates an issue
+        return False
+
+    except Exception as e:
+        print(f"Error checking model status: {str(e)}")
+        return False
+
+def warm_up_model():
+    """Send a warmup request to get the model loaded"""
+    global model_warmed_up, model_loading
+
+    if model_loading:
+        return  # Already warming up
+
+    model_loading = True
+
+    # Check if model is already loaded
+    if is_model_loaded():
+        print("Model is already loaded!")
+        model_warmed_up = True
+        model_loading = False
+        return
+
+    print("Starting model warm-up with basic query...")
 
-    # Even if it failed, mark as no longer warming up
-    warming_up = False
-    print("Warmup process completed (or gave up after max attempts)")
+    # Try to trigger model loading with a simple query
+    inputs = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hi"}
+    ]
+
+    parameters = {
+        "max_new_tokens": 5,
+        "temperature": 0.1,
+        "top_p": 0.95,
+        "do_sample": True
+    }
+
+    # Send the query and check result
+    result = query_model(inputs, parameters)
+
+    if result:
+        print("Warmup successful! Model is ready.")
+        model_warmed_up = True
+    else:
+        print("Warmup failed. Will try again during first user query.")
+
+    model_loading = False
 
 # Start warmup in background thread
 threading.Thread(target=warm_up_model, daemon=True).start()
@@ -98,154 +159,95 @@ def respond(
 ):
     global model_warmed_up, estimated_time
 
-    # If model isn't warmed up yet, give a message with estimated time if available
-    if not model_warmed_up:
-        if estimated_time:
-            yield f"⌛ Model is being loaded for the first time, estimated wait time: {estimated_time:.0f} seconds. Please be patient or try again later."
-        else:
-            yield "⌛ Model is being loaded for the first time, this may take 2-3 minutes. Please be patient or try again later."
-
+    # Create the messages list
     messages = [{"role": "system", "content": system_message}]
-
+
     for val in history:
         if val[0]:
            messages.append({"role": "user", "content": val[0]})
         if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
 
-    # Debug the messages being sent
-    print(f"Sending messages: {json.dumps(messages, indent=2)}")
+    messages.append({"role": "user", "content": message})
 
-    # Try to initialize the model with retries
-    max_retries = 8  # Even more retries
-    retry_count = 0
+    # Check if the model is ready
+    if not model_warmed_up and not is_model_loaded():
+        if estimated_time:
+            yield f"⌛ Model is being loaded, estimated wait time: {estimated_time:.0f} seconds. Please be patient or try again later."
+        else:
+            yield "⌛ Model is being loaded. This may take some time on the first use."
 
-    # Try both methods: InferenceClient and direct API call
-    use_direct_api = False
+    # Set up parameters for the query
+    parameters = {
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": True
+    }
 
-    while retry_count < max_retries:
+    # Try multiple times if needed
+    max_retries = 5
+    for attempt in range(max_retries):
         try:
-            print(f"Attempt {retry_count + 1}/{max_retries} using {'direct API' if use_direct_api else 'InferenceClient'}...")
+            print(f"Attempt {attempt + 1}/{max_retries} to query the model...")
 
-            if not use_direct_api:
-                # Method 1: Using InferenceClient
-                for message in client.chat_completion(
-                    messages,
-                    max_tokens=max_tokens,
-                    stream=True,
-                    temperature=temperature,
-                    top_p=top_p,
-                ):
-                    token = message.choices[0].delta.content
-                    if token:
-                        response += token
-                        yield response
-
-                # If we got here, we were successful
-                model_warmed_up = True
-                break
-            else:
-                # Method 2: Direct API call
-                payload = {
-                    "inputs": messages,
-                    "parameters": {
-                        "max_new_tokens": max_tokens,
-                        "temperature": temperature,
-                        "top_p": top_p,
-                    },
-                    "stream": False,
-                }
-
-                print(f"Making direct API call to {API_URL}")
-                api_response = requests.post(API_URL, headers=headers, json=payload, timeout=180)  # 3 minute timeout
-                print(f"API response status: {api_response.status_code}")
-
-                if api_response.status_code == 200:
-                    result = api_response.json()
-                    print(f"API response: {json.dumps(result, indent=2)}")
-                    if isinstance(result, list) and len(result) > 0 and "generated_text" in result[0]:
+            # Make API request
+            result = query_model(messages, parameters)
+
+            if result:
+                # Handle different response formats
+                if isinstance(result, list) and len(result) > 0:
+                    if "generated_text" in result[0]:
                         response = result[0]["generated_text"]
-                        yield response
                         model_warmed_up = True
-                        break
-                    else:
-                        print(f"Unexpected API response format: {result}")
-                        retry_count += 1
-                elif api_response.status_code == 503 and "estimated_time" in api_response.json():
-                    # Model is loading, get estimated time
-                    est_time = api_response.json()["estimated_time"]
-                    estimated_time = est_time
-                    retry_count += 1
-                    print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
+                        yield response
+                        return
 
-                    # Wait a portion of the estimated time
-                    wait_time = min(45, max(15, est_time / 3))  # Cap at 45 seconds
-                    yield f"⌛ Model is loading... Estimated time remaining: {est_time:.0f} seconds (Attempt {retry_count}/{max_retries})"
-                    time.sleep(wait_time)
-                else:
-                    print(f"API error: {api_response.text}")
-                    if api_response.status_code == 504 and retry_count < max_retries - 1:
-                        retry_count += 1
-                        wait_time = 20  # Increased wait time
-                        yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
-                        time.sleep(wait_time)
-                    else:
-                        yield f"❌ API error: {api_response.status_code} - {api_response.text}"
-                        break
-
-        except HfHubHTTPError as e:
-            retry_count += 1
-            error_message = str(e)
-            print(f"Error: {error_message}")
+                # Direct message response format
+                if isinstance(result, dict) and "generated_text" in result:
+                    response = result["generated_text"]
+                    model_warmed_up = True
+                    yield response
+                    return
+
+                # For completion format
+                if isinstance(result, str):
+                    model_warmed_up = True
+                    yield result
+                    return
+
+                # Unknown format, show raw result
+                print(f"Unexpected response format: {json.dumps(result, indent=2)[:500]}...")
+                model_warmed_up = True
+                yield str(result)
+                return
 
-            if "503 Service Unavailable" in error_message or "504 Server Error: Gateway Timeout" in error_message:
-                if retry_count < max_retries - 1:
-                    wait_time = 20  # Increased wait time
-                    print(f"Model not ready. Waiting {wait_time} seconds before retry {retry_count}/{max_retries}...")
-                    yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
-                    time.sleep(wait_time)
-                    # Switch to direct API after 2 attempts
-                    if retry_count >= 2:
-                        use_direct_api = True
-                else:
-                    print("All retries failed.")
-                    yield """❌ The model couldn't be loaded after multiple attempts.
-
-This is common with larger models like Mistral-7B on free-tier hosting.
-
-Please try:
-1. Waiting a few minutes and trying again
-2. Creating a quantized (4-bit) version of your model which loads faster
-3. Using a smaller model for better performance"""
-                    break
+            # If query_model returned None, it means all its retries failed
+            print(f"Query attempt {attempt + 1} failed completely")
+
+            if attempt < max_retries - 1:
+                wait_time = 20 * (attempt + 1)
+                yield f"⌛ Still trying to get a response (Attempt {attempt + 1}/{max_retries})..."
+                time.sleep(wait_time)
             else:
-                print(f"Non-timeout error: {error_message}")
-                yield f"❌ An error occurred: {error_message}"
-                # Try direct API on next attempt
-                use_direct_api = True
-
+                yield """❌ The model couldn't be accessed after multiple attempts.
+
+If you're seeing this on the Nvidia L40 hardware, please try:
+1. Restarting the space
+2. Checking your model's size and format
+3. Contacting Hugging Face support if the issue persists"""
+                return
+
         except Exception as e:
             print(f"Unexpected error: {str(e)}")
-            retry_count += 1
 
-            # For the specific timeout error we saw, switch to direct API
-            if "timeout" in str(e).lower():
-                use_direct_api = True
-                print("Switching to direct API method due to timeout parameter error")
-                if retry_count < max_retries - 1:
-                    yield "⌛ Trying alternative API method..."
-                    time.sleep(2)  # Short delay before retry
-                else:
-                    yield f"❌ Unexpected error: {str(e)}"
-                    break
+            if attempt < max_retries - 1:
+                wait_time = 15
+                yield f"⌛ An error occurred. Retrying (Attempt {attempt + 1}/{max_retries})..."
+                time.sleep(wait_time)
             else:
-                yield f"❌ Unexpected error: {str(e)}"
-                break
-
+                yield f"❌ An error occurred after multiple attempts: {str(e)}"
+                return
 
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
@@ -265,13 +267,10 @@ demo = gr.ChatInterface(
         ),
     ],
     description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
-    ⚠️ IMPORTANT: This model requires 2-3 minutes to load when first used. Please be patient with your first message."""
+    This model runs on Nvidia L40 GPU hardware for optimal performance."""
 )
 
 
 if __name__ == "__main__":
-    # Start model warmup
-    warm_up_model()
-
     # Launch the app
     demo.launch()
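For reference, the retry logic this commit adds follows the usual pattern for the hosted Hugging Face Inference API: a 503 response whose JSON body carries an estimated_time field means the model is still loading, so the client waits a fraction of that estimate before retrying, and falls back to simple backoff for other errors. Below is a minimal standalone sketch of that pattern; the model URL is a placeholder and the helper name is illustrative, not part of this commit.

import os
import time
import requests

# Placeholder endpoint for illustration; substitute your own model id.
API_URL = "https://api-inference.huggingface.co/models/<user>/<model>"
TOKEN = os.environ.get("HF_TOKEN")
HEADERS = {"Authorization": f"Bearer {TOKEN}"} if TOKEN else {}

def query_with_backoff(payload, max_attempts=5):
    """POST to the Inference API, waiting out 503 'model loading' responses."""
    for attempt in range(max_attempts):
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=180)
        if response.status_code == 200:
            return response.json()
        try:
            body = response.json()  # error bodies are usually JSON, but guard anyway
        except ValueError:
            body = {}
        if response.status_code == 503 and "estimated_time" in body:
            # The API reports how long loading will take; wait part of it.
            wait = min(30, max(10, body["estimated_time"] / 4))
        else:
            wait = 10 * (attempt + 1)  # linear backoff for other errors
        time.sleep(wait)
    return None  # caller decides how to surface repeated failure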