Spaces:

Trinoid
/

Data_Management_Mistral

Sleeping

App Files Community

Frankie-walsh4 committed on Apr 3

Commit

cddec45

1 Parent(s): 387c509

fixes

Browse files

Files changed (1) hide show

app.py +89 -44

app.py CHANGED Viewed

@@ -29,10 +29,11 @@ headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
 # Global variable to track if model is warmed up
 model_warmed_up = False
 warming_up = False
 def warm_up_model():
     """Send a warmup request to get the model loaded before user interaction"""
-    global warming_up, model_warmed_up
     if warming_up:
         return  # Already warming up
@@ -40,38 +41,49 @@ def warm_up_model():
     warming_up = True
     print("Starting model warm-up...")
-    # Simple warmup message
-    warmup_messages = [
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "Hello"}
-    ]
-    # Try direct API approach first
-    try:
-        payload = {
-            "inputs": warmup_messages,
-            "parameters": {
-                "max_new_tokens": 5,  # Just need a short response
-                "temperature": 0.1,
-                "top_p": 0.95,
-            },
-            "stream": False,
-        }
-        print("Sending warmup request...")
-        response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
-        if response.status_code == 200:
-            print("Warmup successful!")
-            model_warmed_up = True
-        else:
-            print(f"Warmup API call failed with status {response.status_code}")
-            print(f"Response: {response.text}")
-    except Exception as e:
-        print(f"Warmup exception: {str(e)}")
     # Even if it failed, mark as no longer warming up
     warming_up = False
 # Start warmup in background thread
 threading.Thread(target=warm_up_model, daemon=True).start()
@@ -84,11 +96,14 @@ def respond(
     temperature,
     top_p,
 ):
-    global model_warmed_up
-    # If model isn't warmed up yet, give a message
     if not model_warmed_up:
-        yield "⌛ Model is being loaded for the first time, this may take up to a minute. Please be patient..."
     messages = [{"role": "system", "content": system_message}]
@@ -106,7 +121,7 @@ def respond(
     print(f"Sending messages: {json.dumps(messages, indent=2)}")
     # Try to initialize the model with retries
-    max_retries = 5  # Increased from 3 to 5
     retry_count = 0
     # Try both methods: InferenceClient and direct API call
@@ -124,7 +139,6 @@ def respond(
                     stream=True,
                     temperature=temperature,
                     top_p=top_p,
-                    timeout=30,  # Increased timeout
                 ):
                     token = message.choices[0].delta.content
                     if token:
@@ -147,7 +161,7 @@ def respond(
                 }
                 print(f"Making direct API call to {API_URL}")
-                api_response = requests.post(API_URL, headers=headers, json=payload, timeout=60)  # Increased timeout
                 print(f"API response status: {api_response.status_code}")
                 if api_response.status_code == 200:
@@ -161,11 +175,22 @@ def respond(
                     else:
                         print(f"Unexpected API response format: {result}")
                         retry_count += 1
                 else:
                     print(f"API error: {api_response.text}")
                     if api_response.status_code == 504 and retry_count < max_retries - 1:
                         retry_count += 1
-                        wait_time = 15  # Increased wait time
                         yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
                         time.sleep(wait_time)
                     else:
@@ -177,18 +202,25 @@ def respond(
             error_message = str(e)
             print(f"Error: {error_message}")
-            if "504 Server Error: Gateway Timeout" in error_message:
                 if retry_count < max_retries - 1:
-                    wait_time = 15  # Increased wait time
-                    print(f"Model timed out. Waiting {wait_time} seconds before retry {retry_count}/{max_retries}...")
                     yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
                     time.sleep(wait_time)
-                    # Try direct API on next attempt if we've tried InferenceClient twice
                     if retry_count >= 2:
                         use_direct_api = True
                 else:
                     print("All retries failed.")
-                    yield "❌ The model timed out after multiple attempts. Your model is probably too large for the free tier. Try again in a few minutes or consider using a smaller model."
                     break
             else:
                 print(f"Non-timeout error: {error_message}")
@@ -198,8 +230,21 @@ def respond(
         except Exception as e:
             print(f"Unexpected error: {str(e)}")
-            yield f"❌ Unexpected error: {str(e)}"
-            break
 """
@@ -220,7 +265,7 @@ demo = gr.ChatInterface(
         ),
     ],
     description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
-    ⚠️ Note: This model needs time to load when first used. You may experience a delay of up to 60 seconds on your first message."""
 )

 # Global variable to track if model is warmed up
 model_warmed_up = False
 warming_up = False
+estimated_time = None
 def warm_up_model():
     """Send a warmup request to get the model loaded before user interaction"""
+    global warming_up, model_warmed_up, estimated_time
     if warming_up:
         return  # Already warming up
     warming_up = True
     print("Starting model warm-up...")
+    # Try up to 10 times with increasing delays
+    for attempt in range(1, 11):
+        try:
+            # Simple check if model is loaded
+            print(f"Warmup attempt {attempt}/10...")
+            response = requests.get(
+                API_URL,
+                headers=headers
+            )
+            print(f"Status: {response.status_code}")
+            response_json = response.json() if response.text else {}
+            print(f"Response: {response_json}")
+            # If we get a 200, the model is loaded
+            if response.status_code == 200:
+                print("Warmup successful! Model is ready.")
+                model_warmed_up = True
+                return
+            # If model is loading, get the estimated time
+            if response.status_code == 503 and "estimated_time" in response_json:
+                est_time = response_json["estimated_time"]
+                estimated_time = est_time
+                print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
+                # If estimated time is long, wait longer
+                wait_time = min(30, max(10, est_time / 4))  # Cap at 30 seconds, minimum 10
+                print(f"Waiting {wait_time:.2f} seconds before next check...")
+                time.sleep(wait_time)
+            else:
+                # Other error, wait and retry
+                wait_time = 10 * attempt  # Increase wait time with each attempt
+                print(f"Waiting {wait_time} seconds before retry...")
+                time.sleep(wait_time)
+        except Exception as e:
+            print(f"Warmup exception: {str(e)}")
+            time.sleep(15)  # Wait before retry on exception
     # Even if it failed, mark as no longer warming up
     warming_up = False
+    print("Warmup process completed (or gave up after max attempts)")
 # Start warmup in background thread
 threading.Thread(target=warm_up_model, daemon=True).start()
     temperature,
     top_p,
 ):
+    global model_warmed_up, estimated_time
+    # If model isn't warmed up yet, give a message with estimated time if available
     if not model_warmed_up:
+        if estimated_time:
+            yield f"⌛ Model is being loaded for the first time, estimated wait time: {estimated_time:.0f} seconds. Please be patient or try again later."
+        else:
+            yield "⌛ Model is being loaded for the first time, this may take 2-3 minutes. Please be patient or try again later."
     messages = [{"role": "system", "content": system_message}]
     print(f"Sending messages: {json.dumps(messages, indent=2)}")
     # Try to initialize the model with retries
+    max_retries = 8  # Even more retries
     retry_count = 0
     # Try both methods: InferenceClient and direct API call
                     stream=True,
                     temperature=temperature,
                     top_p=top_p,
                 ):
                     token = message.choices[0].delta.content
                     if token:
                 }
                 print(f"Making direct API call to {API_URL}")
+                api_response = requests.post(API_URL, headers=headers, json=payload, timeout=180)  # 3 minute timeout
                 print(f"API response status: {api_response.status_code}")
                 if api_response.status_code == 200:
                     else:
                         print(f"Unexpected API response format: {result}")
                         retry_count += 1
+                elif api_response.status_code == 503 and "estimated_time" in api_response.json():
+                    # Model is loading, get estimated time
+                    est_time = api_response.json()["estimated_time"]
+                    estimated_time = est_time
+                    retry_count += 1
+                    print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
+                    # Wait a portion of the estimated time
+                    wait_time = min(45, max(15, est_time / 3))  # Cap at 45 seconds
+                    yield f"⌛ Model is loading... Estimated time remaining: {est_time:.0f} seconds (Attempt {retry_count}/{max_retries})"
+                    time.sleep(wait_time)
                 else:
                     print(f"API error: {api_response.text}")
                     if api_response.status_code == 504 and retry_count < max_retries - 1:
                         retry_count += 1
+                        wait_time = 20  # Increased wait time
                         yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
                         time.sleep(wait_time)
                     else:
             error_message = str(e)
             print(f"Error: {error_message}")
+            if "503 Service Unavailable" in error_message or "504 Server Error: Gateway Timeout" in error_message:
                 if retry_count < max_retries - 1:
+                    wait_time = 20  # Increased wait time
+                    print(f"Model not ready. Waiting {wait_time} seconds before retry {retry_count}/{max_retries}...")
                     yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
                     time.sleep(wait_time)
+                    # Switch to direct API after 2 attempts
                     if retry_count >= 2:
                         use_direct_api = True
                 else:
                     print("All retries failed.")
+                    yield """❌ The model couldn't be loaded after multiple attempts.
+This is common with larger models like Mistral-7B on free-tier hosting.
+Please try:
+1. Waiting a few minutes and trying again
+2. Creating a quantized (4-bit) version of your model which loads faster
+3. Using a smaller model for better performance"""
                     break
             else:
                 print(f"Non-timeout error: {error_message}")
         except Exception as e:
             print(f"Unexpected error: {str(e)}")
+            retry_count += 1
+            # For the specific timeout error we saw, switch to direct API
+            if "timeout" in str(e).lower():
+                use_direct_api = True
+                print("Switching to direct API method due to timeout parameter error")
+                if retry_count < max_retries - 1:
+                    yield "⌛ Trying alternative API method..."
+                    time.sleep(2)  # Short delay before retry
+                else:
+                    yield f"❌ Unexpected error: {str(e)}"
+                    break
+            else:
+                yield f"❌ Unexpected error: {str(e)}"
+                break
 """
         ),
     ],
     description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
+    ⚠️ IMPORTANT: This model requires 2-3 minutes to load when first used. Please be patient with your first message."""
 )