Frankie-walsh4 committed on
Commit ee12cf3 · Parent(s): cddec45
Files changed (1):
  1. app.py +178 -179
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
 import os
 import time
 import json
@@ -14,76 +13,138 @@ For more information on `huggingface_hub` Inference API support, please check th
 HF_TOKEN = os.environ.get("HF_TOKEN")
 print(f"HF_TOKEN is {'available' if HF_TOKEN else 'not available'}")
 
-# Try direct client with and without token
-if HF_TOKEN:
-    client = InferenceClient("Trinoid/Data_Management_Mistral", token=HF_TOKEN)
-    print("Created client with token")
-else:
-    client = InferenceClient("Trinoid/Data_Management_Mistral")
-    print("Created client without token")
-
-# Alternative API endpoint setup
+# Setup API for the Hugging Face Inference API
 API_URL = "https://api-inference.huggingface.co/models/Trinoid/Data_Management_Mistral"
 headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
 
+print("Trying to access model directly via API")
+response = requests.get(API_URL, headers=headers)
+print(f"Status: {response.status_code}")
+print(f"Response: {response.text[:200]}...")  # Print first 200 chars of response
+
 # Global variable to track if model is warmed up
 model_warmed_up = False
-warming_up = False
+model_loading = False
 estimated_time = None
 
-def warm_up_model():
-    """Send a warmup request to get the model loaded before user interaction"""
-    global warming_up, model_warmed_up, estimated_time
-
-    if warming_up:
-        return  # Already warming up
-
-    warming_up = True
-    print("Starting model warm-up...")
-
-    # Try up to 10 times with increasing delays
-    for attempt in range(1, 11):
+def query_model(inputs, parameters=None):
+    """Send a query to the model via the Inference API"""
+    payload = {
+        "inputs": inputs,
+    }
+
+    if parameters:
+        payload["parameters"] = parameters
+
+    print(f"Sending query to API: {json.dumps(payload, indent=2)[:200]}...")
+
+    # Try multiple times with backoff
+    max_attempts = 5
+    for attempt in range(max_attempts):
         try:
-            # Simple check if model is loaded
-            print(f"Warmup attempt {attempt}/10...")
-            response = requests.get(
+            response = requests.post(
                 API_URL,
-                headers=headers
+                headers=headers,
+                json=payload,
+                timeout=180  # 3 minute timeout
             )
 
-            print(f"Status: {response.status_code}")
-            response_json = response.json() if response.text else {}
-            print(f"Response: {response_json}")
+            print(f"API response status: {response.status_code}")
 
-            # If we get a 200, the model is loaded
+            # If successful, return the result
             if response.status_code == 200:
-                print("Warmup successful! Model is ready.")
-                model_warmed_up = True
-                return
-
-            # If model is loading, get the estimated time
-            if response.status_code == 503 and "estimated_time" in response_json:
-                est_time = response_json["estimated_time"]
-                estimated_time = est_time
+                return response.json()
+
+            # If model is loading, handle the error
+            elif response.status_code == 503 and "estimated_time" in response.json():
+                est_time = response.json()["estimated_time"]
                 print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
 
-                # If estimated time is long, wait longer
-                wait_time = min(30, max(10, est_time / 4))  # Cap at 30 seconds, minimum 10
-                print(f"Waiting {wait_time:.2f} seconds before next check...")
+                # Wait a portion of the estimated time
+                wait_time = min(30, max(10, est_time / 4))
+                print(f"Waiting {wait_time:.2f} seconds before retry...")
                 time.sleep(wait_time)
+
+            # For other errors, wait and retry
             else:
-                # Other error, wait and retry
-                wait_time = 10 * attempt  # Increase wait time with each attempt
+                print(f"API error: {response.text}")
+                wait_time = 10 * (attempt + 1)
                 print(f"Waiting {wait_time} seconds before retry...")
                 time.sleep(wait_time)
-
+
         except Exception as e:
-            print(f"Warmup exception: {str(e)}")
-            time.sleep(15)  # Wait before retry on exception
+            print(f"Request exception: {str(e)}")
+            wait_time = 15 * (attempt + 1)
+            print(f"Waiting {wait_time} seconds before retry...")
+            time.sleep(wait_time)
+
+    # If we've tried all attempts and still failed, return None
+    return None
+
+def is_model_loaded():
+    """Check if the model is loaded and ready for inference"""
+    try:
+        # Send a simple query to check model status
+        response = requests.get(API_URL, headers=headers)
+
+        # If we get a 200, the model is ready
+        if response.status_code == 200:
+            return True
+
+        # If we get a 503 with estimated_time, it's loading
+        if response.status_code == 503 and "estimated_time" in response.json():
+            global estimated_time
+            estimated_time = response.json()["estimated_time"]
+            return False
+
+        # Other response indicates an issue
+        return False
+
+    except Exception as e:
+        print(f"Error checking model status: {str(e)}")
+        return False
+
+def warm_up_model():
+    """Send a warmup request to get the model loaded"""
+    global model_warmed_up, model_loading
+
+    if model_loading:
+        return  # Already warming up
+
+    model_loading = True
+
+    # Check if model is already loaded
+    if is_model_loaded():
+        print("Model is already loaded!")
+        model_warmed_up = True
+        model_loading = False
+        return
+
+    print("Starting model warm-up with basic query...")
 
-    # Even if it failed, mark as no longer warming up
-    warming_up = False
-    print("Warmup process completed (or gave up after max attempts)")
+    # Try to trigger model loading with a simple query
+    inputs = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hi"}
+    ]
+
+    parameters = {
+        "max_new_tokens": 5,
+        "temperature": 0.1,
+        "top_p": 0.95,
+        "do_sample": True
+    }
+
+    # Send the query and check result
+    result = query_model(inputs, parameters)
+
+    if result:
+        print("Warmup successful! Model is ready.")
+        model_warmed_up = True
+    else:
+        print("Warmup failed. Will try again during first user query.")
+
+    model_loading = False
 
 # Start warmup in background thread
 threading.Thread(target=warm_up_model, daemon=True).start()
@@ -98,154 +159,95 @@ def respond(
 ):
     global model_warmed_up, estimated_time
 
-    # If model isn't warmed up yet, give a message with estimated time if available
-    if not model_warmed_up:
-        if estimated_time:
-            yield f"⌛ Model is being loaded for the first time, estimated wait time: {estimated_time:.0f} seconds. Please be patient or try again later."
-        else:
-            yield "⌛ Model is being loaded for the first time, this may take 2-3 minutes. Please be patient or try again later."
-
+    # Create the messages list
     messages = [{"role": "system", "content": system_message}]
-
+
     for val in history:
         if val[0]:
            messages.append({"role": "user", "content": val[0]})
         if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
 
-    # Debug the messages being sent
-    print(f"Sending messages: {json.dumps(messages, indent=2)}")
+    messages.append({"role": "user", "content": message})
 
-    # Try to initialize the model with retries
-    max_retries = 8  # Even more retries
-    retry_count = 0
+    # Check if the model is ready
+    if not model_warmed_up and not is_model_loaded():
+        if estimated_time:
+            yield f"⌛ Model is being loaded, estimated wait time: {estimated_time:.0f} seconds. Please be patient or try again later."
+        else:
+            yield "⌛ Model is being loaded. This may take some time on the first use."
 
-    # Try both methods: InferenceClient and direct API call
-    use_direct_api = False
+    # Set up parameters for the query
+    parameters = {
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": True
+    }
 
-    while retry_count < max_retries:
+    # Try multiple times if needed
+    max_retries = 5
+    for attempt in range(max_retries):
         try:
-            print(f"Attempt {retry_count + 1}/{max_retries} using {'direct API' if use_direct_api else 'InferenceClient'}...")
+            print(f"Attempt {attempt + 1}/{max_retries} to query the model...")
 
-            if not use_direct_api:
-                # Method 1: Using InferenceClient
-                for message in client.chat_completion(
-                    messages,
-                    max_tokens=max_tokens,
-                    stream=True,
-                    temperature=temperature,
-                    top_p=top_p,
-                ):
-                    token = message.choices[0].delta.content
-                    if token:
-                        response += token
-                        yield response
-
-                # If we got here, we were successful
-                model_warmed_up = True
-                break
-            else:
-                # Method 2: Direct API call
-                payload = {
-                    "inputs": messages,
-                    "parameters": {
-                        "max_new_tokens": max_tokens,
-                        "temperature": temperature,
-                        "top_p": top_p,
-                    },
-                    "stream": False,
-                }
-
-                print(f"Making direct API call to {API_URL}")
-                api_response = requests.post(API_URL, headers=headers, json=payload, timeout=180)  # 3 minute timeout
-                print(f"API response status: {api_response.status_code}")
-
-                if api_response.status_code == 200:
-                    result = api_response.json()
-                    print(f"API response: {json.dumps(result, indent=2)}")
-                    if isinstance(result, list) and len(result) > 0 and "generated_text" in result[0]:
+            # Make API request
+            result = query_model(messages, parameters)
+
+            if result:
+                # Handle different response formats
+                if isinstance(result, list) and len(result) > 0:
+                    if "generated_text" in result[0]:
                         response = result[0]["generated_text"]
-                        yield response
                         model_warmed_up = True
-                        break
-                    else:
-                        print(f"Unexpected API response format: {result}")
-                        retry_count += 1
-                elif api_response.status_code == 503 and "estimated_time" in api_response.json():
-                    # Model is loading, get estimated time
-                    est_time = api_response.json()["estimated_time"]
-                    estimated_time = est_time
-                    retry_count += 1
-                    print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
+                        yield response
+                        return
 
-                    # Wait a portion of the estimated time
-                    wait_time = min(45, max(15, est_time / 3))  # Cap at 45 seconds
-                    yield f"⌛ Model is loading... Estimated time remaining: {est_time:.0f} seconds (Attempt {retry_count}/{max_retries})"
-                    time.sleep(wait_time)
-                else:
-                    print(f"API error: {api_response.text}")
-                    if api_response.status_code == 504 and retry_count < max_retries - 1:
-                        retry_count += 1
-                        wait_time = 20  # Increased wait time
-                        yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
-                        time.sleep(wait_time)
-                    else:
-                        yield f"❌ API error: {api_response.status_code} - {api_response.text}"
-                        break
-
-        except HfHubHTTPError as e:
-            retry_count += 1
-            error_message = str(e)
-            print(f"Error: {error_message}")
+                # Direct message response format
+                if isinstance(result, dict) and "generated_text" in result:
+                    response = result["generated_text"]
+                    model_warmed_up = True
+                    yield response
+                    return
+
+                # For completion format
+                if isinstance(result, str):
+                    model_warmed_up = True
+                    yield result
+                    return
+
+                # Unknown format, show raw result
+                print(f"Unexpected response format: {json.dumps(result, indent=2)[:500]}...")
+                model_warmed_up = True
+                yield str(result)
+                return
 
-            if "503 Service Unavailable" in error_message or "504 Server Error: Gateway Timeout" in error_message:
-                if retry_count < max_retries - 1:
-                    wait_time = 20  # Increased wait time
-                    print(f"Model not ready. Waiting {wait_time} seconds before retry {retry_count}/{max_retries}...")
-                    yield f"⌛ Model is warming up, please wait... (Attempt {retry_count}/{max_retries})"
-                    time.sleep(wait_time)
-                    # Switch to direct API after 2 attempts
-                    if retry_count >= 2:
-                        use_direct_api = True
-                else:
-                    print("All retries failed.")
-                    yield """❌ The model couldn't be loaded after multiple attempts.
-
-This is common with larger models like Mistral-7B on free-tier hosting.
-
-Please try:
-1. Waiting a few minutes and trying again
-2. Creating a quantized (4-bit) version of your model which loads faster
-3. Using a smaller model for better performance"""
-                    break
+            # If query_model returned None, it means all its retries failed
+            print(f"Query attempt {attempt + 1} failed completely")
+
+            if attempt < max_retries - 1:
+                wait_time = 20 * (attempt + 1)
+                yield f"⌛ Still trying to get a response (Attempt {attempt + 1}/{max_retries})..."
+                time.sleep(wait_time)
             else:
-                print(f"Non-timeout error: {error_message}")
-                yield f"❌ An error occurred: {error_message}"
-                # Try direct API on next attempt
-                use_direct_api = True
-
+                yield """❌ The model couldn't be accessed after multiple attempts.
+
+If you're seeing this on the Nvidia L40 hardware, please try:
+1. Restarting the space
+2. Checking your model's size and format
+3. Contacting Hugging Face support if the issue persists"""
+                return
+
         except Exception as e:
             print(f"Unexpected error: {str(e)}")
-            retry_count += 1
 
-            # For the specific timeout error we saw, switch to direct API
-            if "timeout" in str(e).lower():
-                use_direct_api = True
-                print("Switching to direct API method due to timeout parameter error")
-                if retry_count < max_retries - 1:
-                    yield "⌛ Trying alternative API method..."
-                    time.sleep(2)  # Short delay before retry
-                else:
-                    yield f"❌ Unexpected error: {str(e)}"
-                    break
+            if attempt < max_retries - 1:
+                wait_time = 15
+                yield f"⌛ An error occurred. Retrying (Attempt {attempt + 1}/{max_retries})..."
+                time.sleep(wait_time)
             else:
-                yield f"❌ Unexpected error: {str(e)}"
-                break
-
+                yield f"❌ An error occurred after multiple attempts: {str(e)}"
+                return
 
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
@@ -265,13 +267,10 @@ demo = gr.ChatInterface(
         ),
     ],
     description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
-    ⚠️ IMPORTANT: This model requires 2-3 minutes to load when first used. Please be patient with your first message."""
+    This model runs on Nvidia L40 GPU hardware for optimal performance."""
 )
 
 
 if __name__ == "__main__":
-    # Start model warmup
-    warm_up_model()
-
     # Launch the app
     demo.launch()
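For reference, the retry logic this commit adds follows the usual pattern for the hosted Hugging Face Inference API: a 503 response whose JSON body carries an estimated_time field means the model is still loading, so the client waits a fraction of that estimate before retrying, and falls back to simple backoff for other errors. Below is a minimal standalone sketch of that pattern; the model URL is a placeholder and the helper name is illustrative, not part of this commit.

import os
import time
import requests

# Placeholder endpoint for illustration; substitute your own model id.
API_URL = "https://api-inference.huggingface.co/models/<user>/<model>"
TOKEN = os.environ.get("HF_TOKEN")
HEADERS = {"Authorization": f"Bearer {TOKEN}"} if TOKEN else {}

def query_with_backoff(payload, max_attempts=5):
    """POST to the Inference API, waiting out 503 'model loading' responses."""
    for attempt in range(max_attempts):
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=180)
        if response.status_code == 200:
            return response.json()
        try:
            body = response.json()  # error bodies are usually JSON, but guard anyway
        except ValueError:
            body = {}
        if response.status_code == 503 and "estimated_time" in body:
            # The API reports how long loading will take; wait part of it.
            wait = min(30, max(10, body["estimated_time"] / 4))
        else:
            wait = 10 * (attempt + 1)  # linear backoff for other errors
        time.sleep(wait)
    return None  # caller decides how to surface repeated failure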