Frankie-walsh4 committed on
Commit a0ee3bd · 1 Parent(s): fdf2b7f
Files changed (1)
  1. app.py +142 -133
app.py CHANGED
@@ -1,8 +1,8 @@
  import gradio as gr
  import os
  import time
- import torch
- import traceback
  import threading
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
  from peft import PeftModel
@@ -13,118 +13,74 @@ if torch.cuda.is_available():
      print(f"CUDA device: {torch.cuda.get_device_name(0)}")
      print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

- # Global variable to track model loading
  model_loaded = False
  model_loading = False
- loading_error = None
- model = None
- tokenizer = None
- pipe = None
-
- def load_model_in_thread():
-     """Load the model in a separate thread to avoid blocking the UI"""
-     global model_loaded, model_loading, loading_error, model, tokenizer, pipe

-     if model_loading:
-         return  # Already loading

-     model_loading = True
-     print("Starting model loading process...")

      try:
-         # Load base model
-         model_id = "mistralai/Mistral-7B-Instruct-v0.2"
-         adapter_id = "Trinoid/Data_Management_Mistral"
-         print(f"Loading base model {model_id}...")
-
-         # Initialize tokenizer
-         tokenizer = AutoTokenizer.from_pretrained(model_id)
-         print("Tokenizer loaded successfully")
-
-         # Load the base model in 4-bit
-         model = AutoModelForCausalLM.from_pretrained(
-             model_id,
-             torch_dtype=torch.float16,
-             device_map="auto",
-             load_in_4bit=True,
          )
-         print("Base model loaded successfully")

-         # Load and apply the LoRA adapter
-         print(f"Loading adapter {adapter_id}...")
-         model = PeftModel.from_pretrained(model, adapter_id)
-         print("Adapter loaded and applied successfully")
-
-         # Set up pipeline
-         print("Creating text generation pipeline...")
-         pipe = pipeline(
-             "text-generation",
-             model=model,
-             tokenizer=tokenizer,
-             device_map="auto",
-         )
-         print("Pipeline created successfully")

-         model_loaded = True
-         print("Model loading complete! Ready for inference.")
-
      except Exception as e:
-         loading_error = str(e)
-         print(f"Error loading model: {str(e)}")
-         traceback.print_exc()
-
-     finally:
-         model_loading = False
-
- # Start model loading in background thread
- threading.Thread(target=load_model_in_thread, daemon=True).start()
-
- def format_chat_prompt(messages):
-     """Format messages into a prompt that Mistral-7B-Instruct can understand"""
-     prompt = ""
-     for message in messages:
-         if message["role"] == "system":
-             prompt += f"<s>[INST] {message['content']} [/INST]</s>\n"
-         elif message["role"] == "user":
-             prompt += f"<s>[INST] {message['content']} [/INST]"
-         elif message["role"] == "assistant":
-             prompt += f" {message['content']} </s>\n"
-     return prompt
-
- def generate_response(messages, max_new_tokens=512, temperature=0.7, top_p=0.95):
-     """Generate a response from the model"""
-     global model_loaded, loading_error, model, tokenizer, pipe
-
-     if not model_loaded:
-         if loading_error:
-             return f"Error loading model: {loading_error}"
-         return "Model is still loading. Please wait a moment and try again."
-
-     # Format the prompt for Mistral
-     prompt = format_chat_prompt(messages)
-
-     # Set up the streamer for incremental generation
-     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
-
-     # Generate in a separate thread to enable streaming
-     generation_kwargs = {
-         "input_ids": tokenizer.encode(prompt, return_tensors="pt").to("cuda"),
-         "max_new_tokens": max_new_tokens,
-         "temperature": temperature,
-         "top_p": top_p,
-         "do_sample": True,
-         "streamer": streamer,
-     }
-
-     # Start generation in a thread
-     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-
-     # Stream the output
-     generated_text = ""
-     for new_text in streamer:
-         generated_text += new_text
-         yield generated_text

  def respond(
      message,
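Note on the removed local-loading path: it passes the bare `load_in_4bit=True` kwarg to `from_pretrained`, and `format_chat_prompt` gives the system message its own `[INST] ... [/INST]</s>` block, which differs from what the tokenizer's own Mistral chat template produces. If this path is ever restored, a sketch along the following lines may be closer to current `transformers`/`peft` usage. The model and adapter IDs come from the code above; everything else (function and variable names) is illustrative, not part of this commit.

```python
# Sketch only: 4-bit loading via BitsAndBytesConfig plus the tokenizer's chat template.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
adapter_id = "Trinoid/Data_Management_Mistral"

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,  # preferred over passing load_in_4bit=True directly
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_id)

def build_prompt(system_message, user_message):
    # Mistral-Instruct has no system role, so fold the system text into the first
    # user turn and let the chat template emit the [INST] ... [/INST] markup.
    merged = f"{system_message}\n\n{user_message}" if system_message else user_message
    return tokenizer.apply_chat_template(
        [{"role": "user", "content": merged}],
        tokenize=False,
        add_generation_prompt=True,
    )
```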
@@ -135,21 +91,8 @@ def respond(
      top_p,
  ):
      """Respond to user messages"""
-     global model_loaded, model_loading
-
-     # Check if model is loaded
-     if not model_loaded:
-         if model_loading:
-             yield "⌛ The model is still loading. This can take a few minutes on first startup. Please wait or try again later."
-             return
-         else:
-             # Try loading the model if it hasn't started yet
-             if not threading.active_count() > 1:  # No background thread running
-                 threading.Thread(target=load_model_in_thread, daemon=True).start()
-             yield "⌛ Starting model load now. Please wait a moment and try again."
-             return

-     # Create the messages list
      messages = [{"role": "system", "content": system_message}]

      for val in history:
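The removed readiness check relies on `threading.active_count() > 1`, which is unreliable inside a Gradio app because the server itself runs worker threads. If a local-loading variant is ever revisited, a `threading.Event` is a more direct signal. A minimal sketch follows; the names are illustrative and not part of this commit.

```python
import threading

# Sketch: the loader thread sets this once the model and pipeline are ready.
model_ready = threading.Event()

def load_model_in_thread():
    # ... load base model, adapter and pipeline as in the removed code above ...
    model_ready.set()

def respond(message, history, system_message, max_tokens, temperature, top_p):
    if not model_ready.is_set():
        yield "⌛ The model is still loading. Please try again in a moment."
        return
    # ... generate as usual ...
```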
@@ -160,19 +103,85 @@ def respond(

      messages.append({"role": "user", "content": message})

-     # Generate and stream the response
-     try:
-         for response in generate_response(
-             messages,
-             max_new_tokens=max_tokens,
-             temperature=temperature,
-             top_p=top_p
-         ):
-             yield response
-     except Exception as e:
-         print(f"Error generating response: {str(e)}")
-         traceback.print_exc()
-         yield f"An error occurred while generating the response: {str(e)}"

  """
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
@@ -192,8 +201,8 @@ demo = gr.ChatInterface(
          ),
      ],
      description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
-     The model is loaded directly on the L40 GPU for optimal performance.
-     First-time loading may take a few minutes."""
  )

 
  import gradio as gr
  import os
  import time
+ import json
+ import requests
  import threading
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
  from peft import PeftModel
 
      print(f"CUDA device: {torch.cuda.get_device_name(0)}")
      print(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

+ # Get token from environment
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ print(f"HF_TOKEN is {'available' if HF_TOKEN else 'not available'}")
+
+ # Setup API for the Hugging Face Inference API
+ MODEL_ID = "Trinoid/Data_Management_Mistral"
+ API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
+
+ # Check if model exists
+ try:
+     print(f"Checking if model {MODEL_ID} exists...")
+     response = requests.get(API_URL, headers=headers)
+     print(f"Status: {response.status_code}")
+     if response.status_code == 200:
+         print("Model exists and is accessible")
+     else:
+         print(f"Response: {response.text}")
+ except Exception as e:
+     print(f"Error checking model: {str(e)}")
+
+ # Global variable to track model status
  model_loaded = False
  model_loading = False
+ estimated_time = None
+
+ def query_model(messages, parameters=None):
+     """Query the model using the Inference API"""
+     payload = {
+         "inputs": messages,
+     }

+     if parameters:
+         payload["parameters"] = parameters

+     print(f"Sending query to API...")

      try:
+         # Single attempt with longer timeout
+         response = requests.post(
+             API_URL,
+             headers=headers,
+             json=payload,
+             timeout=180  # 3 minute timeout
          )

+         print(f"API response status: {response.status_code}")

+         # If successful, return the response
+         if response.status_code == 200:
+             return response.json()
+
+         # If model is loading, handle it
+         elif response.status_code == 503 and "estimated_time" in response.json():
+             est_time = response.json()["estimated_time"]
+             global estimated_time
+             estimated_time = est_time
+             print(f"Model is loading. Estimated time: {est_time:.2f} seconds")
+             return None
+
+         # For other errors
+         else:
+             print(f"API error: {response.text}")
+             return None
+
      except Exception as e:
+         print(f"Request exception: {str(e)}")
+         return None
 
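For context on the calls above: the serverless Inference API for text-generation models generally expects `inputs` to be a single prompt string (so the chat messages built in `respond` need to be flattened before posting), returns a list like `[{"generated_text": ...}]` on success, and answers 503 with an `estimated_time` field while a cold model is loading; an `options.wait_for_model` flag can be used instead of manual polling. A minimal sketch of one direct call against the same endpoint, with an illustrative prompt and parameter values:

```python
# Sketch: single request to the endpoint defined above, assuming HF_TOKEN is set.
import os
import requests

API_URL = "https://api-inference.huggingface.co/models/Trinoid/Data_Management_Mistral"
HEADERS = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}

payload = {
    "inputs": "<s>[INST] How should retention labels be applied in SharePoint? [/INST]",
    "parameters": {"max_new_tokens": 256, "temperature": 0.7, "top_p": 0.95, "do_sample": True},
    "options": {"wait_for_model": True},  # hold the request open while the model warms up
}

resp = requests.post(API_URL, headers=HEADERS, json=payload, timeout=300)
if resp.status_code == 200:
    print(resp.json()[0]["generated_text"])  # typical success shape: [{"generated_text": "..."}]
elif resp.status_code == 503:
    print("Model loading, retry in ~", resp.json().get("estimated_time"), "s")
else:
    print("API error:", resp.status_code, resp.text)
```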
  def respond(
      message,

      top_p,
  ):
      """Respond to user messages"""

+     # Create the messages list in chat format
      messages = [{"role": "system", "content": system_message}]

      for val in history:

      messages.append({"role": "user", "content": message})

+     # Set up the inference parameters
+     parameters = {
+         "max_new_tokens": max_tokens,
+         "temperature": temperature,
+         "top_p": top_p,
+         "do_sample": True
+     }
+
+     # Initial message about model status
+     global estimated_time
+     if estimated_time:
+         initial_msg = f"⌛ The model is loading... estimated time: {estimated_time:.0f} seconds. Please be patient."
+     else:
+         initial_msg = "⌛ Working on your request..."
+
+     yield initial_msg
+
+     # Try multiple times with increasing waits
+     max_retries = 6
+     for attempt in range(max_retries):
+         # Check if this is a retry
+         if attempt > 0:
+             wait_time = min(60, 10 * attempt)
+             yield f"⌛ Still working on your request... (attempt {attempt+1}/{max_retries})"
+             time.sleep(wait_time)
+
+         try:
+             # Query the model
+             result = query_model(messages, parameters)
+
+             if result:
+                 # Handle different response formats
+
+                 # List format with generated_text
+                 if isinstance(result, list) and len(result) > 0:
+                     if "generated_text" in result[0]:
+                         yield result[0]["generated_text"]
+                         return
+
+                 # Direct message format
+                 if isinstance(result, dict) and "generated_text" in result:
+                     yield result["generated_text"]
+                     return
+
+                 # String format
+                 if isinstance(result, str):
+                     yield result
+                     return
+
+                 # Raw format as fallback
+                 yield str(result)
+                 return
+
+             # If model is still loading, get the latest estimate
+             if estimated_time and attempt < max_retries - 1:
+                 response = requests.get(API_URL, headers=headers)
+                 if response.status_code == 503 and "estimated_time" in response.json():
+                     estimated_time = response.json()["estimated_time"]
+                     print(f"Updated loading time: {estimated_time:.0f} seconds")
+
+         except Exception as e:
+             print(f"Error in attempt {attempt+1}: {str(e)}")
+             if attempt == max_retries - 1:
+                 yield f"""❌ Sorry, I couldn't generate a response after several attempts.
+
+ Error details: {str(e)}
+
+ Please try again later or contact support if this persists."""
+
+     # If all retries failed
+     yield """❌ The model couldn't be accessed after multiple attempts.
+
+ This could be due to:
+ 1. Heavy server load
+ 2. The model being too large for the current hardware
+ 3. Temporary service issues
+
+ Please try again later."""
+

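The retry loop above re-posts the whole request with growing waits and yields status messages in between. A possible alternative, not used in this commit, is `huggingface_hub.InferenceClient`, which wraps the same endpoint and can stream tokens back to Gradio. A rough sketch, assuming the package is installed and the chat messages have already been flattened into a prompt string; the function name is illustrative:

```python
# Sketch: streaming text generation through huggingface_hub's InferenceClient.
import os
from huggingface_hub import InferenceClient

client = InferenceClient(model="Trinoid/Data_Management_Mistral", token=os.environ.get("HF_TOKEN"))

def stream_reply(prompt, max_tokens=512, temperature=0.7, top_p=0.95):
    text = ""
    # With stream=True the client yields generated tokens as plain strings.
    for token in client.text_generation(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        stream=True,
    ):
        text += token
        yield text  # Gradio's ChatInterface renders each partial string as it arrives
```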
  """
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface

          ),
      ],
      description="""This interface uses a fine-tuned Mistral model for Microsoft 365 data management.
+     The model is accessed via the Hugging Face Inference API.
+     First requests may take 2-3 minutes as the model loads."""
  )
