Spaces:

Kazel
/

demo

Runtime error

App Files Community

Kazel committed on Sep 1

Commit

aada01f

verified ·

1 Parent(s): c70cfb9

Upload 5 files

Browse files

Files changed (4) hide show

app.py +0 -0
middleware.py +4 -4
milvus_manager.py +9 -6
rag.py +179 -36

app.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

middleware.py CHANGED Viewed

@@ -46,16 +46,16 @@ class Middleware:
-    def search(self, search_queries: list[str]):
-        print(f"Searching for {len(search_queries)} queries")
         final_res = []
         for query in search_queries:
             print(f"Searching for query: {query}")
             query_vec = colpali_manager.process_text([query])[0]
-            search_res = self.milvus_manager.search(query_vec, topk=1)
-            print(f"Search result: {search_res} for query: {query}")
             final_res.append(search_res)
         return final_res

+    def search(self, search_queries: list[str], topk: int = 10):
+        """
+        Run one vector search per text query and gather the results.
+        Each query string is passed to colpali_manager.process_text and the
+        resulting vector is handed to self.milvus_manager.search with topk.
+        Returns a list holding one search result per query, in input order.
+        """
+        print(f"Searching for {len(search_queries)} queries with topk={topk}")
+        collected = []
+        for query in search_queries:
+            print(f"Searching for query: {query}")
+            # process_text is called with a one-element list; [0] picks this query's entry.
+            embedding = colpali_manager.process_text([query])[0]
+            search_res = self.milvus_manager.search(embedding, topk=topk)
+            print(f"Search result: {len(search_res)} results for query: {query}")
+            collected.append(search_res)
+        return collected

milvus_manager.py CHANGED Viewed

@@ -13,7 +13,7 @@ class MilvusManager:
         dotenv_file = dotenv.find_dotenv()
         dotenv.load_dotenv(dotenv_file)
-        self.client = MilvusClient(uri=milvus_uri)
         self.collection_name = collection_name
         self.dim = dim
@@ -50,10 +50,13 @@ class MilvusManager:
         index_params.add_index(
             field_name="vector",
-            metric_type="COSINE",
-            index_type="IVF_FLAT",
             index_name="vector_index",
-            params={ "nlist": 128 }
         )
         self.client.create_index(
@@ -65,7 +68,7 @@ class MilvusManager:
         collections = self.client.list_collections()
         # Set search parameters (here, using Inner Product metric).
-        search_params = {"metric_type": "COSINE", "params": {}} #default metric type is "IP"
         # Set to store unique (doc_id, collection_name) pairs across all collections.
         doc_collection_pairs = set()
@@ -121,7 +124,7 @@ class MilvusManager:
         # Unload the collection after search to free memory.
         self.client.release_collection(collection_name=collection)
-        return scores[:topk] if len(scores) >= topk else scores
         """
         search_params = {"metric_type": "IP", "params": {}}
         results = self.client.search(

         dotenv_file = dotenv.find_dotenv()
         dotenv.load_dotenv(dotenv_file)
+        self.client = MilvusClient(uri="http://localhost:19530", token="root:Milvus")
         self.collection_name = collection_name
         self.dim = dim
         index_params.add_index(
             field_name="vector",
             index_name="vector_index",
+            index_type="HNSW", #use HNSW option if got more mem, if not use IVF for faster processing
+            metric_type=os.environ["metrictype"], #"IP"
+            params={
+                "M": int(os.environ["mnum"]), #M:16 for HNSW, capital M
+                "efConstruction": int(os.environ["efnum"]), #500 for HNSW
+            },
         )
         self.client.create_index(
         collections = self.client.list_collections()
        # Set search parameters; the metric type is read from the "metrictype" environment variable.
+        search_params = {"metric_type": os.environ["metrictype"], "params": {}} #default metric type is "IP"
         # Set to store unique (doc_id, collection_name) pairs across all collections.
         doc_collection_pairs = set()
         # Unload the collection after search to free memory.
         self.client.release_collection(collection_name=collection)
+        return scores[:topk] if len(scores) >= topk else scores #topk is the number of scores to return back
         """
         search_params = {"metric_type": "IP", "params": {}}
         results = self.client.search(

rag.py CHANGED Viewed

@@ -1,27 +1,77 @@
 import requests
 import os
 from typing import List
 from utils import encode_image
 from PIL import Image
 import torch
 import subprocess
 import psutil
 import torch
 from transformers import AutoModel, AutoTokenizer
-import google.generativeai as genai
 class Rag:
     def get_answer_from_gemini(self, query, imagePaths):
         print(f"Querying Gemini for query={query}, imagePaths={imagePaths}")
         try:
-            genai.configure(api_key="AIzaSyBF-MJKxRROIr-X6YiG1_8uOHrFZDX3IBI")
-            model = genai.GenerativeModel('gemini-2.5-flash')
             images = [Image.open(path) for path in imagePaths]
@@ -45,35 +95,10 @@ class Rag:
         #import environ variables from .env
         import dotenv
-         # Load the .env file
         dotenv_file = dotenv.find_dotenv()
         dotenv.load_dotenv(dotenv_file)
-        """ #scuffed local hf inference (transformers incompatible to colpali version req, use ollama, more reliable, easier to use plus web server ready)
-        print(f"Querying for query={query}, imagesPaths={imagesPaths}")
-        model = AutoModel.from_pretrained(
-            'openbmb/MiniCPM-o-2_6-int4',
-            trust_remote_code=True,
-            attn_implementation='flash_attention_2', # sdpa or flash_attention_2
-            torch_dtype=torch.bfloat16,
-            init_vision=True,
-        )
-        model = model.eval().cuda()
-        tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6-int4', trust_remote_code=True)
-        image = Image.open(imagesPaths[0]).convert('RGB')
-        msgs = [{'role': 'user', 'content': [image, query]}]
-        answer = model.chat(
-            image=None,
-            msgs=msgs,
-            tokenizer=tokenizer
-        )
-        print(answer)
-        return answer
-        """
         #ollama method below
         torch.cuda.empty_cache() #release cuda so that ollama can use gpu!
@@ -82,31 +107,149 @@ class Rag:
         os.environ['OLLAMA_FLASH_ATTENTION'] = os.environ['flashattn'] #int "1"
         if os.environ['ollama'] == "minicpm-v":
             os.environ['ollama'] = "minicpm-v:8b-2.6-q8_0" #set to quantized version
         # Close model thread (colpali)
         print(f"Querying OpenAI for query={query}, imagesPaths={imagesPaths}")
-        from ollama import chat
         try:
             response = chat(
-                    model=os.environ['ollama'],
                     messages=[
                     {
                     'role': 'user',
-                    'content': query,
                     'images': imagesPaths,
                     "temperature":float(os.environ['temperature']), #test if temp makes a diff
                     }
                 ],
                 )
             answer = response.message.content
-            print(answer)
-            return answer
         except Exception as e:
             print(f"An error occurred while querying OpenAI: {e}")
@@ -153,4 +296,4 @@ class Rag:
 #     query = "Based on attached images, how many new cases were reported during second wave peak"
 #     imagesPaths = ["covid_slides_page_8.png", "covid_slides_page_8.png"]
-#     rag.get_answer_from_gemini(query, imagesPaths)

 import requests
 import os
+import re
 from typing import List
 from utils import encode_image
 from PIL import Image
+from ollama import chat
 import torch
 import subprocess
 import psutil
 import torch
 from transformers import AutoModel, AutoTokenizer
+from google import genai
 class Rag:
+    def _clean_raw_token_response(self, response_text):
+        """
+        Strip undecoded special tokens from a model response.
+        Handles cases where a model returns raw tokens (e.g. <unused12>, <eos>,
+        [multimodal]) instead of fully decoded text.
+        Args:
+            response_text: the raw response string; falsy values (None, "") are
+                returned unchanged.
+        Returns:
+            - response_text unchanged when it contains none of the known tokens;
+            - otherwise the text with tokens removed, runs of spaces/tabs collapsed
+              and line breaks preserved (so lists and tables keep their layout);
+            - a "Model Response Error" message when fewer than 10 characters
+              remain after cleaning, which callers use to trigger fallbacks.
+        """
+        if not response_text:
+            return response_text
+        # One shared pattern list drives both detection and removal, so the two
+        # steps cannot drift apart. Removal runs in this order.
+        token_patterns = (
+            r'<unused\d+>',                  # unused tokens
+            r'<(?:bos|eos|unk|mask|pad)>',   # sequence / unknown / mask / padding tokens
+            r'\[multimodal\]',               # multimodal tokens
+        )
+        if not any(re.search(pattern, response_text) for pattern in token_patterns):
+            return response_text
+        print("⚠️  Detected raw token response, attempting to clean...")
+        cleaned_text = response_text
+        for pattern in token_patterns:
+            cleaned_text = re.sub(pattern, '', cleaned_text)
+        # Collapse horizontal whitespace only; [^\S\n] is "any whitespace except newline".
+        # Flattening newlines too would turn multi-line answers into one long line.
+        cleaned_text = re.sub(r'[^\S\n]+', ' ', cleaned_text)
+        # Drop the single space that token removal can leave next to a line break.
+        cleaned_text = re.sub(r' ?\n ?', '\n', cleaned_text)
+        # Keep at most one blank line between paragraphs.
+        cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text).strip()
+        # If almost nothing is left, the response was mostly tokens: report an error.
+        if len(cleaned_text) < 10:
+            return "❌ **Model Response Error**: The model returned raw token IDs instead of decoded text. This may be due to model configuration issues. Please try:\n\n1. Restarting the Ollama server\n2. Using a different model\n3. Checking model compatibility with multimodal inputs"
+        return cleaned_text
     def get_answer_from_gemini(self, query, imagePaths):
         print(f"Querying Gemini for query={query}, imagePaths={imagePaths}")
         try:
+            genai.configure(api_key='AIzaSyCwRr9054tCuh2S8yGpwKFvOAxYMT4WNIs')
+            model = genai.GenerativeModel('gemini-2.0-flash')
             images = [Image.open(path) for path in imagePaths]
         #import environ variables from .env
         import dotenv
+        # Load the .env file
         dotenv_file = dotenv.find_dotenv()
         dotenv.load_dotenv(dotenv_file)
         #ollama method below
         torch.cuda.empty_cache() #release cuda so that ollama can use gpu!
         os.environ['OLLAMA_FLASH_ATTENTION'] = os.environ['flashattn'] #int "1"
         if os.environ['ollama'] == "minicpm-v":
             os.environ['ollama'] = "minicpm-v:8b-2.6-q8_0" #set to quantized version
+        elif os.environ['ollama'] == "gemma3":
+            os.environ['ollama'] = "gemma3:12b" #set to upscaled version
+            # Add specific environment variables for Gemma3 to prevent raw token issues
+            os.environ['OLLAMA_KEEP_ALIVE'] = "5m"
+            os.environ['OLLAMA_ORIGINS'] = "*"
         # Close model thread (colpali)
         print(f"Querying OpenAI for query={query}, imagesPaths={imagesPaths}")
         try:
+            # Enhanced prompt for more detailed responses with explicit page usage
+            enhanced_query = f"""
+            Please provide a comprehensive and detailed answer to the following query.
+            Use ALL available information from the provided document images to give a thorough response.
+            Query: {query}
+            CRITICAL INSTRUCTIONS:
+            - You have been provided with {len(imagesPaths)} document page(s)
+            - You MUST reference information from ALL {len(imagesPaths)} page(s) in your response
+            - Do not skip any pages - each page contains relevant information
+            - If you mention one page, you must also mention the others
+            - Ensure your response reflects the complete information from all pages
+            Instructions for detailed response:
+            1. Provide extensive background information and context
+            2. Include specific details, examples, and data points from ALL documents
+            3. Explain concepts thoroughly with step-by-step breakdowns
+            4. Provide comprehensive analysis rather than simple answers when requested
+            5. Explicitly reference each page and what information it contributes
+            6. Cross-reference information between pages when relevant
+            7. Ensure no page is left unmentioned in your analysis
+            SPECIAL INSTRUCTIONS FOR TABULAR DATA:
+            - If the query requests a table, list, or structured data, organize your response in a clear, structured format
+            - Use numbered lists, bullet points, or clear categories when appropriate
+            - Include specific data points or comparisons when available
+            - Structure information in a way that can be easily converted to a table format
+            IMPORTANT: Respond with natural, human-readable text only. Do not include any special tokens, codes, or technical identifiers in your response.
+            Make sure to acknowledge and use information from all {len(imagesPaths)} provided pages.
+            """
+            # Try with current model first
+            current_model = os.environ['ollama']
+            # Set different options based on the model
+            if "gemma3" in current_model.lower():
+                # Specific options for Gemma3 to prevent raw token issues
+                model_options = {
+                    "num_predict": 1024,  # Shorter responses for Gemma3
+                    "stop": ["<eos>", "<|endoftext|>", "</s>", "<|im_end|>"],  # More stop tokens
+                    "top_k": 20,  # Lower top_k for more focused generation
+                    "top_p": 0.8,  # Lower top_p for more deterministic output
+                    "repeat_penalty": 1.2,  # Higher repeat penalty
+                    "seed": 42,  # Consistent results
+                    "temperature": 0.7,  # Lower temperature for more focused responses
+                }
+            else:
+                # Default options for other models
+                model_options = {
+                    "num_predict": 2048,  # Limit response length
+                    "stop": ["<eos>", "<|endoftext|>", "</s>"],  # Stop at end tokens
+                    "top_k": 40,  # Reduce randomness
+                    "top_p": 0.9,  # Nucleus sampling
+                    "repeat_penalty": 1.1,  # Prevent repetition
+                    "seed": 42,  # Consistent results
+                }
             response = chat(
+                    model=current_model,
                     messages=[
                     {
                     'role': 'user',
+                    'content': enhanced_query,
                     'images': imagesPaths,
                     "temperature":float(os.environ['temperature']), #test if temp makes a diff
                     }
                 ],
+                options=model_options
                 )
             answer = response.message.content
+            # Clean the response to handle raw token issues
+            cleaned_answer = self._clean_raw_token_response(answer)
+            # If the cleaned answer is still problematic, try fallback models
+            if cleaned_answer and "❌ **Model Response Error**" in cleaned_answer:
+                print(f"⚠️  Primary model {current_model} failed, trying fallback models...")
+                # List of fallback models to try
+                fallback_models = [
+                    "llama3.2-vision:latest",
+                    "llava:latest",
+                    "bakllava:latest",
+                    "llama3.2:latest"
+                ]
+                for fallback_model in fallback_models:
+                    try:
+                        print(f"🔄 Trying fallback model: {fallback_model}")
+                        response = chat(
+                            model=fallback_model,
+                            messages=[
+                            {
+                            'role': 'user',
+                            'content': enhanced_query,
+                            'images': imagesPaths,
+                            "temperature":float(os.environ['temperature']),
+                            }
+                        ],
+                        options={
+                            "num_predict": 2048,
+                            "stop": ["<eos>", "<|endoftext|>", "</s>"],
+                            "top_k": 40,
+                            "top_p": 0.9,
+                            "repeat_penalty": 1.1,
+                            "seed": 42,
+                        }
+                        )
+                        fallback_answer = response.message.content
+                        cleaned_fallback = self._clean_raw_token_response(fallback_answer)
+                        if cleaned_fallback and "❌ **Model Response Error**" not in cleaned_fallback:
+                            print(f"✅ Fallback model {fallback_model} succeeded")
+                            return cleaned_fallback
+                    except Exception as fallback_error:
+                        print(f"❌ Fallback model {fallback_model} failed: {fallback_error}")
+                        continue
+                # If all fallbacks fail, return the original error
+                return cleaned_answer
+            print(f"Original response: {answer}")
+            print(f"Cleaned response: {cleaned_answer}")
+            return cleaned_answer
         except Exception as e:
             print(f"An error occurred while querying OpenAI: {e}")
 #     query = "Based on attached images, how many new cases were reported during second wave peak"
 #     imagesPaths = ["covid_slides_page_8.png", "covid_slides_page_8.png"]
+#     rag.get_answer_from_gemini(query, imagesPaths)