Xalt8 committed on
Commit
1dcc70b
·
1 Parent(s): 466b7d1

reranking working

Browse files
rag_app/loading_data/load_chroma_db_cross_platform.py CHANGED
@@ -8,9 +8,6 @@ import sys
8
  import zipfile
9
 
10
 
11
- S3_LOCATION = os.getenv("S3_LOCATION")
12
-
13
-
14
  def download_chroma_from_s3(s3_location:str,
15
  chroma_vs_name:str,
16
  vectorstore_folder:str,
@@ -32,20 +29,27 @@ def download_chroma_from_s3(s3_location:str,
32
  # Initialize an S3 client with unsigned configuration for public access
33
  s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
34
  s3.download_file(s3_location, chroma_vs_name, vs_save_path)
 
35
 
36
  # Extract the zip file
37
  with zipfile.ZipFile(file=str(vs_save_path), mode='r') as zip_ref:
38
  zip_ref.extractall(path=vectorstore_folder)
39
-
 
40
  except Exception as e:
41
  print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
42
 
43
  # Delete the zip file
44
  vs_save_path.unlink()
 
45
 
46
  if __name__ == "__main__":
 
 
 
47
  chroma_vs_name = "vectorstores/chroma-zurich-mpnet-1500.zip"
48
- project_dir = Path().cwd().parent
 
49
  vs_destination = str(project_dir / 'vectorstore')
50
  assert Path(vs_destination).is_dir(), "Cannot find vectorstore folder"
51
 
 
8
  import zipfile
9
 
10
 
 
 
 
11
  def download_chroma_from_s3(s3_location:str,
12
  chroma_vs_name:str,
13
  vectorstore_folder:str,
 
29
  # Initialize an S3 client with unsigned configuration for public access
30
  s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
31
  s3.download_file(s3_location, chroma_vs_name, vs_save_path)
32
+ print('Downloaded file from S3')
33
 
34
  # Extract the zip file
35
  with zipfile.ZipFile(file=str(vs_save_path), mode='r') as zip_ref:
36
  zip_ref.extractall(path=vectorstore_folder)
37
+ print("Extracted zip file")
38
+
39
  except Exception as e:
40
  print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
41
 
42
  # Delete the zip file
43
  vs_save_path.unlink()
44
+ print("Deleting zip file")
45
 
46
  if __name__ == "__main__":
47
+
48
+ S3_LOCATION = os.getenv("S3_LOCATION")
49
+
50
  chroma_vs_name = "vectorstores/chroma-zurich-mpnet-1500.zip"
51
+
52
+ project_dir = Path().cwd().parent.parent
53
  vs_destination = str(project_dir / 'vectorstore')
54
  assert Path(vs_destination).is_dir(), "Cannot find vectorstore folder"
55
 
rag_app/reranking.py CHANGED
@@ -80,31 +80,29 @@ def get_reranked_docs_chroma(query:str,
80
 
81
  Returns: A list of documents with the highest rank
82
  """
83
- assert num_docs <= 10, "num_docs should be less than similarity search results"
84
-
85
  embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
86
  model_name=embedding_model)
87
  # Load the vectorstore database
88
  db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)
89
 
90
- # Get 10 documents based on similarity search
91
  sim_docs = db.similarity_search(query=query, k=10)
92
 
93
- # Add the page_content, description and title together
94
  passages = [doc.page_content for doc in sim_docs]
95
 
96
  # Prepare the payload
97
  payload = {"inputs":
98
  {"source_sentence": query,
99
  "sentences": passages}}
100
-
101
 
102
  headers = {"Authorization": f"Bearer {hf_api_key}"}
103
 
104
  response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
 
105
  if response.status_code != 200:
106
  print('Something went wrong with the response')
107
  return
 
108
  similarity_scores = response.json()
109
  ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
110
  top_k_results = ranked_results[:num_docs]
@@ -113,16 +111,17 @@ def get_reranked_docs_chroma(query:str,
113
 
114
 
115
  if __name__ == "__main__":
116
-
 
117
  HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
118
  EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
119
 
120
  project_dir = Path().cwd().parent
121
  path_to_vector_db = str(project_dir/'vectorstore/chroma-zurich-mpnet-1500')
 
122
 
123
  query = "I'm looking for student insurance"
124
 
125
-
126
  re_ranked_docs = get_reranked_docs_chroma(query=query,
127
  path_to_db= path_to_vector_db,
128
  embedding_model=EMBEDDING_MODEL,
 
80
 
81
  Returns: A list of documents with the highest rank
82
  """
 
 
83
  embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
84
  model_name=embedding_model)
85
  # Load the vectorstore database
86
  db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)
87
 
88
+ # Get k documents based on similarity search
89
  sim_docs = db.similarity_search(query=query, k=10)
90
 
 
91
  passages = [doc.page_content for doc in sim_docs]
92
 
93
  # Prepare the payload
94
  payload = {"inputs":
95
  {"source_sentence": query,
96
  "sentences": passages}}
 
97
 
98
  headers = {"Authorization": f"Bearer {hf_api_key}"}
99
 
100
  response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
101
+ print(f'{response = }')
102
  if response.status_code != 200:
103
  print('Something went wrong with the response')
104
  return
105
+
106
  similarity_scores = response.json()
107
  ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
108
  top_k_results = ranked_results[:num_docs]
 
111
 
112
 
113
  if __name__ == "__main__":
114
+
115
+
116
  HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
117
  EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
118
 
119
  project_dir = Path().cwd().parent
120
  path_to_vector_db = str(project_dir/'vectorstore/chroma-zurich-mpnet-1500')
121
+ assert Path(path_to_vector_db).exists(), "Cannot access path_to_vector_db "
122
 
123
  query = "I'm looking for student insurance"
124
 
 
125
  re_ranked_docs = get_reranked_docs_chroma(query=query,
126
  path_to_db= path_to_vector_db,
127
  embedding_model=EMBEDDING_MODEL,