Merge pull request #56 from almutareb/52-move-metadata-creation-and-supplement-into-own-module
- .gitignore +5 -1
- config.py +16 -1
- test_this.py → cookbook/sample_get_faiss.py +0 -0
- pytest.ini +2 -0
- rag_app/__init__.py +7 -0
- rag_app/get_db_retriever.py +0 -30
- rag_app/hybrid_search.py +0 -63
- rag_app/knowledge_base/build_vector_store.py +0 -85
- rag_app/knowledge_base/create_embedding.py +0 -54
- rag_app/{multi_index_search.py → knowledge_base/multi_index_search.py} +0 -0
- rag_app/knowledge_base/utils.py +116 -4
- rag_app/reranking.py +0 -131
- rag_app/structured_tools/structured_tools.py +4 -4
- rag_app/vector_store_handler/__init__.py +0 -0
- rag_app/vector_store_handler/vectorstores.py +325 -0
- tests/integration/test_vector_store_integration.py +89 -0
- tests/vector_store_handler/test_vectorstores.py +88 -0
.gitignore
CHANGED
@@ -168,4 +168,8 @@ cython_debug/
 
 # Databases
 
-*.db
+*.db
+
+
+# editor realted files
+.vscode/
config.py
CHANGED
@@ -2,16 +2,31 @@ import os
 from dotenv import load_dotenv
 from rag_app.database.db_handler import DataBaseHandler
 from langchain_huggingface import HuggingFaceEndpoint
+# from langchain_huggingface import HuggingFaceHubEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings
 
 load_dotenv()
 
 SQLITE_FILE_NAME = os.getenv('SOURCES_CACHE')
-
+VECTOR_DATABASE_LOCATION = os.getenv('VECTOR_DATABASE_LOCATION')
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
 BERT_MODEL = os.getenv("BERT_MODEL")
+FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
 
+
+# embeddings = HuggingFaceHubEmbeddings(repo_id=EMBEDDING_MODEL)
+
+model_kwargs = {'device': 'cpu'}
+encode_kwargs = {'normalize_embeddings': False}
+embeddings = HuggingFaceEmbeddings(
+    model_name=EMBEDDING_MODEL,
+    model_kwargs=model_kwargs,
+    encode_kwargs=encode_kwargs
+)
+
 db = DataBaseHandler()
 
 db.create_all_tables()
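With this change config.py becomes the single place where the embedding model, store locations and API token are resolved from the environment, and it exposes a ready-made embeddings object. A minimal sketch of how a downstream module might reuse these shared objects (the retriever wiring below is illustrative only, not part of this commit):

# illustrative sketch only: reuse the shared config objects from another module
from config import embeddings, FAISS_INDEX_PATH
from langchain_community.vectorstores import FAISS

# load a previously built index with the same HuggingFaceEmbeddings instance;
# allow_dangerous_deserialization is required by current LangChain FAISS loaders
db = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
retriever = db.as_retriever(search_kwargs={"k": 5})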
test_this.py → cookbook/sample_get_faiss.py
RENAMED
File without changes
pytest.ini
ADDED
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = .
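The two-line pytest.ini puts the repository root on the import path, so the test modules added below can import config and rag_app without installing the package; assuming pytest is installed, the new suites are typically run from the repository root with `pytest tests/ -v`.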
rag_app/__init__.py
CHANGED
@@ -0,0 +1,7 @@
+import sys
+from pathlib import Path
+
+# Add the project root to the Python path
+project_root = str(Path(__file__).parent.parent)
+if project_root not in sys.path:
+    sys.path.append(project_root)
rag_app/get_db_retriever.py
DELETED
@@ -1,30 +0,0 @@
-# retriever and qa_chain function
-
-# HF libraries
-from langchain.llms import HuggingFaceHub
-from langchain_huggingface import HuggingFaceHubEmbeddings
-# vectorestore
-from langchain_community.vectorstores import FAISS
-# retrieval chain
-from langchain.chains import RetrievalQA
-# prompt template
-from langchain.prompts import PromptTemplate
-from langchain.memory import ConversationBufferMemory
-
-
-def get_db_retriever(vector_db:str=None):
-    model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
-    embeddings = HuggingFaceHubEmbeddings(repo_id=model_name)
-
-    #db = Chroma(persist_directory="./vectorstore/lc-chroma-multi-mpnet-500", embedding_function=embeddings)
-    #db.get()
-    if not vector_db:
-        FAISS_INDEX_PATH='./vectorstore/py-faiss-multi-mpnet-500'
-    else:
-        FAISS_INDEX_PATH=vector_db
-    db = FAISS.load_local(FAISS_INDEX_PATH, embeddings)
-
-    retriever = db.as_retriever()
-
-    return retriever
-
rag_app/hybrid_search.py
DELETED
@@ -1,63 +0,0 @@
-from pathlib import Path
-from langchain_community.vectorstores import FAISS
-from dotenv import load_dotenv
-import os
-from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
-from langchain.retrievers import EnsembleRetriever
-from langchain_community.retrievers import BM25Retriever
-
-
-def get_hybrid_search_results(query:str,
-                              path_to_db:str,
-                              embedding_model:str,
-                              hf_api_key:str,
-                              num_docs:int=5) -> list:
-    """ Uses an ensemble retriever of BM25 and FAISS to return k num documents
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns
-        List of documents
-
-    """
-
-    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
-                                                   model_name=embedding_model)
-    # Load the vectorstore database
-    db = FAISS.load_local(folder_path=path_to_db,
-                          embeddings=embeddings,
-                          allow_dangerous_deserialization=True)
-
-    all_docs = db.similarity_search("", k=db.index.ntotal)
-
-    bm25_retriever = BM25Retriever.from_documents(all_docs)
-    bm25_retriever.k = num_docs # How many results you want
-
-    faiss_retriever = db.as_retriever(search_kwargs={'k': num_docs})
-
-    ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever],
-                                           weights=[0.5,0.5])
-
-    results = ensemble_retriever.invoke(input=query)
-    return results
-
-
-if __name__ == "__main__":
-    query = "Haustierversicherung"
-    HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
-
-    path_to_vector_db = Path("..")/'vectorstore/faiss-insurance-agent-500'
-
-    results = get_hybrid_search_results(query=query,
-                                        path_to_db=path_to_vector_db,
-                                        embedding_model=EMBEDDING_MODEL,
-                                        hf_api_key=HUGGINGFACEHUB_API_TOKEN)
-
-    for doc in results:
-        print(doc)
-        print()
rag_app/knowledge_base/build_vector_store.py
DELETED
@@ -1,85 +0,0 @@
-# vectorization functions
-from langchain_community.vectorstores import FAISS
-from langchain_community.vectorstores import Chroma
-#from langchain_community.document_loaders import DirectoryLoader
-#from langchain_text_splitters import RecursiveCharacterTextSplitter
-#from langchain_community.embeddings.sentence_transformer import (
-#    SentenceTransformerEmbeddings,
-#)
-#from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_community.retrievers import BM25Retriever
-from rag_app.knowledge_base.create_embedding import create_embeddings
-from rag_app.utils.generate_summary import generate_description, generate_keywords
-import time
-import os
-#from dotenv import load_dotenv
-
-def build_vector_store(
-    docs: list,
-    db_path: str,
-    embedding_model: str,
-    new_db:bool=False,
-    chunk_size:int=500,
-    chunk_overlap:int=50,
-):
-    """
-
-    """
-
-    if db_path is None:
-        FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
-    else:
-        FAISS_INDEX_PATH = db_path
-
-    embeddings,chunks = create_embeddings(docs, chunk_size, chunk_overlap, embedding_model)
-    # for chunk in chunks:
-    #     keywords=generate_keywords(chunk)
-    #     description=generate_description(chunk)
-    #     chunk.metadata['keywords']=keywords
-    #     chunk.metadata['description']=description
-
-    #load chunks into vector store
-    print(f'Loading chunks into faiss vector store ...')
-    st = time.time()
-    if new_db:
-        db_faiss = FAISS.from_documents(chunks, embeddings)
-        bm25_retriever = BM25Retriever.from_documents(chunks)
-    else:
-        db_faiss = FAISS.add_documents(chunks, embeddings)
-        bm25_retriever = BM25Retriever.add_documents(chunks)
-    db_faiss.save_local(FAISS_INDEX_PATH)
-    et = time.time() - st
-    print(f'Time taken: {et} seconds.')
-
-    print(f'Loading chunks into chroma vector store ...')
-    st = time.time()
-    persist_directory='./vectorstore/chroma-insurance-agent-1500'
-    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
-    et = time.time() - st
-    print(f'Time taken: {et} seconds.')
-    result = f"built vectore store at {FAISS_INDEX_PATH}"
-    return result
-
-
-# # Path for saving the FAISS index
-# FAISS_INDEX_PATH = "./vectorstore/lc-faiss-multi-mpnet-500"
-
-# try:
-#     # Stage two: Vectorization of the document chunks
-#     model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1" # Model used for embedding
-
-#     # Initialize HuggingFace embeddings with the specified model
-#     embeddings = HuggingFaceEmbeddings(model_name=model_name)
-
-#     print(f'Loading chunks into vector store ...')
-#     st = time.time() # Start time for performance measurement
-#     # Create a FAISS vector store from the document chunks and save it locally
-#     db = FAISS.from_documents(filter_complex_metadata(chunks), embeddings)
-#     db.save_local(FAISS_INDEX_PATH)
-#     et = time.time() - st # Calculate time taken for vectorization
-#     print(f'Time taken for vectorization and saving: {et} seconds.')
-# except Exception as e:
-#     print(f"Error during vectorization or FAISS index saving: {e}", file=sys.stderr)
-
-# alternatively download a preparaed vectorized index from S3 and load the index into vectorstore
-# Import necessary libraries for AWS S3 interaction, file handling, and FAISS vector stores
rag_app/knowledge_base/create_embedding.py
DELETED
@@ -1,54 +0,0 @@
-# embeddings functions
-#from langchain_community.vectorstores import FAISS
-#from langchain_community.document_loaders import ReadTheDocsLoader
-#from langchain_community.vectorstores.utils import filter_complex_metadata
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-# from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_community.embeddings.sentence_transformer import (
-    SentenceTransformerEmbeddings,
-)
-import time
-from langchain_core.documents import Document
-
-
-def create_embeddings(
-    docs: list[Document],
-    chunk_size:int = 500,
-    chunk_overlap:int = 50,
-    embedding_model: str = "sentence-transformers/multi-qa-mpnet-base-dot-v1",
-):
-    """given a sequence of `Document` objects this fucntion will
-    generate embeddings for it.
-
-    ## argument
-    :params docs (list[Document]) -> list of `list[Document]`
-    :params chunk_size (int) -> chunk size in which documents are chunks, defaults to 500
-    :params chunk_overlap (int) -> the amount of token that will be overlapped between chunks, defaults to 50
-    :params embedding_model (str) -> the huggingspace model that will embed the documents
-    ## Return
-    Tuple of embedding and chunks
-    """
-
-
-    text_splitter = RecursiveCharacterTextSplitter(
-        separators=["\n\n", "\n", "(?<=\. )", " ", ""],
-        chunk_size = chunk_size,
-        chunk_overlap = chunk_overlap,
-        length_function = len,
-    )
-
-    # Stage one: read all the docs, split them into chunks.
-    st = time.time()
-    print('Loading documents and creating chunks ...')
-
-    # Split each document into chunks using the configured text splitter
-    chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
-    et = time.time() - st
-    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
-
-    #Stage two: embed the docs.
-    #embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-    embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)
-    print(f"created a total of {len(chunks)} chunks")
-
-    return embeddings,chunks
rag_app/{multi_index_search.py → knowledge_base/multi_index_search.py}
RENAMED
File without changes
rag_app/knowledge_base/utils.py
CHANGED
@@ -1,10 +1,75 @@
 from langchain_core.documents import Document
 from chains import generate_document_summary_prompt
-
+# embeddings functions
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+import time
+from langchain_core.language_models import BaseChatModel
+from langchain.retrievers import VectorStoreRetriever
+from langchain_core.vectorstores import VectorStoreRetriever
+# vectorization functions
+from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores import Chroma
+from langchain_community.retrievers import BM25Retriever
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+
+
+from pathlib import Path
+from langchain_community.vectorstores import FAISS
+from dotenv import load_dotenv
+import os
+import requests
+
+from rag_app.knowledge_base.utils import create_embeddings
+from rag_app.utils.generate_summary import generate_description, generate_keywords
+from config import EMBEDDING_MODEL, FAISS_INDEX_PATH, SEVEN_B_LLM_MODEL
+
+def create_embeddings(
+    docs: list[Document],
+    chunk_size:int = 500,
+    chunk_overlap:int = 50,
+):
+    """given a sequence of `Document` objects this fucntion will
+    generate embeddings for it.
+
+    ## argument
+    :params docs (list[Document]) -> list of `list[Document]`
+    :params chunk_size (int) -> chunk size in which documents are chunks, defaults to 500
+    :params chunk_overlap (int) -> the amount of token that will be overlapped between chunks, defaults to 50
+    :params embedding_model (str) -> the huggingspace model that will embed the documents
+    ## Return
+    Tuple of embedding and chunks
+    """
+
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        separators=["\n\n", "\n", "(?<=\. )", " ", ""],
+        chunk_size = chunk_size,
+        chunk_overlap = chunk_overlap,
+        length_function = len,
+    )
+
+    # Stage one: read all the docs, split them into chunks.
+    st = time.time()
+    print('Loading documents and creating chunks ...')
+
+    # Split each document into chunks using the configured text splitter
+    chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
+    et = time.time() - st
+    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')
+
+    #Stage two: embed the docs.
+    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
+    print(f"created a total of {len(chunks)} chunks")
+
+    return embeddings,chunks
 
 
 def generate_document_summaries(
-    docs: list[Document]
+    docs: list[Document],
+    llm:BaseChatModel= SEVEN_B_LLM_MODEL,
 ) -> list[Document]:
     """
     Generates summaries for a list of Document objects and updates their metadata with the summaries.
@@ -27,7 +92,7 @@ def generate_document_summaries(
 
     for doc in new_docs:
 
-        genrate_summary_chain = generate_document_summary_prompt |
+        genrate_summary_chain = generate_document_summary_prompt | llm
         summary = genrate_summary_chain.invoke(
             {"document":str(doc.metadata)}
         )
@@ -36,4 +101,51 @@ def generate_document_summaries(
             {"summary":summary}
         )
 
-    return new_docs
+    return new_docs
+
+
+def build_vector_store(
+    docs: list,
+    embedding_model: str,
+    new_db:bool=False,
+    chunk_size:int=500,
+    chunk_overlap:int=50,
+):
+    """
+
+    """
+
+    embeddings,chunks = create_embeddings(
+        docs,
+        chunk_size,
+        chunk_overlap,
+        embedding_model
+    )
+
+    #load chunks into vector store
+    print(f'Loading chunks into faiss vector store ...')
+
+    st = time.time()
+    if new_db:
+        db_faiss = FAISS.from_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.from_documents(chunks)
+    else:
+        db_faiss = FAISS.add_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.add_documents(chunks)
+
+    db_faiss.save_local(FAISS_INDEX_PATH)
+    et = time.time() - st
+    print(f'Time taken: {et} seconds.')
+
+    print(f'Loading chunks into chroma vector store ...')
+
+    st = time.time()
+    persist_directory='./vectorstore/chroma-insurance-agent-1500'
+    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
+    et = time.time() - st
+
+    print(f'Time taken: {et} seconds.')
+    result = f"built vectore store at {FAISS_INDEX_PATH}"
+    return result
+
+
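After this hunk the module owns chunking, embedding, summary generation and index building in one place. A minimal sketch of the intended call sequence, assuming a list of already-loaded Document objects (the sample document and model name below are placeholders, not part of the commit):

# illustrative sketch only: chain the relocated helpers together
from langchain_core.documents import Document
from rag_app.knowledge_base.utils import generate_document_summaries, build_vector_store

docs = [Document(page_content="Pet insurance covers veterinary costs ...",
                 metadata={"source": "https://example.com/pet-insurance"})]

docs = generate_document_summaries(docs)  # enrich metadata with an LLM-generated summary
result = build_vector_store(docs,
                            embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
                            new_db=True)   # writes the FAISS index to FAISS_INDEX_PATH
print(result)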
rag_app/reranking.py
DELETED
@@ -1,131 +0,0 @@
-# from get_db_retriever import get_db_retriever
-from pathlib import Path
-from langchain_community.vectorstores import FAISS
-from dotenv import load_dotenv
-import os
-from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
-import requests
-from langchain_community.vectorstores import Chroma
-
-
-load_dotenv()
-
-
-def get_reranked_docs_faiss(query:str,
-                            path_to_db:str,
-                            embedding_model:str,
-                            hf_api_key:str,
-                            num_docs:int=5) -> list:
-    """ Re-ranks the similarity search results and returns top-k highest ranked docs
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns: A list of documents with the highest rank
-    """
-    assert num_docs <= 10, "num_docs should be less than similarity search results"
-
-    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
-                                                   model_name=embedding_model)
-    # Load the vectorstore database
-    db = FAISS.load_local(folder_path=path_to_db,
-                          embeddings=embeddings,
-                          allow_dangerous_deserialization=True)
-
-    # Get 10 documents based on similarity search
-    docs = db.similarity_search(query=query, k=10)
-
-    # Add the page_content, description and title together
-    passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
-                for doc in docs]
-
-    # Prepare the payload
-    inputs = [{"text": query, "text_pair": passage} for passage in passages]
-
-    API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
-    headers = {"Authorization": f"Bearer {hf_api_key}"}
-
-    response = requests.post(API_URL, headers=headers, json=inputs)
-    scores = response.json()
-
-    try:
-        relevance_scores = [item[1]['score'] for item in scores]
-    except ValueError as e:
-        print('Could not get the relevance_scores -> something might be wrong with the json output')
-        return
-
-    if relevance_scores:
-        ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
-        top_k_results = ranked_results[:num_docs]
-        return [doc for doc, _, _ in top_k_results]
-
-
-
-def get_reranked_docs_chroma(query:str,
-                             path_to_db:str,
-                             embedding_model:str,
-                             hf_api_key:str,
-                             reranking_hf_url:str = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2",
-                             num_docs:int=5) -> list:
-    """ Re-ranks the similarity search results and returns top-k highest ranked docs
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns: A list of documents with the highest rank
-    """
-    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
-                                                   model_name=embedding_model)
-    # Load the vectorstore database
-    db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)
-
-    # Get k documents based on similarity search
-    sim_docs = db.similarity_search(query=query, k=10)
-
-    passages = [doc.page_content for doc in sim_docs]
-
-    # Prepare the payload
-    payload = {"inputs":
-               {"source_sentence": query,
-                "sentences": passages}}
-
-    headers = {"Authorization": f"Bearer {hf_api_key}"}
-
-    response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
-    print(f'{response = }')
-    if response.status_code != 200:
-        print('Something went wrong with the response')
-        return
-
-    similarity_scores = response.json()
-    ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
-    top_k_results = ranked_results[:num_docs]
-    return [doc for doc, _, _ in top_k_results]
-
-
-
-if __name__ == "__main__":
-
-
-    HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
-
-    project_dir = Path().cwd().parent
-    path_to_vector_db = str(project_dir/'vectorstore/chroma-zurich-mpnet-1500')
-    assert Path(path_to_vector_db).exists(), "Cannot access path_to_vector_db "
-
-    query = "I'm looking for student insurance"
-
-    re_ranked_docs = get_reranked_docs_chroma(query=query,
-                                              path_to_db= path_to_vector_db,
-                                              embedding_model=EMBEDDING_MODEL,
-                                              hf_api_key=HUGGINGFACEHUB_API_TOKEN)
-
-
-    print(f"{re_ranked_docs=}")
rag_app/structured_tools/structured_tools.py
CHANGED
@@ -13,9 +13,9 @@ from rag_app.utils.utils import (
 )
 import chromadb
 import os
-from config import db,
+from config import db, VECTOR_DATABASE_LOCATION, EMBEDDING_MODEL
 
-if not os.path.exists(
+if not os.path.exists(VECTOR_DATABASE_LOCATION):
     get_chroma_vs()
 
 @tool
@@ -24,7 +24,7 @@ def memory_search(query:str) -> str:
     This is your primary source to start your search with checking what you already have learned from the past, before going online."""
     # Since we have more than one collections we should change the name of this tool
     client = chromadb.PersistentClient(
-        path=
+        path=VECTOR_DATABASE_LOCATION,
     )
 
     collection_name = os.getenv('CONVERSATION_COLLECTION_NAME')
@@ -71,7 +71,7 @@ def knowledgeBase_search(query:str) -> str:
     # #collection_name=collection_name,
     # embedding_function=embedding_function,
     # )
-    vector_db = Chroma(persist_directory=
+    vector_db = Chroma(persist_directory=VECTOR_DATABASE_LOCATION, embedding_function=embedding_function)
     retriever = vector_db.as_retriever(search_type="mmr", search_kwargs={'k':5, 'fetch_k':10})
     # This is deprecated, changed to invoke
     # LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 0.3.0. Use invoke instead.
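These three hunks replace the truncated `from config import db,` line and the half-finished path arguments with the shared VECTOR_DATABASE_LOCATION, so the Chroma client in memory_search and the retriever in knowledgeBase_search now point at the same persisted store that config.py resolves from the environment.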
rag_app/vector_store_handler/__init__.py
ADDED
File without changes
rag_app/vector_store_handler/vectorstores.py
ADDED
@@ -0,0 +1,325 @@
+from abc import ABC, abstractmethod
+from langchain.vectorstores import Chroma, FAISS
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.document_loaders import TextLoader
+
+
+from langchain_community.embeddings.sentence_transformer import (
+    SentenceTransformerEmbeddings,
+)
+import time
+from langchain_core.documents import Document
+from config import EMBEDDING_MODEL, HUGGINGFACEHUB_API_TOKEN
+from langchain.retrievers import EnsembleRetriever
+from langchain_community.retrievers import BM25Retriever
+import requests
+
+class BaseVectorStore(ABC):
+    """
+    Abstract base class for vector stores.
+
+    This class defines the interface for vector stores and implements
+    common functionality.
+    """
+
+    def __init__(self, embedding_model, persist_directory=None):
+        """
+        Initialize the BaseVectorStore.
+
+        Args:
+            embedding_model: The embedding model to use for vectorizing text.
+            persist_directory (str, optional): Directory to persist the vector store.
+        """
+        self.persist_directory = persist_directory
+        self.embeddings = embedding_model
+        self.vectorstore = None
+
+    def load_and_process_documents(self, file_path, chunk_size=1000, chunk_overlap=0):
+        """
+        Load and process documents from a file.
+
+        Args:
+            file_path (str): Path to the file to load.
+            chunk_size (int): Size of text chunks for processing.
+            chunk_overlap (int): Overlap between chunks.
+
+        Returns:
+            list: Processed documents.
+        """
+        loader = TextLoader(file_path)
+        documents = loader.load()
+        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        return text_splitter.split_documents(documents)
+
+    def get_hybrid_search_result(self,query:str):
+        pass
+
+    @abstractmethod
+    def create_vectorstore(self, texts):
+        """
+        Create a new vector store from the given texts.
+
+        Args:
+            texts (list): List of texts to vectorize and store.
+        """
+        pass
+
+    @abstractmethod
+    def load_existing_vectorstore(self):
+        """
+        Load an existing vector store from the persist directory.
+        """
+        pass
+
+    def similarity_search(self, query):
+        """
+        Perform a similarity search on the vector store.
+
+        Args:
+            query (str): The query text to search for.
+
+        Returns:
+            list: Search results.
+
+        Raises:
+            ValueError: If the vector store is not initialized.
+        """
+        if self.vectorstore is None:
+            raise ValueError("Vector store not initialized. Call create_vectorstore or load_existing_vectorstore first.")
+        return self.vectorstore.similarity_search(query)
+
+    @abstractmethod
+    def save(self):
+        """
+        Save the current state of the vector store.
+        """
+        pass
+
+
+class ChromaVectorStore(BaseVectorStore):
+    """
+    Implementation of BaseVectorStore using Chroma as the backend.
+    """
+
+    def create_vectorstore(self, texts):
+        """
+        Create a new Chroma vector store from the given texts.
+
+        Args:
+            texts (list): List of texts to vectorize and store.
+        """
+        self.vectorstore = Chroma.from_documents(
+            texts,
+            self.embeddings,
+            persist_directory=self.persist_directory
+        )
+
+    def load_existing_vectorstore(self):
+        """
+        Load an existing Chroma vector store from the persist directory.
+
+        Raises:
+            ValueError: If persist_directory is not set.
+        """
+        if self.persist_directory is not None:
+            self.vectorstore = Chroma(
+                persist_directory=self.persist_directory,
+                embedding_function=self.embeddings
+            )
+        else:
+            raise ValueError("Persist directory is required for loading Chroma.")
+
+    def save(self):
+        """
+        Save the current state of the Chroma vector store.
+
+        Raises:
+            ValueError: If the vector store is not initialized.
+        """
+        if not self.vectorstore:
+            raise ValueError("Vector store not initialized. Nothing to save.")
+        self.vectorstore.persist()
+
+    def get_reranked_docs(
+        self,
+        query:str,
+        num_docs:int=5
+    ):
+        """ Re-ranks the similarity search results and returns top-k highest ranked docs
+
+        Args:
+            query (str): The search query
+            path_to_db (str): Path to the vectorstore database
+            embedding_model (str): Embedding model used in the vector store
+            num_docs (int): Number of documents to return
+
+        Returns: A list of documents with the highest rank
+        """
+
+        # Get k documents based on similarity search
+        sim_docs = self.vectorstore.similarity_search(query=query, k=10)
+
+        # Add the page_content, description and title together
+        passages = [doc.page_content for doc in sim_docs]
+
+        # Prepare the payload
+        payload = {"inputs":
+                   {"source_sentence": query,
+                    "sentences": passages}}
+
+        headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+        reranking_hf_url:str = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2"
+
+        response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
+        print(f'{response = }')
+        if response.status_code != 200:
+            print('Something went wrong with the response')
+            return
+
+        similarity_scores = response.json()
+        ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
+        top_k_results = ranked_results[:num_docs]
+        return [doc for doc, _, _ in top_k_results]
+
+
+
+class FAISSVectorStore(BaseVectorStore):
+    """
+    Implementation of BaseVectorStore using FAISS as the backend.
+    """
+
+    def create_vectorstore(self, texts):
+        """
+        Create a new FAISS vector store from the given texts.
+
+        Args:
+            texts (list): List of texts to vectorize and store.
+        """
+        self.vectorstore = FAISS.from_documents(texts, self.embeddings)
+
+    def load_existing_vectorstore(self,allow_dangerous_deserialization:bool=False):
+        """
+        Load an existing FAISS vector store from the persist directory.
+
+        Raises:
+            ValueError: If persist_directory is not set.
+        """
+        if self.persist_directory:
+            self.vectorstore = FAISS.load_local(self.persist_directory, self.embeddings, allow_dangerous_deserialization)
+        else:
+            raise ValueError("Persist directory is required for loading FAISS.")
+
+    def save(self):
+        """
+        Save the current state of the FAISS vector store.
+
+        Raises:
+            ValueError: If the vector store is not initialized.
+        """
+        if self.vectorstore is None:
+            raise ValueError("Vector store not initialized. Nothing to save.")
+        self.vectorstore.save_local(self.persist_directory)
+
+    def get_hybrid_search_result(
+        self,
+        query:str,
+        num_docs:int=5
+    )-> list[Document]:
+        """ Uses an ensemble retriever of BM25 and FAISS to return k num documents
+
+        Args:
+            query (str): The search query
+            path_to_db (str): Path to the vectorstore database
+            embedding_model (str): Embedding model used in the vector store
+            num_docs (int): Number of documents to return
+
+        Returns
+            List of documents
+
+        """
+        all_docs = self.vectorstore.similarity_search("", k=self.vectorstore.index.ntotal)
+        bm25_retriever = BM25Retriever.from_documents(all_docs)
+        bm25_retriever.k = num_docs # How many results you want
+
+        faiss_retriever = self.vectorstore.as_retriever(search_kwargs={'k': num_docs})
+
+        ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever],
+                                               weights=[0.5,0.5])
+
+        results = ensemble_retriever.invoke(input=query)
+        return results
+
+    def get_reranked_docs(
+        self,
+        query:str,
+        num_docs:int=5
+    ):
+
+        # Get 10 documents based on similarity search
+        docs = self.vectorstore.similarity_search(query=query, k=10)
+
+        # Add the page_content, description and title together
+        passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
+                    for doc in docs]
+        # Prepare the payload
+        inputs = [{"text": query, "text_pair": passage} for passage in passages]
+
+        API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
+        headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+        response = requests.post(API_URL, headers=headers, json=inputs)
+        scores = response.json()
+
+        try:
+            relevance_scores = [item[1]['score'] for item in scores]
+        except ValueError as e:
+            print('Could not get the relevance_scores -> something might be wrong with the json output')
+            return
+
+        if relevance_scores:
+            ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
+            top_k_results = ranked_results[:num_docs]
+            return [doc for doc, _, _ in top_k_results]
+
+# Usage example:
+def main():
+    """
+    Example usage of the vector store classes.
+    """
+    # Create an embedding model
+    embedding_model = OpenAIEmbeddings()
+
+    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
+
+
+    # Using Chroma
+    chroma_store = ChromaVectorStore(embedding_model, persist_directory="./chroma_store")
+    texts = chroma_store.load_and_process_documents("docs/placeholder.txt")
+    chroma_store.create_vectorstore(texts)
+    results = chroma_store.similarity_search("Your query here")
+    print("Chroma results:", results[0].page_content)
+    chroma_store.save()
+
+    # Load existing Chroma store
+    existing_chroma = ChromaVectorStore(embedding_model, persist_directory="./chroma_store")
+    existing_chroma.load_existing_vectorstore()
+    results = existing_chroma.similarity_search("Another query")
+    print("Existing Chroma results:", results[0].page_content)
+
+    # Using FAISS
+    faiss_store = FAISSVectorStore(embedding_model, persist_directory="./faiss_store")
+    texts = faiss_store.load_and_process_documents("path/to/your/file.txt")
+    faiss_store.create_vectorstore(texts)
+    results = faiss_store.similarity_search("Your query here")
+    print("FAISS results:", results[0].page_content)
+    faiss_store.save()
+
+    # Load existing FAISS store
+    existing_faiss = FAISSVectorStore(embedding_model, persist_directory="./faiss_store")
+    existing_faiss.load_existing_vectorstore()
+    results = existing_faiss.similarity_search("Another query")
+    print("Existing FAISS results:", results[0].page_content)
+
+if __name__ == "__main__":
+    main()
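The bundled main() only exercises plain similarity search; a short sketch of the two retrieval extras on the FAISS backend, assuming an index already exists on disk, that it was built with the same embedding model, and that HUGGINGFACEHUB_API_TOKEN is set (the query and reuse of the config objects are illustrative placeholders):

# illustrative sketch only: hybrid search and re-ranking on an existing FAISS store
from rag_app.vector_store_handler.vectorstores import FAISSVectorStore
from config import embeddings, FAISS_INDEX_PATH

store = FAISSVectorStore(embeddings, persist_directory=FAISS_INDEX_PATH)
store.load_existing_vectorstore(allow_dangerous_deserialization=True)

hybrid_hits = store.get_hybrid_search_result("pet insurance", num_docs=5)  # BM25 + FAISS ensemble
reranked = store.get_reranked_docs("pet insurance", num_docs=5)            # calls the HF inference API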
tests/integration/test_vector_store_integration.py
ADDED
@@ -0,0 +1,89 @@
+import pytest
+from langchain.schema import Document
+from rag_app.vector_store_handler.vectorstores import ChromaVectorStore, FAISSVectorStore
+# from rag_app.database.init_db import db
+from config import EMBEDDING_MODEL, VECTOR_DATABASE_LOCATION
+from langchain.embeddings import HuggingFaceEmbeddings # Or whatever embedding you're using
+
+@pytest.fixture(scope="module")
+def embedding_model():
+    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+
+@pytest.fixture(params=[ChromaVectorStore, FAISSVectorStore])
+def vector_store(request, embedding_model, tmp_path):
+    store = request.param(embedding_model, persist_directory=str(tmp_path))
+    yield store
+    # Clean up (if necessary)
+    if hasattr(store, 'vectorstore'):
+        store.vectorstore.delete_collection()
+
+@pytest.fixture
+def sample_documents():
+    return [
+        Document(page_content="This is a test document about AI."),
+        Document(page_content="Another document discussing machine learning."),
+        Document(page_content="A third document about natural language processing.")
+    ]
+
+def test_create_and_search(vector_store, sample_documents):
+    # Create vector store
+    vector_store.create_vectorstore(sample_documents)
+
+    # Perform a search
+    results = vector_store.similarity_search("AI and machine learning")
+
+    assert len(results) > 0
+    assert any("AI" in doc.page_content for doc in results)
+    assert any("machine learning" in doc.page_content for doc in results)
+
+def test_save_and_load(vector_store, sample_documents, tmp_path):
+    # Create and save vector store
+    vector_store.create_vectorstore(sample_documents)
+    vector_store.save()
+
+    # Load the vector store
+    loaded_store = type(vector_store)(vector_store.embeddings, persist_directory=str(tmp_path))
+    loaded_store.load_existing_vectorstore()
+
+    # Perform a search on the loaded store
+    results = loaded_store.similarity_search("natural language processing")
+
+    assert len(results) > 0
+    assert any("natural language processing" in doc.page_content for doc in results)
+
+def test_update_vectorstore(vector_store, sample_documents):
+    # Create initial vector store
+    vector_store.create_vectorstore(sample_documents)
+
+    # Add a new document
+    new_doc = Document(page_content="A new document about deep learning.")
+    vector_store.vectorstore.add_documents([new_doc])
+
+    # Search for the new content
+    results = vector_store.similarity_search("deep learning")
+
+    assert len(results) > 0
+    assert any("deep learning" in doc.page_content for doc in results)
+
+@pytest.mark.parametrize("query,expected_content", [
+    ("AI", "AI"),
+    ("machine learning", "machine learning"),
+    ("natural language processing", "natural language processing")
+])
+def test_search_accuracy(vector_store, sample_documents, query, expected_content):
+    vector_store.create_vectorstore(sample_documents)
+    results = vector_store.similarity_search(query)
+    assert any(expected_content in doc.page_content for doc in results)
+
+# def test_database_integration(vector_store, sample_documents):
+#     # This test assumes your vector store interacts with the database in some way
+#     # You may need to adjust this based on your actual implementation
+#     vector_store.create_vectorstore(sample_documents)
+
+#     # Here, you might add some assertions about how the vector store interacts with the database
+#     # For example, if you're storing metadata about the documents in the database:
+#     for doc in sample_documents:
+#         result = db.session.query(YourDocumentModel).filter_by(content=doc.page_content).first()
+#         assert result is not None
+
+# Add more integration tests as needed
tests/vector_store_handler/test_vectorstores.py
ADDED
@@ -0,0 +1,88 @@
+import unittest
+from unittest.mock import MagicMock, patch
+# from langchain.embeddings import OpenAIEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings
+# from langchain.schema import Document
+from langchain_core.documents import Document
+
+# Update the import to reflect your project structure
+from rag_app.vector_store_handler.vectorstores import BaseVectorStore, ChromaVectorStore, FAISSVectorStore
+
+class TestBaseVectorStore(unittest.TestCase):
+    def setUp(self):
+        self.embedding_model = MagicMock(spec=HuggingFaceEmbeddings)
+        self.base_store = BaseVectorStore(self.embedding_model, "test_dir")
+
+    def test_init(self):
+        self.assertEqual(self.base_store.persist_directory, "test_dir")
+        self.assertEqual(self.base_store.embeddings, self.embedding_model)
+        self.assertIsNone(self.base_store.vectorstore)
+
+    @patch('rag_app.vector_store_handler.vectorstores.TextLoader')
+    @patch('rag_app.vector_store_handler.vectorstores.CharacterTextSplitter')
+    def test_load_and_process_documents(self, mock_splitter, mock_loader):
+        mock_loader.return_value.load.return_value = ["doc1", "doc2"]
+        mock_splitter.return_value.split_documents.return_value = ["split1", "split2"]
+
+        result = self.base_store.load_and_process_documents("test.txt")
+
+        mock_loader.assert_called_once_with("test.txt")
+        mock_splitter.assert_called_once_with(chunk_size=1000, chunk_overlap=0)
+        self.assertEqual(result, ["split1", "split2"])
+
+    def test_similarity_search_not_initialized(self):
+        with self.assertRaises(ValueError):
+            self.base_store.similarity_search("query")
+
+class TestChromaVectorStore(unittest.TestCase):
+    def setUp(self):
+        self.embedding_model = MagicMock(spec=HuggingFaceEmbeddings)
+        self.chroma_store = ChromaVectorStore(self.embedding_model, "test_dir")
+
+    @patch('rag_app.vector_store_handler.vectorstores.Chroma')
+    def test_create_vectorstore(self, mock_chroma):
+        texts = [Document(page_content="test")]
+        self.chroma_store.create_vectorstore(texts)
+        mock_chroma.from_documents.assert_called_once_with(
+            texts,
+            self.embedding_model,
+            persist_directory="test_dir"
+        )
+
+    @patch('rag_app.vector_store_handler.vectorstores.Chroma')
+    def test_load_existing_vectorstore(self, mock_chroma):
+        self.chroma_store.load_existing_vectorstore()
+        mock_chroma.assert_called_once_with(
+            persist_directory="test_dir",
+            embedding_function=self.embedding_model
+        )
+
+    def test_save(self):
+        self.chroma_store.vectorstore = MagicMock()
+        self.chroma_store.save()
+        self.chroma_store.vectorstore.persist.assert_called_once()
+
+class TestFAISSVectorStore(unittest.TestCase):
+    def setUp(self):
+        self.embedding_model = MagicMock(spec=HuggingFaceEmbeddings)
+        self.faiss_store = FAISSVectorStore(self.embedding_model, "test_dir")
+
+    @patch('rag_app.vector_store_handler.vectorstores.FAISS')
+    def test_create_vectorstore(self, mock_faiss):
+        texts = [Document(page_content="test")]
+        self.faiss_store.create_vectorstore(texts)
+        mock_faiss.from_documents.assert_called_once_with(texts, self.embedding_model)
+
+    @patch('rag_app.vector_store_handler.vectorstores.FAISS')
+    def test_load_existing_vectorstore(self, mock_faiss):
+        self.faiss_store.load_existing_vectorstore()
+        mock_faiss.load_local.assert_called_once_with("test_dir", self.embedding_model)
+
+    @patch('rag_app.vector_store_handler.vectorstores.FAISS')
+    def test_save(self, mock_faiss):
+        self.faiss_store.vectorstore = MagicMock()
+        self.faiss_store.save()
+        self.faiss_store.vectorstore.save_local.assert_called_once_with("test_dir")
+
+if __name__ == '__main__':
+    unittest.main()
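The unit tests above patch Chroma, FAISS, TextLoader and CharacterTextSplitter at the points where vectorstores.py imports them, so they should run without downloading models or touching the network; the integration tests, by contrast, instantiate a real HuggingFaceEmbeddings with the configured EMBEDDING_MODEL and therefore need that model to be available locally or downloadable.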