attempt to refactor module
Files changed:
- config.py (+5 -0)
- rag_app/knowledge_base/build_vector_store.py (+0 -57, deleted)
- rag_app/knowledge_base/get_db_retriever.py (+0 -61, deleted)
- rag_app/knowledge_base/hybrid_search.py (+0 -63, deleted)
- rag_app/knowledge_base/reranking.py (+0 -137, deleted)
- rag_app/knowledge_base/utils.py (+171 -1)
- rag_app/vector_store_handler/vectorstores.py (+102 -2)
config.py
CHANGED

@@ -2,6 +2,7 @@ import os
 from dotenv import load_dotenv
 from rag_app.database.db_handler import DataBaseHandler
 from langchain_huggingface import HuggingFaceEndpoint
+from langchain_huggingface import HuggingFaceHubEmbeddings
 
 load_dotenv()
 
@@ -11,8 +12,12 @@ EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
 BERT_MODEL = os.getenv("BERT_MODEL")
 FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
 
+
+embeddings = HuggingFaceHubEmbeddings(repo_id=EMBEDDING_MODEL)
+
 db = DataBaseHandler()
 
 db.create_all_tables()
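
Review note: this change moves embedding initialization into config.py, so a single HuggingFaceHubEmbeddings client is created at import time and shared by every consumer. A minimal usage sketch of the new export (the query string is illustrative only):

# Sketch: reusing the shared embeddings client added to config.py in this
# commit. embed_query is the standard LangChain Embeddings method.
from config import embeddings

vector = embeddings.embed_query("Haustierversicherung")
print(len(vector))  # dimensionality of the embedding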
rag_app/knowledge_base/build_vector_store.py
DELETED

@@ -1,57 +0,0 @@
-# vectorization functions
-from langchain_community.vectorstores import FAISS
-from langchain_community.vectorstores import Chroma
-from langchain_community.retrievers import BM25Retriever
-
-from rag_app.knowledge_base.create_embedding import create_embeddings
-from rag_app.utils.generate_summary import generate_description, generate_keywords
-
-import time
-import os
-
-from config import FAISS_INDEX_PATH
-
-def build_vector_store(
-    docs: list,
-    embedding_model: str,
-    new_db:bool=False,
-    chunk_size:int=500,
-    chunk_overlap:int=50,
-    ):
-    """
-
-    """
-
-    embeddings,chunks = create_embeddings(
-        docs,
-        chunk_size,
-        chunk_overlap,
-        embedding_model
-    )
-
-    #load chunks into vector store
-    print(f'Loading chunks into faiss vector store ...')
-
-    st = time.time()
-    if new_db:
-        db_faiss = FAISS.from_documents(chunks, embeddings)
-        bm25_retriever = BM25Retriever.from_documents(chunks)
-    else:
-        db_faiss = FAISS.add_documents(chunks, embeddings)
-        bm25_retriever = BM25Retriever.add_documents(chunks)
-
-    db_faiss.save_local(FAISS_INDEX_PATH)
-    et = time.time() - st
-    print(f'Time taken: {et} seconds.')
-
-    print(f'Loading chunks into chroma vector store ...')
-
-    st = time.time()
-    persist_directory='./vectorstore/chroma-insurance-agent-1500'
-    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
-    et = time.time() - st
-
-    print(f'Time taken: {et} seconds.')
-    result = f"built vectore store at {FAISS_INDEX_PATH}"
-    return result
-
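
Review note: the else branch above calls FAISS.add_documents and BM25Retriever.add_documents as classmethods; add_documents is an instance method on a FAISS store, and BM25Retriever has no add_documents at all, so the new_db=False path would raise. This body is moved verbatim into rag_app/knowledge_base/utils.py later in this diff, so the bug travels with it. A corrected sketch of that branch, written as a hypothetical helper (name and save behaviour are my assumptions, not part of the commit):

# Sketch: extend an existing FAISS index instead of calling classmethods.
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS

def extend_faiss_index(chunks, embeddings, index_path):
    """Load the persisted index, append the new chunks, and persist again."""
    db_faiss = FAISS.load_local(index_path, embeddings,
                                allow_dangerous_deserialization=True)
    db_faiss.add_documents(chunks)   # instance method: embeds and appends
    db_faiss.save_local(index_path)
    # BM25 has no incremental API here; rebuild it over the documents at hand
    bm25_retriever = BM25Retriever.from_documents(chunks)
    return db_faiss, bm25_retriever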
rag_app/knowledge_base/get_db_retriever.py
DELETED

@@ -1,61 +0,0 @@
-# retriever and qa_chain function
-
-# HF libraries
-from langchain.llms import HuggingFaceHub
-from langchain_huggingface import HuggingFaceHubEmbeddings
-# vectorestore
-from langchain_community.vectorstores import FAISS
-# retrieval chain
-from langchain.chains import RetrievalQA
-# prompt template
-from langchain.prompts import PromptTemplate
-from langchain.memory import ConversationBufferMemory
-from config import EMBEDDING_MODEL, VECTOR_DATABASE_LOCATION
-
-
-def get_db_retriever():
-    """
-    Creates and returns a retriever object based on a FAISS vector database.
-
-    This function initializes an embedding model and loads a pre-existing FAISS
-    vector database from a local location. It then creates a retriever from this
-    database.
-
-    Returns:
-    --------
-    retriever : langchain.vectorstores.FAISS.VectorStoreRetriever
-        A retriever object that can be used to fetch relevant documents from the
-        vector database.
-
-    Global Variables Used:
-    ----------------------
-    EMBEDDING_MODEL : str
-        The identifier for the Hugging Face Hub embedding model to be used.
-    VECTOR_DATABASE_LOCATION : str
-        The local path where the FAISS vector database is stored.
-
-    Dependencies:
-    -------------
-    - langchain_huggingface.HuggingFaceHubEmbeddings
-    - langchain_community.vectorstores.FAISS
-
-    Note:
-    -----
-    This function assumes that a FAISS vector database has already been created
-    and saved at the location specified by VECTOR_DATABASE_LOCATION.
-    """
-
-    # Initialize the embedding model
-    embeddings = HuggingFaceHubEmbeddings(repo_id=EMBEDDING_MODEL)
-
-    # Load the FAISS vector database from the local storage
-    db = FAISS.load_local(
-        VECTOR_DATABASE_LOCATION,
-        embeddings,
-    )
-
-    # Create and return a retriever from the loaded database
-    retriever = db.as_retriever()
-
-    return retriever
-
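
Review note: this helper is deleted without a direct replacement elsewhere in the diff. Two observations on the old code: most of its imports (HuggingFaceHub, RetrievalQA, PromptTemplate, ConversationBufferMemory) were unused, and FAISS.load_local was called without allow_dangerous_deserialization=True, which the other modules in this same commit pass. A sketch of an equivalent helper against the exports config.py now provides (FAISS_INDEX_PATH is used because VECTOR_DATABASE_LOCATION does not appear in the config diff above):

# Sketch: equivalent retriever loading after the refactor; assumes the
# embeddings and FAISS_INDEX_PATH exports this commit adds to config.py.
from langchain_community.vectorstores import FAISS

from config import FAISS_INDEX_PATH, embeddings

def get_db_retriever():
    """Load the persisted FAISS index and expose it as a retriever."""
    db = FAISS.load_local(
        FAISS_INDEX_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return db.as_retriever()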
rag_app/knowledge_base/hybrid_search.py
DELETED

@@ -1,63 +0,0 @@
-from pathlib import Path
-from langchain_community.vectorstores import FAISS
-from dotenv import load_dotenv
-import os
-from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
-from langchain.retrievers import EnsembleRetriever
-from langchain_community.retrievers import BM25Retriever
-
-
-def get_hybrid_search_results(query:str,
-                              path_to_db:str,
-                              embedding_model:str,
-                              hf_api_key:str,
-                              num_docs:int=5) -> list:
-    """ Uses an ensemble retriever of BM25 and FAISS to return k num documents
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns
-        List of documents
-
-    """
-
-    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
-                                                   model_name=embedding_model)
-    # Load the vectorstore database
-    db = FAISS.load_local(folder_path=path_to_db,
-                          embeddings=embeddings,
-                          allow_dangerous_deserialization=True)
-
-    all_docs = db.similarity_search("", k=db.index.ntotal)
-
-    bm25_retriever = BM25Retriever.from_documents(all_docs)
-    bm25_retriever.k = num_docs  # How many results you want
-
-    faiss_retriever = db.as_retriever(search_kwargs={'k': num_docs})
-
-    ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever],
-                                           weights=[0.5,0.5])
-
-    results = ensemble_retriever.invoke(input=query)
-    return results
-
-
-if __name__ == "__main__":
-    query = "Haustierversicherung"
-    HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
-
-    path_to_vector_db = Path("..")/'vectorstore/faiss-insurance-agent-500'
-
-    results = get_hybrid_search_results(query=query,
-                                        path_to_db=path_to_vector_db,
-                                        embedding_model=EMBEDDING_MODEL,
-                                        hf_api_key=HUGGINGFACEHUB_API_TOKEN)
-
-    for doc in results:
-        print(doc)
-        print()
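
Review note: this module's ensemble logic reappears as FAISSVectorStore.get_hybrid_search_result in rag_app/vector_store_handler/vectorstores.py below, including the similarity_search("", k=db.index.ntotal) trick for enumerating every document. That call embeds an empty query and runs a full index search just to list documents; reading the docstore directly avoids the round trip. A sketch (db is any loaded LangChain FAISS store; the helper name is hypothetical):

# Sketch: enumerate all stored Documents without an embedding call.
def all_documents(db):
    """Walk the FAISS docstore via the index -> docstore-id mapping."""
    return [
        db.docstore.search(db.index_to_docstore_id[i])
        for i in range(db.index.ntotal)
    ]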
rag_app/knowledge_base/reranking.py
DELETED

@@ -1,137 +0,0 @@
-from pathlib import Path
-from langchain_community.vectorstores import FAISS
-from dotenv import load_dotenv
-import os
-from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
-import requests
-from langchain_community.vectorstores import Chroma
-
-
-load_dotenv()
-
-
-def get_reranked_docs_faiss(
-    query:str,
-    path_to_db:str,
-    embedding_model:str,
-    hf_api_key:str,
-    num_docs:int=5
-    ) -> list:
-    """ Re-ranks the similarity search results and returns top-k highest ranked docs
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns: A list of documents with the highest rank
-    """
-    assert num_docs <= 10, "num_docs should be less than similarity search results"
-
-    embeddings = HuggingFaceInferenceAPIEmbeddings(
-        api_key=hf_api_key,
-        model_name=embedding_model
-    )
-
-    # Load the vectorstore database
-    db = FAISS.load_local(
-        folder_path=path_to_db,
-        embeddings=embeddings,
-        allow_dangerous_deserialization=True
-    )
-
-    # Get 10 documents based on similarity search
-    docs = db.similarity_search(query=query, k=10)
-
-    # Add the page_content, description and title together
-    passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
-                for doc in docs]
-
-    # Prepare the payload
-    inputs = [{"text": query, "text_pair": passage} for passage in passages]
-
-    API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
-    headers = {"Authorization": f"Bearer {hf_api_key}"}
-
-    response = requests.post(API_URL, headers=headers, json=inputs)
-    scores = response.json()
-
-    try:
-        relevance_scores = [item[1]['score'] for item in scores]
-    except ValueError as e:
-        print('Could not get the relevance_scores -> something might be wrong with the json output')
-        return
-
-    if relevance_scores:
-        ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
-        top_k_results = ranked_results[:num_docs]
-        return [doc for doc, _, _ in top_k_results]
-
-
-
-def get_reranked_docs_chroma(query:str,
-                             path_to_db:str,
-                             embedding_model:str,
-                             hf_api_key:str,
-                             reranking_hf_url:str = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2",
-                             num_docs:int=5) -> list:
-    """ Re-ranks the similarity search results and returns top-k highest ranked docs
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns: A list of documents with the highest rank
-    """
-    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
-                                                   model_name=embedding_model)
-    # Load the vectorstore database
-    db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)
-
-    # Get k documents based on similarity search
-    sim_docs = db.similarity_search(query=query, k=10)
-
-    passages = [doc.page_content for doc in sim_docs]
-
-    # Prepare the payload
-    payload = {"inputs":
-               {"source_sentence": query,
-                "sentences": passages}}
-
-    headers = {"Authorization": f"Bearer {hf_api_key}"}
-
-    response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
-    print(f'{response = }')
-    if response.status_code != 200:
-        print('Something went wrong with the response')
-        return
-
-    similarity_scores = response.json()
-    ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
-    top_k_results = ranked_results[:num_docs]
-    return [doc for doc, _, _ in top_k_results]
-
-
-
-if __name__ == "__main__":
-
-    HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
-
-    project_dir = Path().cwd().parent
-    path_to_vector_db = str(project_dir/'vectorstore/chroma-zurich-mpnet-1500')
-    assert Path(path_to_vector_db).exists(), "Cannot access path_to_vector_db "
-
-    query = "I'm looking for student insurance"
-
-    re_ranked_docs = get_reranked_docs_chroma(query=query,
-                                              path_to_db= path_to_vector_db,
-                                              embedding_model=EMBEDDING_MODEL,
-                                              hf_api_key=HUGGINGFACEHUB_API_TOKEN)
-
-
-    print(f"{re_ranked_docs=}")
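
Review note: in get_reranked_docs_faiss, the except ValueError guard cannot fire for the failures it is meant to catch; if the Inference API returns an error object such as {"error": ...}, the comprehension raises TypeError or KeyError, not ValueError. A safer parsing sketch (the output shape is assumed to be the text-classification format, a list of [{"label": ..., "score": ...}, ...] per passage, as the original indexing implies):

# Sketch: tolerate error payloads from the rerank endpoint.
def extract_scores(scores):
    """Return the second label's score per passage, or None on a bad payload."""
    try:
        return [item[1]["score"] for item in scores]
    except (TypeError, KeyError, IndexError):
        return None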
rag_app/knowledge_base/utils.py
CHANGED

@@ -11,6 +11,28 @@ from langchain_core.documents import Document
 from config import EMBEDDING_MODEL
 from langchain.retrievers import VectorStoreRetriever
 from langchain_core.vectorstores import VectorStoreRetriever
+# vectorization functions
+from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores import Chroma
+from langchain_community.retrievers import BM25Retriever
+
+from rag_app.knowledge_base.utils import create_embeddings
+from rag_app.utils.generate_summary import generate_description, generate_keywords
+
+import time
+import os
+
+from config import FAISS_INDEX_PATH
+
+from pathlib import Path
+from langchain_community.vectorstores import FAISS
+from dotenv import load_dotenv
+import os
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+import requests
+from langchain_community.vectorstores import Chroma
+
+
 
 def create_embeddings(
     docs: list[Document],
@@ -86,4 +108,152 @@ def generate_document_summaries(
             {"summary":summary}
         )
 
-    return new_docs
+    return new_docs
+
+
+def build_vector_store(
+    docs: list,
+    embedding_model: str,
+    new_db:bool=False,
+    chunk_size:int=500,
+    chunk_overlap:int=50,
+    ):
+    """
+
+    """
+
+    embeddings,chunks = create_embeddings(
+        docs,
+        chunk_size,
+        chunk_overlap,
+        embedding_model
+    )
+
+    #load chunks into vector store
+    print(f'Loading chunks into faiss vector store ...')
+
+    st = time.time()
+    if new_db:
+        db_faiss = FAISS.from_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.from_documents(chunks)
+    else:
+        db_faiss = FAISS.add_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.add_documents(chunks)
+
+    db_faiss.save_local(FAISS_INDEX_PATH)
+    et = time.time() - st
+    print(f'Time taken: {et} seconds.')
+
+    print(f'Loading chunks into chroma vector store ...')
+
+    st = time.time()
+    persist_directory='./vectorstore/chroma-insurance-agent-1500'
+    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
+    et = time.time() - st
+
+    print(f'Time taken: {et} seconds.')
+    result = f"built vectore store at {FAISS_INDEX_PATH}"
+    return result
+
+def get_reranked_docs_faiss(
+    query:str,
+    path_to_db:str,
+    embedding_model:str,
+    hf_api_key:str,
+    num_docs:int=5
+    ) -> list:
+    """ Re-ranks the similarity search results and returns top-k highest ranked docs
+
+    Args:
+        query (str): The search query
+        path_to_db (str): Path to the vectorstore database
+        embedding_model (str): Embedding model used in the vector store
+        num_docs (int): Number of documents to return
+
+    Returns: A list of documents with the highest rank
+    """
+    assert num_docs <= 10, "num_docs should be less than similarity search results"
+
+    embeddings = HuggingFaceInferenceAPIEmbeddings(
+        api_key=hf_api_key,
+        model_name=embedding_model
+    )
+
+    # Load the vectorstore database
+    db = FAISS.load_local(
+        folder_path=path_to_db,
+        embeddings=embeddings,
+        allow_dangerous_deserialization=True
+    )
+
+    # Get 10 documents based on similarity search
+    docs = db.similarity_search(query=query, k=10)
+
+    # Add the page_content, description and title together
+    passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
+                for doc in docs]
+
+    # Prepare the payload
+    inputs = [{"text": query, "text_pair": passage} for passage in passages]
+
+    API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
+    headers = {"Authorization": f"Bearer {hf_api_key}"}
+
+    response = requests.post(API_URL, headers=headers, json=inputs)
+    scores = response.json()
+
+    try:
+        relevance_scores = [item[1]['score'] for item in scores]
+    except ValueError as e:
+        print('Could not get the relevance_scores -> something might be wrong with the json output')
+        return
+
+    if relevance_scores:
+        ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
+        top_k_results = ranked_results[:num_docs]
+        return [doc for doc, _, _ in top_k_results]
+
+
+def get_reranked_docs_chroma(query:str,
+                             path_to_db:str,
+                             embedding_model:str,
+                             hf_api_key:str,
+                             reranking_hf_url:str = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2",
+                             num_docs:int=5) -> list:
+    """ Re-ranks the similarity search results and returns top-k highest ranked docs
+
+    Args:
+        query (str): The search query
+        path_to_db (str): Path to the vectorstore database
+        embedding_model (str): Embedding model used in the vector store
+        num_docs (int): Number of documents to return
+
+    Returns: A list of documents with the highest rank
+    """
+    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
+                                                   model_name=embedding_model)
+    # Load the vectorstore database
+    db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)
+
+    # Get k documents based on similarity search
+    sim_docs = db.similarity_search(query=query, k=10)
+
+    passages = [doc.page_content for doc in sim_docs]
+
+    # Prepare the payload
+    payload = {"inputs":
+               {"source_sentence": query,
+                "sentences": passages}}
+
+    headers = {"Authorization": f"Bearer {hf_api_key}"}
+
+    response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
+    print(f'{response = }')
+    if response.status_code != 200:
+        print('Something went wrong with the response')
+        return
+
+    similarity_scores = response.json()
+    ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
+    top_k_results = ranked_results[:num_docs]
+    return [doc for doc, _, _ in top_k_results]
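
Review note: as merged, utils.py imports create_embeddings from rag_app.knowledge_base.utils, i.e. from itself, even though the function is defined further down in the same file; that self-import should simply be dropped. The block also imports FAISS, Chroma, and os twice and binds VectorStoreRetriever from two different modules (the second import shadows the first). A deduplicated sketch of the import block under this commit's layout:

# Sketch: one import per name; no self-import of create_embeddings.
import os
import time
from pathlib import Path

import requests
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS, Chroma
from langchain_core.documents import Document
from langchain_core.vectorstores import VectorStoreRetriever

from config import EMBEDDING_MODEL, FAISS_INDEX_PATH
from rag_app.utils.generate_summary import generate_description, generate_keywords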
rag_app/vector_store_handler/vectorstores.py
CHANGED

@@ -10,7 +10,10 @@ from langchain_community.embeddings.sentence_transformer import (
 )
 import time
 from langchain_core.documents import Document
-from config import EMBEDDING_MODEL
+from config import EMBEDDING_MODEL, HUGGINGFACEHUB_API_TOKEN
+from langchain.retrievers import EnsembleRetriever
+from langchain_community.retrievers import BM25Retriever
+import requests
 
 class BaseVectorStore(ABC):
     """
@@ -48,7 +51,10 @@ class BaseVectorStore(ABC):
         documents = loader.load()
         text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         return text_splitter.split_documents(documents)
-
+
+    def get_hybrid_search_result(self,query:str):
+        pass
+
     @abstractmethod
     def create_vectorstore(self, texts):
         """
@@ -89,6 +95,7 @@ class BaseVectorStore(ABC):
         Save the current state of the vector store.
         """
         pass
+
 
 class ChromaVectorStore(BaseVectorStore):
     """
@@ -133,6 +140,38 @@ class ChromaVectorStore(BaseVectorStore):
         if not self.vectorstore:
             raise ValueError("Vector store not initialized. Nothing to save.")
         self.vectorstore.persist()
+
+    def get_reranked_docs(
+        self,
+        query:str,
+        num_docs:int=5
+        ):
+
+        # Get 10 documents based on similarity search
+        docs = self.vectorstore.similarity_search(query=query, k=10)
+
+        # Add the page_content, description and title together
+        passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
+                    for doc in docs]
+        # Prepare the payload
+        inputs = [{"text": query, "text_pair": passage} for passage in passages]
+
+        API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
+        headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+        response = requests.post(API_URL, headers=headers, json=inputs)
+        scores = response.json()
+
+        try:
+            relevance_scores = [item[1]['score'] for item in scores]
+        except ValueError as e:
+            print('Could not get the relevance_scores -> something might be wrong with the json output')
+            return
+
+        if relevance_scores:
+            ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
+            top_k_results = ranked_results[:num_docs]
+            return [doc for doc, _, _ in top_k_results]
 
 class FAISSVectorStore(BaseVectorStore):
     """
@@ -170,6 +209,67 @@ class FAISSVectorStore(BaseVectorStore):
         if self.vectorstore is None:
             raise ValueError("Vector store not initialized. Nothing to save.")
         self.vectorstore.save_local(self.persist_directory)
+
+    def get_hybrid_search_result(
+        self,
+        query:str,
+        num_docs:int=5
+        )-> list[Document]:
+        """ Uses an ensemble retriever of BM25 and FAISS to return k num documents
+
+        Args:
+            query (str): The search query
+            path_to_db (str): Path to the vectorstore database
+            embedding_model (str): Embedding model used in the vector store
+            num_docs (int): Number of documents to return
+
+        Returns
+            List of documents
+
+        """
+        all_docs = self.vectorstore.similarity_search("", k=self.vectorstore.index.ntotal)
+        bm25_retriever = BM25Retriever.from_documents(all_docs)
+        bm25_retriever.k = num_docs  # How many results you want
+
+        faiss_retriever = self.vectorstore.as_retriever(search_kwargs={'k': num_docs})
+
+        ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever],
+                                               weights=[0.5,0.5])
+
+        results = ensemble_retriever.invoke(input=query)
+        return results
+
+    def get_reranked_docs(
+        self,
+        query:str,
+        num_docs:int=5
+        ):
+
+        # Get 10 documents based on similarity search
+        docs = self.vectorstore.similarity_search(query=query, k=10)
+
+        # Add the page_content, description and title together
+        passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
+                    for doc in docs]
+        # Prepare the payload
+        inputs = [{"text": query, "text_pair": passage} for passage in passages]
+
+        API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
+        headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+        response = requests.post(API_URL, headers=headers, json=inputs)
+        scores = response.json()
+
+        try:
+            relevance_scores = [item[1]['score'] for item in scores]
+        except ValueError as e:
+            print('Could not get the relevance_scores -> something might be wrong with the json output')
+            return
+
+        if relevance_scores:
+            ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
+            top_k_results = ranked_results[:num_docs]
+            return [doc for doc, _, _ in top_k_results]
 
 # Usage example:
 def main():