isayahc committed on
Commit 3a85771 · unverified · 1 Parent(s): 5ffaf21

attempt to refactor module

config.py CHANGED
@@ -2,6 +2,7 @@ import os
 from dotenv import load_dotenv
 from rag_app.database.db_handler import DataBaseHandler
 from langchain_huggingface import HuggingFaceEndpoint
+from langchain_huggingface import HuggingFaceHubEmbeddings
 
 load_dotenv()
 
@@ -11,8 +12,12 @@ EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 SEVEN_B_LLM_MODEL = os.getenv("SEVEN_B_LLM_MODEL")
 BERT_MODEL = os.getenv("BERT_MODEL")
 FAISS_INDEX_PATH = os.getenv("FAISS_INDEX_PATH")
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
 
+
+embeddings = HuggingFaceHubEmbeddings(repo_id=EMBEDDING_MODEL)
+
 db = DataBaseHandler()
 
 db.create_all_tables()
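Because config.py now builds the HuggingFaceHubEmbeddings client at import time, every module that imports config needs HUGGINGFACEHUB_API_TOKEN set in the environment before the import runs. A minimal usage sketch of the shared objects (hypothetical, not part of this commit):

    # Hypothetical consumer of config.py; assumes HUGGINGFACEHUB_API_TOKEN
    # is exported before this import executes.
    from config import embeddings, db

    # embed_query is the standard LangChain Embeddings entry point
    vector = embeddings.embed_query("student insurance")
    print(len(vector))  # dimensionality of the configured embedding model
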
rag_app/knowledge_base/build_vector_store.py DELETED
@@ -1,57 +0,0 @@
-# vectorization functions
-from langchain_community.vectorstores import FAISS
-from langchain_community.vectorstores import Chroma
-from langchain_community.retrievers import BM25Retriever
-
-from rag_app.knowledge_base.create_embedding import create_embeddings
-from rag_app.utils.generate_summary import generate_description, generate_keywords
-
-import time
-import os
-
-from config import FAISS_INDEX_PATH
-
-def build_vector_store(
-    docs: list,
-    embedding_model: str,
-    new_db:bool=False,
-    chunk_size:int=500,
-    chunk_overlap:int=50,
-):
-    """
-
-    """
-
-    embeddings,chunks = create_embeddings(
-        docs,
-        chunk_size,
-        chunk_overlap,
-        embedding_model
-    )
-
-    #load chunks into vector store
-    print(f'Loading chunks into faiss vector store ...')
-
-    st = time.time()
-    if new_db:
-        db_faiss = FAISS.from_documents(chunks, embeddings)
-        bm25_retriever = BM25Retriever.from_documents(chunks)
-    else:
-        db_faiss = FAISS.add_documents(chunks, embeddings)
-        bm25_retriever = BM25Retriever.add_documents(chunks)
-
-    db_faiss.save_local(FAISS_INDEX_PATH)
-    et = time.time() - st
-    print(f'Time taken: {et} seconds.')
-
-    print(f'Loading chunks into chroma vector store ...')
-
-    st = time.time()
-    persist_directory='./vectorstore/chroma-insurance-agent-1500'
-    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
-    et = time.time() - st
-
-    print(f'Time taken: {et} seconds.')
-    result = f"built vectore store at {FAISS_INDEX_PATH}"
-    return result
-
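One wrinkle worth noting before this logic reappears in utils.py below: the else branch calls FAISS.add_documents and BM25Retriever.add_documents on the classes themselves, but in LangChain add_documents is an instance method (and BM25Retriever has no incremental add at all; it must be rebuilt with from_documents). A hedged sketch of what the non-new_db path presumably intends, assuming an index already exists at FAISS_INDEX_PATH:

    # Load the existing index, then extend it in place (instance method).
    db_faiss = FAISS.load_local(
        FAISS_INDEX_PATH,
        embeddings,
        allow_dangerous_deserialization=True,  # required by recent langchain_community
    )
    db_faiss.add_documents(chunks)
    db_faiss.save_local(FAISS_INDEX_PATH)
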
rag_app/knowledge_base/get_db_retriever.py DELETED
@@ -1,61 +0,0 @@
-# retriever and qa_chain function
-
-# HF libraries
-from langchain.llms import HuggingFaceHub
-from langchain_huggingface import HuggingFaceHubEmbeddings
-# vectorestore
-from langchain_community.vectorstores import FAISS
-# retrieval chain
-from langchain.chains import RetrievalQA
-# prompt template
-from langchain.prompts import PromptTemplate
-from langchain.memory import ConversationBufferMemory
-from config import EMBEDDING_MODEL, VECTOR_DATABASE_LOCATION
-
-
-def get_db_retriever():
-    """
-    Creates and returns a retriever object based on a FAISS vector database.
-
-    This function initializes an embedding model and loads a pre-existing FAISS
-    vector database from a local location. It then creates a retriever from this
-    database.
-
-    Returns:
-    --------
-    retriever : langchain.vectorstores.FAISS.VectorStoreRetriever
-        A retriever object that can be used to fetch relevant documents from the
-        vector database.
-
-    Global Variables Used:
-    ----------------------
-    EMBEDDING_MODEL : str
-        The identifier for the Hugging Face Hub embedding model to be used.
-    VECTOR_DATABASE_LOCATION : str
-        The local path where the FAISS vector database is stored.
-
-    Dependencies:
-    -------------
-    - langchain_huggingface.HuggingFaceHubEmbeddings
-    - langchain_community.vectorstores.FAISS
-
-    Note:
-    -----
-    This function assumes that a FAISS vector database has already been created
-    and saved at the location specified by VECTOR_DATABASE_LOCATION.
-    """
-
-    # Initialize the embedding model
-    embeddings = HuggingFaceHubEmbeddings(repo_id=EMBEDDING_MODEL)
-
-    # Load the FAISS vector database from the local storage
-    db = FAISS.load_local(
-        VECTOR_DATABASE_LOCATION,
-        embeddings,
-    )
-
-    # Create and return a retriever from the loaded database
-    retriever = db.as_retriever()
-
-    return retriever
-
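The deleted helper reads VECTOR_DATABASE_LOCATION from config, a name that does not appear in the config.py hunks above, so any surviving caller would need an equivalent setting. Recent langchain_community releases also make FAISS.load_local refuse pickled indexes by default; a hedged sketch of the loading call under those versions:

    # allow_dangerous_deserialization is an explicit opt-in for pickle loading
    db = FAISS.load_local(
        VECTOR_DATABASE_LOCATION,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    retriever = db.as_retriever()
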
rag_app/knowledge_base/hybrid_search.py DELETED
@@ -1,63 +0,0 @@
-from pathlib import Path
-from langchain_community.vectorstores import FAISS
-from dotenv import load_dotenv
-import os
-from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
-from langchain.retrievers import EnsembleRetriever
-from langchain_community.retrievers import BM25Retriever
-
-
-def get_hybrid_search_results(query:str,
-                              path_to_db:str,
-                              embedding_model:str,
-                              hf_api_key:str,
-                              num_docs:int=5) -> list:
-    """ Uses an ensemble retriever of BM25 and FAISS to return k num documents
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns
-        List of documents
-
-    """
-
-    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
-                                                   model_name=embedding_model)
-    # Load the vectorstore database
-    db = FAISS.load_local(folder_path=path_to_db,
-                          embeddings=embeddings,
-                          allow_dangerous_deserialization=True)
-
-    all_docs = db.similarity_search("", k=db.index.ntotal)
-
-    bm25_retriever = BM25Retriever.from_documents(all_docs)
-    bm25_retriever.k = num_docs  # How many results you want
-
-    faiss_retriever = db.as_retriever(search_kwargs={'k': num_docs})
-
-    ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever],
-                                           weights=[0.5,0.5])
-
-    results = ensemble_retriever.invoke(input=query)
-    return results
-
-
-if __name__ == "__main__":
-    query = "Haustierversicherung"
-    HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
-
-    path_to_vector_db = Path("..")/'vectorstore/faiss-insurance-agent-500'
-
-    results = get_hybrid_search_results(query=query,
-                                        path_to_db=path_to_vector_db,
-                                        embedding_model=EMBEDDING_MODEL,
-                                        hf_api_key=HUGGINGFACEHUB_API_TOKEN)
-
-    for doc in results:
-        print(doc)
-        print()
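Feeding an empty query through similarity_search just to enumerate every document costs one needless embedding call. The FAISS wrapper keeps its documents in an in-memory docstore, so they can be pulled out directly; a hedged alternative that relies on the docstore's _dict attribute, which is not public API:

    # Enumerate all stored documents without embedding an empty string
    all_docs = list(db.docstore._dict.values())
    bm25_retriever = BM25Retriever.from_documents(all_docs)
    bm25_retriever.k = num_docs
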
rag_app/knowledge_base/reranking.py DELETED
@@ -1,137 +0,0 @@
-from pathlib import Path
-from langchain_community.vectorstores import FAISS
-from dotenv import load_dotenv
-import os
-from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
-import requests
-from langchain_community.vectorstores import Chroma
-
-
-load_dotenv()
-
-
-def get_reranked_docs_faiss(
-    query:str,
-    path_to_db:str,
-    embedding_model:str,
-    hf_api_key:str,
-    num_docs:int=5
-) -> list:
-    """ Re-ranks the similarity search results and returns top-k highest ranked docs
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns: A list of documents with the highest rank
-    """
-    assert num_docs <= 10, "num_docs should be less than similarity search results"
-
-    embeddings = HuggingFaceInferenceAPIEmbeddings(
-        api_key=hf_api_key,
-        model_name=embedding_model
-    )
-
-    # Load the vectorstore database
-    db = FAISS.load_local(
-        folder_path=path_to_db,
-        embeddings=embeddings,
-        allow_dangerous_deserialization=True
-    )
-
-    # Get 10 documents based on similarity search
-    docs = db.similarity_search(query=query, k=10)
-
-    # Add the page_content, description and title together
-    passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
-                for doc in docs]
-
-    # Prepare the payload
-    inputs = [{"text": query, "text_pair": passage} for passage in passages]
-
-    API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
-    headers = {"Authorization": f"Bearer {hf_api_key}"}
-
-    response = requests.post(API_URL, headers=headers, json=inputs)
-    scores = response.json()
-
-    try:
-        relevance_scores = [item[1]['score'] for item in scores]
-    except ValueError as e:
-        print('Could not get the relevance_scores -> something might be wrong with the json output')
-        return
-
-    if relevance_scores:
-        ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
-        top_k_results = ranked_results[:num_docs]
-        return [doc for doc, _, _ in top_k_results]
-
-
-
-def get_reranked_docs_chroma(query:str,
-                             path_to_db:str,
-                             embedding_model:str,
-                             hf_api_key:str,
-                             reranking_hf_url:str = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2",
-                             num_docs:int=5) -> list:
-    """ Re-ranks the similarity search results and returns top-k highest ranked docs
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns: A list of documents with the highest rank
-    """
-    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
-                                                   model_name=embedding_model)
-    # Load the vectorstore database
-    db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)
-
-    # Get k documents based on similarity search
-    sim_docs = db.similarity_search(query=query, k=10)
-
-    passages = [doc.page_content for doc in sim_docs]
-
-    # Prepare the payload
-    payload = {"inputs":
-               {"source_sentence": query,
-                "sentences": passages}}
-
-    headers = {"Authorization": f"Bearer {hf_api_key}"}
-
-    response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
-    print(f'{response = }')
-    if response.status_code != 200:
-        print('Something went wrong with the response')
-        return
-
-    similarity_scores = response.json()
-    ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
-    top_k_results = ranked_results[:num_docs]
-    return [doc for doc, _, _ in top_k_results]
-
-
-
-if __name__ == "__main__":
-
-
-    HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
-
-    project_dir = Path().cwd().parent
-    path_to_vector_db = str(project_dir/'vectorstore/chroma-zurich-mpnet-1500')
-    assert Path(path_to_vector_db).exists(), "Cannot access path_to_vector_db "
-
-    query = "I'm looking for student insurance"
-
-    re_ranked_docs = get_reranked_docs_chroma(query=query,
-                                              path_to_db= path_to_vector_db,
-                                              embedding_model=EMBEDDING_MODEL,
-                                              hf_api_key=HUGGINGFACEHUB_API_TOKEN)
-
-
-    print(f"{re_ranked_docs=}")
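A caveat on the error handling in get_reranked_docs_faiss: response.json() raises json.JSONDecodeError (a ValueError subclass), so the except clause only covers unparseable bodies. When the Inference API returns a well-formed error object instead of scores, the item[1]['score'] indexing fails with TypeError or KeyError. A hedged, more defensive sketch of the parsing step:

    response = requests.post(API_URL, headers=headers, json=inputs)
    response.raise_for_status()  # surface HTTP-level failures early
    scores = response.json()
    try:
        relevance_scores = [item[1]['score'] for item in scores]
    except (TypeError, KeyError, IndexError) as e:
        print(f'Unexpected reranker payload {scores!r}: {e}')
        return None
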
rag_app/knowledge_base/utils.py CHANGED
@@ -11,6 +11,28 @@ from langchain_core.documents import Document
 from config import EMBEDDING_MODEL
 from langchain.retrievers import VectorStoreRetriever
 from langchain_core.vectorstores import VectorStoreRetriever
+# vectorization functions
+from langchain_community.vectorstores import FAISS
+from langchain_community.vectorstores import Chroma
+from langchain_community.retrievers import BM25Retriever
+
+from rag_app.knowledge_base.utils import create_embeddings
+from rag_app.utils.generate_summary import generate_description, generate_keywords
+
+import time
+import os
+
+from config import FAISS_INDEX_PATH
+
+from pathlib import Path
+from langchain_community.vectorstores import FAISS
+from dotenv import load_dotenv
+import os
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+import requests
+from langchain_community.vectorstores import Chroma
+
+
 
 def create_embeddings(
     docs: list[Document],
@@ -86,4 +108,152 @@ def generate_document_summaries(
         {"summary":summary}
     )
 
-    return new_docs
+    return new_docs
+
+
+def build_vector_store(
+    docs: list,
+    embedding_model: str,
+    new_db:bool=False,
+    chunk_size:int=500,
+    chunk_overlap:int=50,
+):
+    """
+
+    """
+
+    embeddings,chunks = create_embeddings(
+        docs,
+        chunk_size,
+        chunk_overlap,
+        embedding_model
+    )
+
+    #load chunks into vector store
+    print(f'Loading chunks into faiss vector store ...')
+
+    st = time.time()
+    if new_db:
+        db_faiss = FAISS.from_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.from_documents(chunks)
+    else:
+        db_faiss = FAISS.add_documents(chunks, embeddings)
+        bm25_retriever = BM25Retriever.add_documents(chunks)
+
+    db_faiss.save_local(FAISS_INDEX_PATH)
+    et = time.time() - st
+    print(f'Time taken: {et} seconds.')
+
+    print(f'Loading chunks into chroma vector store ...')
+
+    st = time.time()
+    persist_directory='./vectorstore/chroma-insurance-agent-1500'
+    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
+    et = time.time() - st
+
+    print(f'Time taken: {et} seconds.')
+    result = f"built vectore store at {FAISS_INDEX_PATH}"
+    return result
+
+def get_reranked_docs_faiss(
+    query:str,
+    path_to_db:str,
+    embedding_model:str,
+    hf_api_key:str,
+    num_docs:int=5
+) -> list:
+    """ Re-ranks the similarity search results and returns top-k highest ranked docs
+
+    Args:
+        query (str): The search query
+        path_to_db (str): Path to the vectorstore database
+        embedding_model (str): Embedding model used in the vector store
+        num_docs (int): Number of documents to return
+
+    Returns: A list of documents with the highest rank
+    """
+    assert num_docs <= 10, "num_docs should be less than similarity search results"
+
+    embeddings = HuggingFaceInferenceAPIEmbeddings(
+        api_key=hf_api_key,
+        model_name=embedding_model
+    )
+
+    # Load the vectorstore database
+    db = FAISS.load_local(
+        folder_path=path_to_db,
+        embeddings=embeddings,
+        allow_dangerous_deserialization=True
+    )
+
+    # Get 10 documents based on similarity search
+    docs = db.similarity_search(query=query, k=10)
+
+    # Add the page_content, description and title together
+    passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
+                for doc in docs]
+
+    # Prepare the payload
+    inputs = [{"text": query, "text_pair": passage} for passage in passages]
+
+    API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
+    headers = {"Authorization": f"Bearer {hf_api_key}"}
+
+    response = requests.post(API_URL, headers=headers, json=inputs)
+    scores = response.json()
+
+    try:
+        relevance_scores = [item[1]['score'] for item in scores]
+    except ValueError as e:
+        print('Could not get the relevance_scores -> something might be wrong with the json output')
+        return
+
+    if relevance_scores:
+        ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
+        top_k_results = ranked_results[:num_docs]
+        return [doc for doc, _, _ in top_k_results]
+
+
+def get_reranked_docs_chroma(query:str,
+                             path_to_db:str,
+                             embedding_model:str,
+                             hf_api_key:str,
+                             reranking_hf_url:str = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2",
+                             num_docs:int=5) -> list:
+    """ Re-ranks the similarity search results and returns top-k highest ranked docs
+
+    Args:
+        query (str): The search query
+        path_to_db (str): Path to the vectorstore database
+        embedding_model (str): Embedding model used in the vector store
+        num_docs (int): Number of documents to return
+
+    Returns: A list of documents with the highest rank
+    """
+    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
                                                    model_name=embedding_model)
+    # Load the vectorstore database
+    db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)
+
+    # Get k documents based on similarity search
+    sim_docs = db.similarity_search(query=query, k=10)
+
+    passages = [doc.page_content for doc in sim_docs]
+
+    # Prepare the payload
+    payload = {"inputs":
+               {"source_sentence": query,
+                "sentences": passages}}
+
+    headers = {"Authorization": f"Bearer {hf_api_key}"}
+
+    response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
+    print(f'{response = }')
+    if response.status_code != 200:
+        print('Something went wrong with the response')
+        return
+
+    similarity_scores = response.json()
+    ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
+    top_k_results = ranked_results[:num_docs]
+    return [doc for doc, _, _ in top_k_results]
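As merged, the utils.py import block now pulls create_embeddings from rag_app.knowledge_base.utils, i.e. the module imports itself for a function it defines a few lines later, and FAISS, Chroma, and os are each imported twice. A hedged sketch of a deduplicated header for the added block, with the self-import dropped (the pre-existing imports above it unchanged):

    import os
    import time
    import requests
    from pathlib import Path
    from dotenv import load_dotenv
    from langchain_community.vectorstores import FAISS, Chroma
    from langchain_community.retrievers import BM25Retriever
    from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
    from rag_app.utils.generate_summary import generate_description, generate_keywords
    from config import EMBEDDING_MODEL, FAISS_INDEX_PATH
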
rag_app/vector_store_handler/vectorstores.py CHANGED
@@ -10,7 +10,10 @@ from langchain_community.embeddings.sentence_transformer import (
 )
 import time
 from langchain_core.documents import Document
-from config import EMBEDDING_MODEL
+from config import EMBEDDING_MODEL, HUGGINGFACEHUB_API_TOKEN
+from langchain.retrievers import EnsembleRetriever
+from langchain_community.retrievers import BM25Retriever
+import requests
 
 class BaseVectorStore(ABC):
     """
@@ -48,7 +51,10 @@ class BaseVectorStore(ABC):
         documents = loader.load()
         text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         return text_splitter.split_documents(documents)
-
+
+    def get_hybrid_search_result(self,query:str):
+        pass
+
     @abstractmethod
     def create_vectorstore(self, texts):
         """
@@ -89,6 +95,7 @@ class BaseVectorStore(ABC):
         Save the current state of the vector store.
         """
         pass
+
 
 class ChromaVectorStore(BaseVectorStore):
     """
@@ -133,6 +140,38 @@ class ChromaVectorStore(BaseVectorStore):
         if not self.vectorstore:
             raise ValueError("Vector store not initialized. Nothing to save.")
         self.vectorstore.persist()
+
+    def get_reranked_docs(
+        self,
+        query:str,
+        num_docs:int=5
+    ):
+
+        # Get 10 documents based on similarity search
+        docs = self.vectorstore.similarity_search(query=query, k=10)
+
+        # Add the page_content, description and title together
+        passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
+                    for doc in docs]
+        # Prepare the payload
+        inputs = [{"text": query, "text_pair": passage} for passage in passages]
+
+        API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
+        headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+        response = requests.post(API_URL, headers=headers, json=inputs)
+        scores = response.json()
+
+        try:
+            relevance_scores = [item[1]['score'] for item in scores]
+        except ValueError as e:
+            print('Could not get the relevance_scores -> something might be wrong with the json output')
+            return
+
+        if relevance_scores:
+            ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
+            top_k_results = ranked_results[:num_docs]
+            return [doc for doc, _, _ in top_k_results]
 
 class FAISSVectorStore(BaseVectorStore):
     """
@@ -170,6 +209,67 @@ class FAISSVectorStore(BaseVectorStore):
         if self.vectorstore is None:
             raise ValueError("Vector store not initialized. Nothing to save.")
         self.vectorstore.save_local(self.persist_directory)
+
+    def get_hybrid_search_result(
+        self,
+        query:str,
+        num_docs:int=5
+    )-> list[Document]:
+        """ Uses an ensemble retriever of BM25 and FAISS to return k num documents
+
+        Args:
+            query (str): The search query
+            path_to_db (str): Path to the vectorstore database
+            embedding_model (str): Embedding model used in the vector store
+            num_docs (int): Number of documents to return
+
+        Returns
+            List of documents
+
+        """
+        all_docs = self.vectorstore.similarity_search("", k=self.vectorstore.index.ntotal)
+        bm25_retriever = BM25Retriever.from_documents(all_docs)
+        bm25_retriever.k = num_docs  # How many results you want
+
+        faiss_retriever = self.vectorstore.as_retriever(search_kwargs={'k': num_docs})
+
+        ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever],
+                                               weights=[0.5,0.5])
+
+        results = ensemble_retriever.invoke(input=query)
+        return results
+
+    def get_reranked_docs(
+        self,
+        query:str,
+        num_docs:int=5
+    ):
+
+        # Get 10 documents based on similarity search
+        docs = self.vectorstore.similarity_search(query=query, k=10)
+
+        # Add the page_content, description and title together
+        passages = [doc.page_content + "\n" + doc.metadata.get('title', "") +"\n"+ doc.metadata.get('description', "")
+                    for doc in docs]
+        # Prepare the payload
+        inputs = [{"text": query, "text_pair": passage} for passage in passages]
+
+        API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
+        headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+
+        response = requests.post(API_URL, headers=headers, json=inputs)
+        scores = response.json()
+
+        try:
+            relevance_scores = [item[1]['score'] for item in scores]
+        except ValueError as e:
+            print('Could not get the relevance_scores -> something might be wrong with the json output')
+            return
+
+        if relevance_scores:
+            ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
+            top_k_results = ranked_results[:num_docs]
+            return [doc for doc, _, _ in top_k_results]
 
 # Usage example:
 def main():
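ChromaVectorStore.get_reranked_docs and FAISSVectorStore.get_reranked_docs are identical and touch only self.vectorstore, so a natural follow-up to this refactor would define the method once on the base class and let both subclasses inherit it; a hedged sketch:

    class BaseVectorStore(ABC):
        # ... existing abstract methods unchanged ...

        def get_reranked_docs(self, query: str, num_docs: int = 5):
            # body identical to the two copies above, defined once and
            # inherited by ChromaVectorStore and FAISSVectorStore
            docs = self.vectorstore.similarity_search(query=query, k=10)
            ...

Note also that the docstring of FAISSVectorStore.get_hybrid_search_result still documents path_to_db and embedding_model, parameters the method no longer takes now that it reads everything from self.vectorstore.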