import json

import faiss
import numpy as np
from langchain.schema import Document
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

from data_processing import embedding_model

# Cross-encoder used to re-rank the merged FAISS + BM25 candidates.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Optional NLTK-based preprocessing (currently unused; plain str.split() is used instead).
# import string
# import nltk
# nltk.download('punkt')
# nltk.download('punkt_tab')
# from nltk.tokenize import word_tokenize
#
# def preprocess(doc):
#     """Tokenize a document, lowercase, and drop punctuation tokens."""
#     return [word.lower() for word in word_tokenize(doc) if word not in string.punctuation]
def retrieve_documents_hybrid(query, q_dataset, top_k=5):
    """Hybrid retrieval: dense FAISS search plus sparse BM25, merged and cross-encoder re-ranked."""
    with open(f"data_local/{q_dataset}_chunked_docs.json", "r") as f:
        chunked_documents = json.load(f)  # All chunked documents for this dataset

    faiss_index_path = f"data_local/{q_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)

    # Tokenize documents for BM25
    tokenized_docs = [doc.split() for doc in chunked_documents]
    bm25 = BM25Okapi(tokenized_docs)

    # Dense retrieval: embed the query and search the FAISS index
    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    query_embedding = query_embedding.reshape(1, -1)
    _, faiss_indices = index.search(query_embedding, top_k)
    faiss_docs = [chunked_documents[i] for i in faiss_indices[0]]

    # Sparse retrieval: BM25 over whitespace tokens
    tokenized_query = query.split()  # or preprocess(query) with the NLTK variant above
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
    bm25_docs = [chunked_documents[i] for i in bm25_top_indices]

    # Alternative: fuse the raw BM25 and FAISS scores directly
    # (needs the FAISS distances, i.e. capture them from index.search instead of discarding them)
    # combined_results = set(bm25_top_indices).union(set(faiss_indices[0]))
    # combined_scores = rerank_docs_bm25faiss_scores(combined_results, bm25_scores, faiss_distances, faiss_indices)
    # reranked_docs = [chunked_documents[result[0]] for result in combined_scores[:top_k]]

    # Merge the FAISS and BM25 candidates with an order-preserving dedupe
    # (a plain set() would make the candidate pool nondeterministic),
    # then let the cross-encoder select the final top_k
    retrieved_docs = list(dict.fromkeys(faiss_docs + bm25_docs))
    reranked_docs = rerank_documents(query, retrieved_docs, top_k)
    return reranked_docs
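# Example usage (hypothetical dataset name; assumes data_local/<name>_chunked_docs.json
# and data_local/<name>_quantized.faiss were produced by the indexing step):
#   top_docs = retrieve_documents_hybrid("what is hybrid retrieval?", "example_dataset", top_k=5)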
def rerank_docs_bm25faiss_scores(combined_results_, bm25_scores_, faiss_distances_, faiss_indices_):
    """Rank candidate indices by the sum of their BM25 score and inverse FAISS distance."""
    final_results = []
    for idx in combined_results_:
        bm25_score = bm25_scores_[idx]
        # Inverse distance as a relevance proxy; 0.0 when FAISS did not return this index
        match = np.where(faiss_indices_[0] == idx)[0]
        faiss_score = 1.0 / (1.0 + faiss_distances_[0][match[0]]) if match.size else 0.0
        final_results.append((idx, bm25_score, faiss_score))
    # Sort by the naive sum of the two scores; they live on different scales,
    # so this ranking strategy is a rough heuristic
    final_results.sort(key=lambda x: (x[1] + x[2]), reverse=True)
    return final_results
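# The additive fusion above mixes incompatible scales (BM25 scores are unbounded,
# inverse distances live in (0, 1]). A scale-free alternative is Reciprocal Rank
# Fusion, which combines the two result lists by rank position alone. This is a
# sketch, not part of the original pipeline; k=60 is the conventional RRF constant.
def rerank_docs_rrf(bm25_top_indices_, faiss_indices_, k=60):
    """Fuse two rankings with RRF: score(d) = sum over rankings of 1 / (k + rank(d))."""
    rrf_scores = {}
    for ranking in (list(bm25_top_indices_), list(faiss_indices_[0])):
        for rank, idx in enumerate(ranking, start=1):
            rrf_scores[idx] = rrf_scores.get(idx, 0.0) + 1.0 / (k + rank)
    # Return (index, fused_score) pairs, best first
    return sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)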
# Legacy dense-only retrieval (superseded by retrieve_documents_hybrid):
# def retrieve_documents(query, top_k=5):
#     query_dataset = find_query_dataset(query)
#     # index, chunk_docs = load_data_from_faiss(query)
#     with open(f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
#         documents = json.load(f)  # Contains all documents for this dataset
#     faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
#     index = faiss.read_index(faiss_index_path)
#     query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
#     _, nearest_indices = index.search(query_embedding, top_k)
#     retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
#     return retrieved_docs
def remove_duplicate_documents(documents):
    """Drop Documents whose page_content has already been seen, preserving order."""
    unique_documents = []
    seen_documents = set()
    for doc in documents:
        doc_content = doc.page_content
        if doc_content not in seen_documents:
            unique_documents.append(doc)
            seen_documents.add(doc_content)
    return unique_documents
def find_query_dataset(query):
    """Route a query to the dataset whose indexed question embedding is nearest."""
    index = faiss.read_index("data_local/question_quantized.faiss")
    with open("data_local/dataset_mapping.json", "r") as f:
        dataset_names = json.load(f)

    question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    _, nearest_index = index.search(question_embedding, 1)
    best_dataset = dataset_names[nearest_index[0][0]]
    return best_dataset
def rerank_documents(query, retrieved_docs, top_k=5):
    """Score each (query, doc) pair with the cross-encoder and keep the best top_k."""
    scores = reranker.predict([(query, doc) for doc in retrieved_docs])
    ranked = sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked[:top_k]]
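
# Smoke-test sketch tying the pieces together. The query string is illustrative,
# and it assumes data_local/question_quantized.faiss and dataset_mapping.json
# exist alongside the per-dataset chunk and index files.
if __name__ == "__main__":
    demo_query = "How does BM25 score a query against a document?"
    dataset = find_query_dataset(demo_query)  # route the query to its source dataset
    top_docs = retrieve_documents_hybrid(demo_query, dataset, top_k=5)
    for rank, doc in enumerate(top_docs, start=1):
        print(f"[{rank}] {doc[:200]}")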