import json

import faiss
import numpy as np
from langchain.schema import Document
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

from data_processing import embedding_model

# Cross-encoder used to re-rank the merged FAISS + BM25 candidates.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# Optional NLTK-based preprocessing (currently unused; plain str.split() is used instead).
# import string
# import nltk
# nltk.download('punkt')
# nltk.download('punkt_tab')
# from nltk.tokenize import word_tokenize
#
# def preprocess(doc):
#     """Tokenize a document, lowercase, and drop punctuation tokens."""
#     return [word.lower() for word in word_tokenize(doc) if word not in string.punctuation]
def retrieve_documents_hybrid(query, q_dataset, top_k=5):
    """Hybrid retrieval: dense FAISS search plus sparse BM25, merged and cross-encoder re-ranked."""
    with open(f"data_local/{q_dataset}_chunked_docs.json", "r") as f:
        chunked_documents = json.load(f)  # All chunked documents for this dataset

    faiss_index_path = f"data_local/{q_dataset}_quantized.faiss"
    index = faiss.read_index(faiss_index_path)

    # Tokenize documents for BM25
    tokenized_docs = [doc.split() for doc in chunked_documents]
    bm25 = BM25Okapi(tokenized_docs)

    # Dense retrieval: embed the query and search the FAISS index
    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    query_embedding = query_embedding.reshape(1, -1)
    _, faiss_indices = index.search(query_embedding, top_k)
    faiss_docs = [chunked_documents[i] for i in faiss_indices[0]]

    # Sparse retrieval: BM25 over whitespace tokens
    tokenized_query = query.split()  # or preprocess(query) with the NLTK variant above
    bm25_scores = bm25.get_scores(tokenized_query)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]
    bm25_docs = [chunked_documents[i] for i in bm25_top_indices]

    # Alternative: fuse the raw BM25 and FAISS scores directly
    # (needs the FAISS distances, i.e. capture them from index.search instead of discarding them)
    # combined_results = set(bm25_top_indices).union(set(faiss_indices[0]))
    # combined_scores = rerank_docs_bm25faiss_scores(combined_results, bm25_scores, faiss_distances, faiss_indices)
    # reranked_docs = [chunked_documents[result[0]] for result in combined_scores[:top_k]]

    # Merge the FAISS and BM25 candidates with an order-preserving dedupe
    # (a plain set() would make the candidate pool nondeterministic),
    # then let the cross-encoder select the final top_k
    retrieved_docs = list(dict.fromkeys(faiss_docs + bm25_docs))
    reranked_docs = rerank_documents(query, retrieved_docs, top_k)
    return reranked_docs
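# Example usage (hypothetical dataset name; assumes data_local/<name>_chunked_docs.json
# and data_local/<name>_quantized.faiss were produced by the indexing step):
#   top_docs = retrieve_documents_hybrid("what is hybrid retrieval?", "example_dataset", top_k=5)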
def rerank_docs_bm25faiss_scores(combined_results_, bm25_scores_, faiss_distances_, faiss_indices_):
    """Rank candidate indices by the sum of their BM25 score and inverse FAISS distance."""
    final_results = []
    for idx in combined_results_:
        bm25_score = bm25_scores_[idx]
        # Inverse distance as a relevance proxy; 0.0 when FAISS did not return this index
        match = np.where(faiss_indices_[0] == idx)[0]
        faiss_score = 1.0 / (1.0 + faiss_distances_[0][match[0]]) if match.size else 0.0
        final_results.append((idx, bm25_score, faiss_score))
    # Sort by the naive sum of the two scores; they live on different scales,
    # so this ranking strategy is a rough heuristic
    final_results.sort(key=lambda x: (x[1] + x[2]), reverse=True)
    return final_results
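# The additive fusion above mixes incompatible scales (BM25 scores are unbounded,
# inverse distances live in (0, 1]). A scale-free alternative is Reciprocal Rank
# Fusion, which combines the two result lists by rank position alone. This is a
# sketch, not part of the original pipeline; k=60 is the conventional RRF constant.
def rerank_docs_rrf(bm25_top_indices_, faiss_indices_, k=60):
    """Fuse two rankings with RRF: score(d) = sum over rankings of 1 / (k + rank(d))."""
    rrf_scores = {}
    for ranking in (list(bm25_top_indices_), list(faiss_indices_[0])):
        for rank, idx in enumerate(ranking, start=1):
            rrf_scores[idx] = rrf_scores.get(idx, 0.0) + 1.0 / (k + rank)
    # Return (index, fused_score) pairs, best first
    return sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)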
# Legacy dense-only retrieval (superseded by retrieve_documents_hybrid):
# def retrieve_documents(query, top_k=5):
#     query_dataset = find_query_dataset(query)
#     # index, chunk_docs = load_data_from_faiss(query)
#     with open(f"data_local/{query_dataset}_chunked_docs.json", "r") as f:
#         documents = json.load(f)  # Contains all documents for this dataset
#     faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
#     index = faiss.read_index(faiss_index_path)
#     query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
#     _, nearest_indices = index.search(query_embedding, top_k)
#     retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
#     return retrieved_docs
def remove_duplicate_documents(documents):
    """Drop Documents whose page_content has already been seen, preserving order."""
    unique_documents = []
    seen_documents = set()
    for doc in documents:
        doc_content = doc.page_content
        if doc_content not in seen_documents:
            unique_documents.append(doc)
            seen_documents.add(doc_content)
    return unique_documents
def find_query_dataset(query):
    """Route a query to the dataset whose indexed question embedding is nearest."""
    index = faiss.read_index("data_local/question_quantized.faiss")
    with open("data_local/dataset_mapping.json", "r") as f:
        dataset_names = json.load(f)

    question_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
    _, nearest_index = index.search(question_embedding, 1)
    best_dataset = dataset_names[nearest_index[0][0]]
    return best_dataset
def rerank_documents(query, retrieved_docs, top_k=5):
    """Score each (query, doc) pair with the cross-encoder and keep the best top_k."""
    scores = reranker.predict([(query, doc) for doc in retrieved_docs])
    ranked = sorted(zip(scores, retrieved_docs), key=lambda pair: pair[0], reverse=True)
    return [doc for _, doc in ranked[:top_k]]
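
# Smoke-test sketch tying the pieces together. The query string is illustrative,
# and it assumes data_local/question_quantized.faiss and dataset_mapping.json
# exist alongside the per-dataset chunk and index files.
if __name__ == "__main__":
    demo_query = "How does BM25 score a query against a document?"
    dataset = find_query_dataset(demo_query)  # route the query to its source dataset
    top_docs = retrieve_documents_hybrid(demo_query, dataset, top_k=5)
    for rank, doc in enumerate(top_docs, start=1):
        print(f"[{rank}] {doc[:200]}")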