import os
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from rapidfuzz import fuzz

embedder = SentenceTransformer("all-MiniLM-L6-v2")
faiss_index = None
pdf_chunks = []
chunk_texts = []

def process_pdfs(pdf_files):
    """Extract text from the uploaded PDFs, split it into chunks, and build a FAISS index."""
    global faiss_index, pdf_chunks, chunk_texts
    all_text = ""
    for pdf_file in pdf_files:
        reader = PyPDF2.PdfReader(pdf_file.name)
        for page in reader.pages:
            # extract_text() can return None on pages with no extractable text.
            all_text += (page.extract_text() or "") + "\n"
    # Fixed-size character chunking.
    chunk_size = 500
    pdf_chunks = [all_text[i:i + chunk_size] for i in range(0, len(all_text), chunk_size)]
    chunk_texts = pdf_chunks
    # FAISS expects float32 vectors.
    embeddings = embedder.encode(pdf_chunks, convert_to_numpy=True).astype(np.float32)
    dim = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(dim)
    faiss_index.add(embeddings)
    return f"Processed {len(pdf_chunks)} chunks from {len(pdf_files)} PDF(s)."

def semantic_search(query, top_k=3):
    """Return the top_k chunks closest to the query in embedding space."""
    global faiss_index, chunk_texts
    if faiss_index is None or not chunk_texts:
        return []
    query_emb = embedder.encode([query], convert_to_numpy=True).astype(np.float32)
    D, I = faiss_index.search(query_emb, top_k)
    # FAISS pads results with -1 when fewer than top_k vectors are indexed.
    return [chunk_texts[i] for i in I[0] if 0 <= i < len(chunk_texts)]

def keyword_search(query, top_k=3):
    """Rank chunks by fuzzy keyword overlap with the query."""
    global chunk_texts
    if not chunk_texts:
        return []
    scored = [(chunk, fuzz.partial_ratio(query.lower(), chunk.lower())) for chunk in chunk_texts]
    scored.sort(key=lambda x: x[1], reverse=True)
    return [chunk for chunk, score in scored[:top_k]]

def retrieve_context(query, top_k=3):
    """Merge semantic and keyword hits, preserving order and dropping duplicates."""
    semantic_results = semantic_search(query, top_k)
    keyword_results = keyword_search(query, top_k)
    combined = []
    seen = set()
    for chunk in semantic_results + keyword_results:
        if chunk not in seen:
            combined.append(chunk)
            seen.add(chunk)
    return "\n".join(combined)
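
# A minimal usage sketch, assuming the functions above are called directly from a
# script. `_Upload`, "example.pdf", and the sample query are hypothetical stand-ins
# for whatever upload objects (e.g. Gradio file handles exposing a `.name` path)
# the surrounding application actually passes in.
if __name__ == "__main__":
    class _Upload:
        def __init__(self, name):
            self.name = name

    files = [_Upload("example.pdf")]  # hypothetical local PDF path
    print(process_pdfs(files))
    print(retrieve_context("What does the document say about pricing?", top_k=3))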