import os
from typing import List

from docx import Document as DocxDocument
from langchain.schema import Document
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity


# Load individual file types
def load_txt(path: str) -> List[Document]:
    return TextLoader(path, encoding="utf-8").load()


def load_pdf(path: str) -> List[Document]:
    return PyPDFLoader(path).load()


def load_docx(path: str) -> List[Document]:
    full_text = "\n".join(
        para.text.strip() for para in DocxDocument(path).paragraphs if para.text.strip()
    )
    return [Document(page_content=full_text, metadata={"source": path})]


# Load multiple documents from a list of paths
def load_documents(paths: List[str]) -> List[Document]:
    all_docs = []
    for path in paths:
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            all_docs.extend(load_txt(path))
        elif ext == ".pdf":
            all_docs.extend(load_pdf(path))
        elif ext == ".docx":
            all_docs.extend(load_docx(path))
    return all_docs


# Split using the recursive character method
def recursive_split(docs: List[Document], chunk_size=1024, overlap=100) -> List[Document]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", "।"],  # "।" is the Bangla danda (sentence terminator)
    )
    return splitter.split_documents(docs)


# Split by paragraph blocks, packing paragraphs into chunks of up to chunk_size characters
def paragraph_split(docs: List[Document], chunk_size=512) -> List[Document]:
    chunks = []
    for doc in docs:
        paras = doc.page_content.split("\n\n")
        buf = ""
        for para in paras:
            if len(buf) + len(para) <= chunk_size:
                buf += para + "\n\n"
            else:
                chunks.append(Document(page_content=buf.strip(), metadata=doc.metadata))
                buf = para + "\n\n"
        if buf:
            chunks.append(Document(page_content=buf.strip(), metadata=doc.metadata))
    return chunks


# Split based on semantic similarity between adjacent paragraphs
def semantic_split(docs: List[Document], embedder, threshold=0.85) -> List[Document]:
    chunks = []
    for doc in docs:
        paras = doc.page_content.split("\n\n")
        if len(paras) < 2:
            chunks.append(doc)
            continue
        buf = []
        for i in range(len(paras) - 1):
            buf.append(paras[i])
            # E5 models expect a "passage: " prefix when embedding document text
            v1 = embedder.embed_query("passage: " + paras[i])
            v2 = embedder.embed_query("passage: " + paras[i + 1])
            sim = cosine_similarity([v1], [v2])[0][0]
            # Start a new chunk when adjacent paragraphs diverge semantically
            if sim < threshold:
                chunks.append(Document(page_content="\n\n".join(buf).strip(), metadata=doc.metadata))
                buf = []
        buf.append(paras[-1])
        if buf:
            chunks.append(Document(page_content="\n\n".join(buf).strip(), metadata=doc.metadata))
    return chunks


# Save document chunks to a FAISS vectorstore
def save_vectorstore(docs: List[Document], path: str, embedder):
    vectordb = FAISS.from_documents(docs, embedder)
    vectordb.save_local(path)
    print(f"Saved to: {path}")


# Build separate FAISS DBs for Bangla and English files
def build_dual_stores(bn_files: List[str], en_files: List[str], strategy="semantic"):
    embedder = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
    splitter = {
        "semantic": lambda d: semantic_split(d, embedder),
        "paragraph": paragraph_split,
        "recursive": recursive_split,
    }.get(strategy)
    if not splitter:
        raise ValueError(f"Invalid strategy: {strategy}")

    bn_chunks = splitter(load_documents(bn_files))
    save_vectorstore(bn_chunks, f"vectorstore_bn_{strategy}", embedder)

    en_chunks = splitter(load_documents(en_files))
    save_vectorstore(en_chunks, f"vectorstore_en_{strategy}", embedder)


# Query any FAISS DB
def query_vectorstore(query: str, path: str, top_k: int = 3):
    embedder = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
    db = FAISS.load_local(path, embedder, allow_dangerous_deserialization=True)
    # E5 models expect a "query: " prefix on the search side
    results = db.similarity_search("query: " + query, k=top_k)
    docs = [doc.page_content for doc in results]
    for i, doc in enumerate(results):
        print(f"\nResult {i + 1}:\n{doc.page_content}\n")
    return docs


if __name__ == "__main__":
    build_dual_stores(['KB/BanglaKB.docx'], ['KB/EnglishKB.docx'], strategy="recursive")
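
# Example usage (a sketch, not part of the original script): once the stores above
# have been built, they can be queried with query_vectorstore. The store names below
# follow the f"vectorstore_{lang}_{strategy}" pattern used in build_dual_stores;
# the query strings are placeholders.
#
#     query_vectorstore("What topics does the English KB cover?", "vectorstore_en_recursive", top_k=3)
#     query_vectorstore("বাংলা জ্ঞানভান্ডারে কী কী বিষয় আছে?", "vectorstore_bn_recursive", top_k=3)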