import os
from typing import List

from docx import Document as DocxDocument
from langchain.schema import Document
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity


# Load individual file types
def load_txt(path: str) -> List[Document]:
    return TextLoader(path, encoding="utf-8").load()


def load_pdf(path: str) -> List[Document]:
    return PyPDFLoader(path).load()


def load_docx(path: str) -> List[Document]:
    full_text = "\n".join(
        para.text.strip() for para in DocxDocument(path).paragraphs if para.text.strip()
    )
    return [Document(page_content=full_text, metadata={"source": path})]


# Load multiple documents from a list of paths
def load_documents(paths: List[str]) -> List[Document]:
    all_docs = []
    for path in paths:
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            all_docs.extend(load_txt(path))
        elif ext == ".pdf":
            all_docs.extend(load_pdf(path))
        elif ext == ".docx":
            all_docs.extend(load_docx(path))
    return all_docs


# Split using the recursive character method
def recursive_split(docs: List[Document], chunk_size=1024, overlap=100) -> List[Document]:
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", "।"],  # "।" is the Bangla danda (sentence terminator)
    )
    return splitter.split_documents(docs)


# Split by paragraph blocks, packing paragraphs into chunks of up to chunk_size characters
def paragraph_split(docs: List[Document], chunk_size=512) -> List[Document]:
    chunks = []
    for doc in docs:
        paras = doc.page_content.split("\n\n")
        buf = ""
        for para in paras:
            if len(buf) + len(para) <= chunk_size:
                buf += para + "\n\n"
            else:
                chunks.append(Document(page_content=buf.strip(), metadata=doc.metadata))
                buf = para + "\n\n"
        if buf:
            chunks.append(Document(page_content=buf.strip(), metadata=doc.metadata))
    return chunks


# Split based on semantic similarity between adjacent paragraphs
def semantic_split(docs: List[Document], embedder, threshold=0.85) -> List[Document]:
    chunks = []
    for doc in docs:
        paras = doc.page_content.split("\n\n")
        if len(paras) < 2:
            chunks.append(doc)
            continue
        buf = []
        for i in range(len(paras) - 1):
            buf.append(paras[i])
            # E5 models expect a "passage: " prefix when embedding document text
            v1 = embedder.embed_query("passage: " + paras[i])
            v2 = embedder.embed_query("passage: " + paras[i + 1])
            sim = cosine_similarity([v1], [v2])[0][0]
            # Start a new chunk when adjacent paragraphs diverge semantically
            if sim < threshold:
                chunks.append(Document(page_content="\n\n".join(buf).strip(), metadata=doc.metadata))
                buf = []
        buf.append(paras[-1])
        if buf:
            chunks.append(Document(page_content="\n\n".join(buf).strip(), metadata=doc.metadata))
    return chunks


# Save document chunks to a FAISS vectorstore
def save_vectorstore(docs: List[Document], path: str, embedder):
    vectordb = FAISS.from_documents(docs, embedder)
    vectordb.save_local(path)
    print(f"Saved to: {path}")


# Build separate FAISS DBs for Bangla and English files
def build_dual_stores(bn_files: List[str], en_files: List[str], strategy="semantic"):
    embedder = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
    splitter = {
        "semantic": lambda d: semantic_split(d, embedder),
        "paragraph": paragraph_split,
        "recursive": recursive_split,
    }.get(strategy)
    if not splitter:
        raise ValueError(f"Invalid strategy: {strategy}")

    bn_chunks = splitter(load_documents(bn_files))
    save_vectorstore(bn_chunks, f"vectorstore_bn_{strategy}", embedder)

    en_chunks = splitter(load_documents(en_files))
    save_vectorstore(en_chunks, f"vectorstore_en_{strategy}", embedder)


# Query any FAISS DB
def query_vectorstore(query: str, path: str, top_k: int = 3):
    embedder = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-base")
    db = FAISS.load_local(path, embedder, allow_dangerous_deserialization=True)
    # E5 models expect a "query: " prefix on the search side
    results = db.similarity_search("query: " + query, k=top_k)
    docs = [doc.page_content for doc in results]
    for i, doc in enumerate(results):
        print(f"\nResult {i + 1}:\n{doc.page_content}\n")
    return docs


if __name__ == "__main__":
    build_dual_stores(['KB/BanglaKB.docx'], ['KB/EnglishKB.docx'], strategy="recursive")
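
# Example usage (a sketch, not part of the original script): once the stores above
# have been built, they can be queried with query_vectorstore. The store names below
# follow the f"vectorstore_{lang}_{strategy}" pattern used in build_dual_stores;
# the query strings are placeholders.
#
#     query_vectorstore("What topics does the English KB cover?", "vectorstore_en_recursive", top_k=3)
#     query_vectorstore("বাংলা জ্ঞানভান্ডারে কী কী বিষয় আছে?", "vectorstore_bn_recursive", top_k=3)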