import threading

from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

lock = threading.Lock()


def llama_index_sentence_splitter(
    documents: list[str],
    document_ids: list[str],
    chunk_size: int = 256,
    skip_chunks_threshold: int = 30,
) -> list[dict]:
    """
    Split documents into chunks using the SentenceSplitter from LlamaIndex.

    Args:
        documents (list[str]): List of documents to be split into chunks.
        document_ids (list[str]): List of document IDs corresponding to the documents.
        chunk_size (int): Maximum size of each chunk, in tokens. Default is 256.
        skip_chunks_threshold (int): Minimum length (in characters, after whitespace
            normalization) a chunk must exceed to be kept. Default is 30.

    Returns:
        list[dict]: A list of dictionaries, each containing a document ID and its
            corresponding content chunk.
    """

    def remove_spaces_and_newlines(text: str) -> str:
        """Collapse newlines and runs of whitespace into single spaces."""
        return " ".join(text.split())

    # Overlap is a quarter of the chunk size, capped at 64 tokens.
    chunk_overlap = min(chunk_size / 4, min(chunk_size / 2, 64))

    # See here for more details on why we use a lock:
    # https://stackoverflow.com/questions/27433370/what-would-cause-wordnetcorpusreader-to-have-no-attribute-lazycorpusloader
    with lock:
        node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_overlap))

    chunks = []
    docs = [[Document(text=doc)] for doc in documents]
    for doc_id, doc in zip(document_ids, docs):
        texts = [node.text for node in node_parser(doc)]
        # Filter out chunks that do not contain enough text
        texts = [
            text
            for text in texts
            if len(remove_spaces_and_newlines(text)) > skip_chunks_threshold
        ]
        if not texts:
            continue
        chunks += [
            {"document_id": doc_id, "content": text}
            for text in texts
        ]
    return chunks
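

# Illustrative usage sketch (not part of the original source): the sample documents
# and IDs below are made up purely to show the expected input and output shapes.
if __name__ == "__main__":
    sample_docs = [
        "LlamaIndex splits long documents into overlapping chunks. Each chunk can "
        "then be embedded and indexed on its own, which keeps retrieval granular.",
        "Too short.",  # Shorter than skip_chunks_threshold, so it will be dropped.
    ]
    sample_ids = ["doc-1", "doc-2"]

    for chunk in llama_index_sentence_splitter(sample_docs, sample_ids, chunk_size=256):
        print(chunk["document_id"], repr(chunk["content"][:60]))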