import threading

from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

lock = threading.Lock()


def llama_index_sentence_splitter(
    documents: list[str],
    document_ids: list[str],
    chunk_size: int = 256,
    skip_chunks_threshold: int = 30,
) -> list[dict]:
    """
    Split documents into chunks using the SentenceSplitter from LlamaIndex.

    Args:
        documents (list[str]): List of documents to be split into chunks.
        document_ids (list[str]): List of document IDs corresponding to the documents.
        chunk_size (int): Maximum size of each chunk, in tokens. Default is 256.
        skip_chunks_threshold (int): Minimum length (in characters, after whitespace
            normalization) a chunk must exceed to be kept. Default is 30.

    Returns:
        list[dict]: A list of dictionaries, each containing a document ID and its
            corresponding content chunk.
    """

    def remove_spaces_and_newlines(text: str) -> str:
        """Collapse newlines and runs of whitespace into single spaces."""
        return " ".join(text.split())

    # Overlap is a quarter of the chunk size, capped at 64 tokens.
    chunk_overlap = min(chunk_size / 4, min(chunk_size / 2, 64))

    # See here for more details on why we use a lock:
    # https://stackoverflow.com/questions/27433370/what-would-cause-wordnetcorpusreader-to-have-no-attribute-lazycorpusloader
    with lock:
        node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=int(chunk_overlap))

    chunks = []
    docs = [[Document(text=doc)] for doc in documents]
    for doc_id, doc in zip(document_ids, docs):
        texts = [node.text for node in node_parser(doc)]
        # Filter out chunks that do not contain enough text
        texts = [
            text
            for text in texts
            if len(remove_spaces_and_newlines(text)) > skip_chunks_threshold
        ]
        if not texts:
            continue
        chunks += [
            {"document_id": doc_id, "content": text}
            for text in texts
        ]
    return chunks
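

# Illustrative usage sketch (not part of the original source): the sample documents
# and IDs below are made up purely to show the expected input and output shapes.
if __name__ == "__main__":
    sample_docs = [
        "LlamaIndex splits long documents into overlapping chunks. Each chunk can "
        "then be embedded and indexed on its own, which keeps retrieval granular.",
        "Too short.",  # Shorter than skip_chunks_threshold, so it will be dropped.
    ]
    sample_ids = ["doc-1", "doc-2"]

    for chunk in llama_index_sentence_splitter(sample_docs, sample_ids, chunk_size=256):
        print(chunk["document_id"], repr(chunk["content"][:60]))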