# file: chunking.py
import uuid
from typing import Dict, List, Tuple

from langchain_core.documents import Document
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Configuration for Parent-Child Splitting ---
# Parent chunks are the larger documents passed to the LLM for context.
PARENT_CHUNK_SIZE = 2000
PARENT_CHUNK_OVERLAP = 200

# Child chunks are the smaller, more granular documents used for retrieval.
CHILD_CHUNK_SIZE = 400
CHILD_CHUNK_OVERLAP = 100

def create_parent_child_chunks(
    full_text: str
) -> Tuple[List[Document], InMemoryStore, Dict[str, str]]:
    """
    Implements the Parent Document strategy for chunking.

    1. Splits the document into larger "parent" chunks.
    2. Splits the parent chunks into smaller "child" chunks.
    3. The child chunks are used for retrieval, while the parent chunks
       are used to provide context to the LLM.

    Args:
        full_text: The entire text content of the document.

    Returns:
        A tuple containing:
        - A list of the small "child" documents for the vector store.
        - An in-memory store mapping parent document IDs to the parent documents.
        - A dictionary mapping child document IDs to their parent's ID.
    """
    if not full_text:
        print("Warning: Input text for chunking is empty.")
        return [], InMemoryStore(), {}

    print("Creating parent and child chunks...")

    # This splitter creates the large documents that will be stored.
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=PARENT_CHUNK_SIZE,
        chunk_overlap=PARENT_CHUNK_OVERLAP,
    )
    # This splitter creates the small, granular chunks for retrieval.
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHILD_CHUNK_SIZE,
        chunk_overlap=CHILD_CHUNK_OVERLAP,
    )
    parent_documents = parent_splitter.create_documents([full_text])

    docstore = InMemoryStore()
    child_documents = []
    child_to_parent_id_map = {}

    # Generate unique IDs for each parent document and add them to the store.
    parent_ids = [str(uuid.uuid4()) for _ in parent_documents]
    docstore.mset(list(zip(parent_ids, parent_documents)))

    # Split each parent document into smaller child documents.
    for i, p_doc in enumerate(parent_documents):
        parent_id = parent_ids[i]
        _child_docs = child_splitter.split_documents([p_doc])
        for _child_doc in _child_docs:
            child_id = str(uuid.uuid4())
            _child_doc.metadata["parent_id"] = parent_id
            _child_doc.metadata["child_id"] = child_id
            child_to_parent_id_map[child_id] = parent_id
        child_documents.extend(_child_docs)

    print(
        f"Created {len(parent_documents)} parent chunks "
        f"and {len(child_documents)} child chunks."
    )
    return child_documents, docstore, child_to_parent_id_map
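

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how the three return values fit together at query
# time. Retrieval itself is stubbed out: in practice the child chunks would
# be embedded into a vector store and the "retrieved" child would come from
# a similarity search. The sample text below is a placeholder assumption.
if __name__ == "__main__":
    sample_text = (
        "Parent-child chunking retrieves on small chunks but prompts "
        "the LLM with their larger parent chunks. " * 100
    )
    children, store, child_to_parent = create_parent_child_chunks(sample_text)

    # Pretend the first child chunk was returned by a similarity search,
    # then resolve it back to its parent chunk for the LLM prompt.
    retrieved_child = children[0]
    parent_id = retrieved_child.metadata["parent_id"]
    [parent_doc] = store.mget([parent_id])

    print(
        f"Retrieved child ({len(retrieved_child.page_content)} chars) "
        f"resolves to parent ({len(parent_doc.page_content)} chars)."
    )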