# file: chunking_parent.py
import uuid
from typing import Dict, List, Tuple

from langchain_core.documents import Document
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- Configuration for Parent-Child Splitting ---
# Parent chunks are the larger documents passed to the LLM for context.
PARENT_CHUNK_SIZE = 2000
PARENT_CHUNK_OVERLAP = 200
# Child chunks are the smaller, more granular documents used for retrieval.
CHILD_CHUNK_SIZE = 400
CHILD_CHUNK_OVERLAP = 100


def create_parent_child_chunks(
    full_text: str,
) -> Tuple[List[Document], InMemoryStore, Dict[str, str]]:
    """
    Implements the Parent Document strategy for chunking.

    1. Splits the document into larger "parent" chunks.
    2. Splits the parent chunks into smaller "child" chunks.
    3. The child chunks are used for retrieval, while the parent chunks
       are used to provide context to the LLM.

    Args:
        full_text: The entire text content of the document.

    Returns:
        A tuple containing:
        - A list of the small "child" documents for the vector store.
        - An in-memory store mapping parent document IDs to the parent documents.
        - A dictionary mapping child document IDs to their parent's ID.
    """
    if not full_text:
        print("Warning: Input text for chunking is empty.")
        return [], InMemoryStore(), {}

    print("Creating parent and child chunks...")

    # This splitter creates the large documents that will be stored.
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=PARENT_CHUNK_SIZE,
        chunk_overlap=PARENT_CHUNK_OVERLAP,
    )
    # This splitter creates the small, granular chunks for retrieval.
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHILD_CHUNK_SIZE,
        chunk_overlap=CHILD_CHUNK_OVERLAP,
    )
    parent_documents = parent_splitter.create_documents([full_text])

    docstore = InMemoryStore()
    child_documents = []
    child_to_parent_id_map = {}

    # Generate a unique ID for each parent document and add them to the store.
    parent_ids = [str(uuid.uuid4()) for _ in parent_documents]
    docstore.mset(list(zip(parent_ids, parent_documents)))
    # Split each parent document into smaller child documents, tagging each
    # child with its own ID and the ID of the parent it came from.
    for i, p_doc in enumerate(parent_documents):
        parent_id = parent_ids[i]
        _child_docs = child_splitter.split_documents([p_doc])
        for _child_doc in _child_docs:
            child_id = str(uuid.uuid4())
            _child_doc.metadata["parent_id"] = parent_id
            _child_doc.metadata["child_id"] = child_id
            child_to_parent_id_map[child_id] = parent_id
        child_documents.extend(_child_docs)

    print(f"Created {len(parent_documents)} parent chunks and {len(child_documents)} child chunks.")
    return child_documents, docstore, child_to_parent_id_map
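

# --- Usage sketch (assumptions flagged) ---
# A minimal sketch of how the returned pieces fit together at query time:
# index the small child chunks in a vector store, retrieve the best-matching
# children, then swap each child for its parent via the docstore before
# prompting the LLM. FAISS and FakeEmbeddings are stand-ins chosen so the
# example runs without API keys (they require the `faiss-cpu` and
# `langchain-community` packages); any real vector store and embedding model
# would slot in the same way.
if __name__ == "__main__":
    from langchain_community.embeddings import FakeEmbeddings
    from langchain_community.vectorstores import FAISS

    sample_text = "Lorem ipsum dolor sit amet. " * 500  # placeholder document
    children, store, child_to_parent = create_parent_child_chunks(sample_text)

    # Index only the child chunks; their small size gives sharper matches.
    vectorstore = FAISS.from_documents(children, FakeEmbeddings(size=128))

    # Retrieve children, then de-duplicate parent IDs and fetch the larger
    # parent documents to hand to the LLM as context.
    hits = vectorstore.similarity_search("lorem ipsum", k=4)
    parent_ids = list({hit.metadata["parent_id"] for hit in hits})
    parents = store.mget(parent_ids)
    print(f"Retrieved {len(hits)} children covering {len(parents)} parents.")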