| """ | |
| Utility functions for text processing and embeddings. | |
| This module contains utility functions for text processing, tokenization, | |
| chunking, and embedding operations. | |
| """ | |
| from typing import List, Dict, Any, Tuple | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from langchain_community.document_loaders import PyMuPDFLoader | |
| from langchain.schema import Document | |
| from configs.config import Config | |


def estimate_tokens(text: str) -> int:
    """
    Estimate the number of tokens in a text (rough approximation).

    Args:
        text: Input text

    Returns:
        Estimated number of tokens
    """
    return len(text) // 4
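
# Rough illustration of the heuristic above: a 400-character string is
# estimated at 400 // 4 == 100 tokens. The 4-characters-per-token rule is an
# approximation and will drift for code, non-English text, or unusual tokenizers.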


def process_pdf_file(file_path: str) -> List[Document]:
    """
    Load a PDF file and extract its text with metadata.

    Args:
        file_path: Path to the PDF file

    Returns:
        List of Document objects with metadata

    Raises:
        FileNotFoundError: If the file doesn't exist
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    return documents
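
# Usage sketch (the path below is hypothetical): PyMuPDFLoader returns one
# Document per PDF page, with metadata fields such as "source" and the page
# number attached by the loader.
#   docs = process_pdf_file("paper.pdf")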


def chunk_text(documents: List[Document], max_length: int = None) -> List[Dict[str, Any]]:
    """
    Split documents into chunks with metadata.

    Args:
        documents: List of Document objects
        max_length: Maximum chunk size budget (defaults to Config.DEFAULT_CHUNK_SIZE)

    Returns:
        List of chunk dictionaries with text and metadata
    """
    if max_length is None:
        max_length = Config.DEFAULT_CHUNK_SIZE

    chunks = []
    for doc in documents:
        text = doc.page_content
        metadata = doc.metadata
        paragraphs = text.split("\n\n")

        current_chunk = ""
        current_metadata = metadata.copy()

        for paragraph in paragraphs:
            # Skip very short paragraphs
            if len(paragraph.strip()) < Config.MIN_PARAGRAPH_LENGTH:
                continue

            # estimate_tokens() already converts characters to ~tokens, and
            # max_length is divided by 4 again here, so max_length effectively
            # acts as a character budget for each chunk.
            if estimate_tokens(current_chunk + paragraph) <= max_length // 4:
                current_chunk += paragraph + "\n\n"
            else:
                # Only add chunks with meaningful content
                if current_chunk.strip() and len(current_chunk.strip()) > Config.MIN_CHUNK_LENGTH:
                    chunks.append({
                        "text": current_chunk.strip(),
                        "metadata": current_metadata
                    })
                current_chunk = paragraph + "\n\n"

        # Add the last chunk if it has meaningful content
        if current_chunk.strip() and len(current_chunk.strip()) > Config.MIN_CHUNK_LENGTH:
            chunks.append({
                "text": current_chunk.strip(),
                "metadata": current_metadata
            })

    return chunks
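
# Illustration (assumes Config's minimum lengths are small enough for the
# paragraphs to survive): each returned dict bundles one or more consecutive
# paragraphs from the same Document with that Document's metadata, e.g.
#   chunk_text([Document(page_content="First paragraph...\n\nSecond paragraph...",
#                        metadata={"page": 0})])
#   -> [{"text": "First paragraph...\n\nSecond paragraph...", "metadata": {"page": 0}}]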


def create_embeddings(chunks: List[Dict[str, Any]], model: SentenceTransformer) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
    """
    Create embeddings for a list of chunk texts.

    Args:
        chunks: List of chunk dictionaries
        model: SentenceTransformer model

    Returns:
        Tuple of (embeddings array, chunks)
    """
    texts = [chunk["text"] for chunk in chunks]
    embeddings = model.encode(texts, show_progress_bar=True, convert_to_tensor=True)
    return embeddings.cpu().numpy(), chunks
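
# Usage sketch (the model name is an illustrative assumption, not required by
# this module):
#   model = SentenceTransformer("all-MiniLM-L6-v2")
#   embeddings, chunks = create_embeddings(chunks, model)
#   embeddings.shape  ->  (len(chunks), model.get_sentence_embedding_dimension())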


def filter_relevant_chunks(chunks_data: List[Tuple], threshold: float = None) -> List[Tuple]:
    """
    Filter chunks based on similarity threshold.

    Args:
        chunks_data: List of (text, score, metadata) tuples
        threshold: Distance threshold; chunks scoring below it are kept
            (lower score means more similar)

    Returns:
        Filtered list of chunks
    """
    if threshold is None:
        threshold = Config.SIMILARITY_THRESHOLD

    return [chunk for chunk in chunks_data if len(chunk) >= 3 and chunk[1] < threshold]
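
# Example (illustrative): with distance-style scores such as FAISS L2, lower is
# more similar, so
#   filter_relevant_chunks([("a", 0.2, {}), ("b", 1.5, {})], threshold=0.5)
# keeps only the ("a", 0.2, {}) entry.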


def prepare_context_from_chunks(context_chunks: List[Tuple], max_tokens: int = None) -> str:
    """
    Prepare context string from chunk data.

    Args:
        context_chunks: List of (text, score, metadata) tuples
        max_tokens: Maximum tokens for context

    Returns:
        Formatted context string
    """
    if max_tokens is None:
        max_tokens = Config.MAX_CONTEXT_TOKENS

    # Sort chunks by relevance (lower distance = more relevant)
    sorted_chunks = sorted(context_chunks, key=lambda x: x[1]) if context_chunks else []

    # Filter out chunks with very high distance scores (low similarity)
    relevant_chunks = filter_relevant_chunks(sorted_chunks)

    context = ""
    total_tokens = 0
    # Unpack only the text; any elements beyond (text, score, metadata) are ignored.
    for chunk, *_ in relevant_chunks:
        if chunk and chunk.strip():
            chunk_tokens = estimate_tokens(chunk)
            if total_tokens + chunk_tokens <= max_tokens:
                context += chunk + "\n\n"
                total_tokens += chunk_tokens
            else:
                break

    return context.strip() if context else "No initial context provided from preliminary search."
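
# Example (illustrative, assuming both scores pass Config.SIMILARITY_THRESHOLD):
#   prepare_context_from_chunks([("B text", 0.9, {}), ("A text", 0.1, {})],
#                               max_tokens=50)
#   -> "A text\n\nB text"   # most similar chunk first, within the token budget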


def validate_chunk_data(chunk_data: Any) -> bool:
    """
    Validate chunk data structure.

    Args:
        chunk_data: Chunk data to validate

    Returns:
        True if valid, False otherwise
    """
    if not isinstance(chunk_data, (list, tuple)):
        return False
    if len(chunk_data) < 3:
        return False

    text, score, metadata = chunk_data[0], chunk_data[1], chunk_data[2]

    if not isinstance(text, str) or not text.strip():
        return False
    # Accept NumPy numeric scalars as well, since retrieval scores are often
    # NumPy types rather than built-in int/float.
    if not isinstance(score, (int, float, np.integer, np.floating)):
        return False
    if not isinstance(metadata, dict):
        return False

    return True
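

if __name__ == "__main__":
    # Minimal usage sketch for the pure-Python helpers above. It assumes that
    # configs.config.Config defines DEFAULT_CHUNK_SIZE, MIN_PARAGRAPH_LENGTH,
    # MIN_CHUNK_LENGTH, SIMILARITY_THRESHOLD and MAX_CONTEXT_TOKENS, as the
    # functions in this module already require. PDF loading and embedding are
    # omitted because they need a file on disk and a downloaded model.
    sample_docs = [
        Document(
            page_content=(
                "This is the first paragraph of a synthetic document used only "
                "to exercise the chunking helpers in this module.\n\n"
                "This is the second paragraph. It is deliberately long enough to "
                "survive the minimum-length filters applied during chunking."
            ),
            metadata={"source": "example.pdf", "page": 0},
        )
    ]
    chunks = chunk_text(sample_docs)
    print(f"chunk_text produced {len(chunks)} chunk(s)")

    # (text, distance, metadata) tuples, as a vector search step might return
    # them; lower distance means more similar.
    retrieved = [
        ("Relevant chunk text about the query topic.", 0.25, {"page": 0}),
        ("Loosely related chunk text.", 0.95, {"page": 1}),
    ]
    print(f"valid tuple: {validate_chunk_data(retrieved[0])}")
    print(f"kept after filtering: {len(filter_relevant_chunks(retrieved, threshold=0.5))}")
    print(prepare_context_from_chunks(retrieved, max_tokens=200))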