import os import pytesseract from pdf2image import convert_from_path from PIL import Image import pandas as pd from pypdf import PdfReader from pptx import Presentation import io class DocumentProcessor: def __init__(self): self.text_chunks = [] def process_file(self, file_path): ext = os.path.splitext(file_path)[1].lower() if ext == '.pdf': return self._process_pdf(file_path) elif ext in ('.png', '.jpg', '.jpeg'): return self._process_image(file_path) elif ext == '.pptx': return self._process_pptx(file_path) elif ext in ('.xlsx', '.xls'): return self._process_excel(file_path) else: raise ValueError(f"Unsupported file type: {ext}") def _process_pdf(self, file_path): try: # Try text extraction first reader = PdfReader(file_path) text = "\n".join([page.extract_text() for page in reader.pages]) if len(text.strip()) > 10: return self._chunk_text(text) except Exception: pass # Fallback to OCR images = convert_from_path(file_path) return self._process_images(images) def _process_image(self, file_path): return pytesseract.image_to_string(Image.open(file_path)) def _process_pptx(self, file_path): prs = Presentation(file_path) text = [] for slide in prs.slides: for shape in slide.shapes: if hasattr(shape, "text"): text.append(shape.text) return "\n".join(text) def _process_excel(self, file_path): df = pd.read_excel(file_path) return df.to_markdown() def _chunk_text(self, text, chunk_size=512): return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]