# Spaces:
# Paused
# Paused
import os | |
import pytesseract | |
from pdf2image import convert_from_path | |
from PIL import Image | |
import pandas as pd | |
from pypdf import PdfReader | |
from pptx import Presentation | |
import io | |
class DocumentProcessor:
    """Extract text from PDF, image, PowerPoint and Excel files.

    ``process_file`` dispatches on the file extension. PDFs are returned as
    a list of fixed-size text chunks; every other supported format is
    returned as a single string.
    """

    def __init__(self):
        # Not read or written by any method of this class; kept so external
        # code that relies on the attribute keeps working.
        self.text_chunks = []

    def process_file(self, file_path):
        """Extract text from ``file_path`` based on its extension.

        Args:
            file_path: Path to a .pdf, .png, .jpg, .jpeg, .pptx, .xlsx or
                .xls file. The extension is matched case-insensitively.

        Returns:
            A list of text chunks for PDFs, otherwise a single string.

        Raises:
            ValueError: If the extension is not one of the supported types.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == '.pdf':
            return self._process_pdf(file_path)
        if ext in ('.png', '.jpg', '.jpeg'):
            return self._process_image(file_path)
        if ext == '.pptx':
            return self._process_pptx(file_path)
        if ext in ('.xlsx', '.xls'):
            return self._process_excel(file_path)
        raise ValueError(f"Unsupported file type: {ext}")

    def _process_pdf(self, file_path):
        """Return the PDF's text as chunks, falling back to OCR.

        The embedded text layer is tried first. If it cannot be read, or it
        holds 10 or fewer non-whitespace-trimmed characters, the pages are
        rendered to images and run through OCR instead.
        """
        try:
            reader = PdfReader(file_path)
            # A page with no extractable text must not poison the join, so
            # coerce any empty/None result to "".
            text = "\n".join(
                page.extract_text() or "" for page in reader.pages
            )
        except Exception:
            # Deliberate best-effort: any failure of the text-layer path
            # (corrupt or encrypted file, parser error) falls through to OCR.
            text = ""

        if len(text.strip()) > 10:
            return self._chunk_text(text)

        # Fallback to OCR
        images = convert_from_path(file_path)
        return self._process_images(images)

    def _process_images(self, images):
        """OCR each image in ``images`` and return the combined text as chunks.

        Args:
            images: Iterable of images accepted by
                ``pytesseract.image_to_string`` (here, the rendered PDF pages).

        Returns:
            A list of text chunks, matching the text-layer path of
            ``_process_pdf``.
        """
        page_texts = [pytesseract.image_to_string(image) for image in images]
        return self._chunk_text("\n".join(page_texts))

    def _process_image(self, file_path):
        """Return the OCR text of a single image file as one string."""
        # The context manager closes the underlying file handle once OCR
        # has finished.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    def _process_pptx(self, file_path):
        """Return the text of every shape on every slide, newline-joined."""
        prs = Presentation(file_path)
        text = [
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            # Only shapes exposing a ``text`` attribute carry text.
            if hasattr(shape, "text")
        ]
        return "\n".join(text)

    def _process_excel(self, file_path):
        """Return the workbook data read by ``pd.read_excel`` as Markdown.

        ``pd.read_excel`` is called with its defaults, so only the sheet it
        selects by default is converted.
        """
        df = pd.read_excel(file_path)
        return df.to_markdown()

    def _chunk_text(self, text, chunk_size=512):
        """Split ``text`` into consecutive pieces of ``chunk_size`` characters.

        Args:
            text: The string to split.
            chunk_size: Maximum length of each chunk; must be positive.

        Returns:
            A list of substrings in order; the last one may be shorter.
            An empty ``text`` gives an empty list.

        Raises:
            ValueError: If ``chunk_size`` is not a positive integer.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        return [
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        ]