oration / document_processor.py
mayureshkamwal
first commit
ccb3135
import os
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import pandas as pd
from pypdf import PdfReader
from pptx import Presentation
import io
class DocumentProcessor:
    """Extract text from PDF, image, PowerPoint and Excel files.

    The entry point is :meth:`process_file`, which dispatches on the file
    extension to one of the private ``_process_*`` helpers.

    Note on return types: the PDF path returns a *list* of text chunks,
    while the image, PowerPoint and Excel paths return a single string.
    That asymmetry is preserved here so existing callers keep working.
    """

    def __init__(self):
        # Not written to by any method of this class; kept because callers
        # may read or populate it.
        self.text_chunks = []

    def process_file(self, file_path):
        """Extract text from *file_path* based on its extension.

        Args:
            file_path: Path to a ``.pdf``, ``.png``, ``.jpg``, ``.jpeg``,
                ``.pptx``, ``.xlsx`` or ``.xls`` file. The extension match
                is case-insensitive.

        Returns:
            A list of text chunks for PDFs; a string for every other
            supported type.

        Raises:
            ValueError: If the extension is not one of the supported types.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == '.pdf':
            return self._process_pdf(file_path)
        elif ext in ('.png', '.jpg', '.jpeg'):
            return self._process_image(file_path)
        elif ext == '.pptx':
            return self._process_pptx(file_path)
        elif ext in ('.xlsx', '.xls'):
            return self._process_excel(file_path)
        else:
            raise ValueError(f"Unsupported file type: {ext}")

    def _process_pdf(self, file_path):
        """Return the PDF's text as a list of chunks.

        The embedded text layer is tried first. If it cannot be read, or it
        yields 10 or fewer non-whitespace-trimmed characters, the pages are
        rendered to images and run through OCR instead.
        """
        try:
            reader = PdfReader(file_path)
            # ``or ""`` guards against a page yielding None instead of a
            # string, which would otherwise make the join raise.
            text = "\n".join(page.extract_text() or "" for page in reader.pages)
        except Exception:
            # Deliberate best-effort: any failure while reading the text
            # layer simply means we fall through to OCR below.
            text = ""

        if len(text.strip()) > 10:
            return self._chunk_text(text)

        # Fallback to OCR
        images = convert_from_path(file_path)
        return self._process_images(images)

    def _process_images(self, images):
        """OCR an iterable of page images and return the text as chunks.

        Each image is passed to ``pytesseract.image_to_string``; the per-page
        results are joined with newlines and chunked the same way as the
        text-layer path of :meth:`_process_pdf`.
        """
        page_texts = [pytesseract.image_to_string(image) for image in images]
        return self._chunk_text("\n".join(page_texts))

    def _process_image(self, file_path):
        """Run OCR on a single image file and return the recognized text."""
        # The context manager closes the underlying file handle once OCR
        # has finished, instead of leaving it to garbage collection.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    def _process_pptx(self, file_path):
        """Return the text of every text-bearing shape, one per line.

        Slides and shapes are visited in the order the presentation yields
        them; shapes without a ``text`` attribute are skipped.
        """
        prs = Presentation(file_path)
        text = [
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        return "\n".join(text)

    def _process_excel(self, file_path):
        """Return the workbook contents rendered as a Markdown table.

        ``pd.read_excel`` is called with its defaults, so only the sheet it
        selects by default is read.
        """
        df = pd.read_excel(file_path)
        return df.to_markdown()

    def _chunk_text(self, text: str, chunk_size: int = 512) -> list:
        """Split *text* into consecutive pieces of at most *chunk_size* chars.

        Args:
            text: The string to split.
            chunk_size: Maximum length of each chunk; must be positive.

        Returns:
            A list of substrings covering *text* in order. The last chunk
            may be shorter; an empty *text* yields an empty list.

        Raises:
            ValueError: If *chunk_size* is not a positive integer.
        """
        if chunk_size <= 0:
            # A negative step would silently produce no chunks at all.
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]