pope30 / utils.py
ramy2018's picture
Upload 6 files
fcd494c verified
raw
history blame
877 Bytes
import os
from PyPDF2 import PdfReader
from docx import Document
def process_pdf(file_path):
    """Extract the text of a PDF file and split it into paragraph chunks.

    The text of every page is read through ``PdfReader`` and terminated with
    a newline; the combined text is then split wherever two consecutive
    newlines occur.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of paragraph strings. Chunks that are empty or contain only
        whitespace are dropped, so a PDF without extractable text yields
        an empty list instead of a list of blank strings.
    """
    reader = PdfReader(file_path)
    # Assemble the full text with a single join instead of repeated ``+=``.
    text = "".join(page.extract_text() + "\n" for page in reader.pages)
    # Split the text into paragraphs, skipping blank chunks the same way
    # process_docx skips blank paragraphs.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
def process_docx(file_path):
    """Collect the non-blank paragraphs of a Word (.docx) document.

    Args:
        file_path: Path of the .docx file to open with ``Document``.

    Returns:
        A list with the text of each paragraph, in document order, leaving
        out paragraphs whose text is empty or whitespace-only.
    """
    kept = []
    for para in Document(file_path).paragraphs:
        content = para.text
        # Keep the paragraph text untouched; only blank ones are skipped.
        if content.strip():
            kept.append(content)
    return kept
def process_txt(file_path):
    """Read a UTF-8 text file and split it into paragraph chunks.

    The file content is split wherever two consecutive newlines occur.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of paragraph strings. Chunks that are empty or contain only
        whitespace are dropped, so an empty file yields an empty list.
        The text of the remaining chunks is returned unmodified.
    """
    # 'utf-8-sig' decodes plain UTF-8 as well, but also discards a leading
    # byte-order mark so it does not end up inside the first paragraph.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        text = f.read()
    # Skip blank chunks the same way process_docx skips blank paragraphs.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
def process_documents(file_path):
    """Split a document into paragraphs based on its file extension.

    The extension is compared case-insensitively. ``.pdf``, ``.docx`` and
    ``.txt`` files are handed to ``process_pdf``, ``process_docx`` and
    ``process_txt`` respectively.

    Args:
        file_path: Path of the document to process.

    Returns:
        The list produced by the matching handler, or an empty list when
        the extension is not one of the supported types.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        return process_pdf(file_path)
    if suffix == '.docx':
        return process_docx(file_path)
    if suffix == '.txt':
        return process_txt(file_path)

    # Unsupported or missing extension: nothing to extract.
    return []