pope30 / utils.py
ramy2018's picture
Update utils.py
4932f0a verified
import os
import re
from docx import Document
from PyPDF2 import PdfReader
def read_txt_file(file_path):
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the file to open for reading.

    Returns:
        The whole file as a single string.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def read_docx_file(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Opens ``file_path`` with ``Document`` and joins the ``text`` of every
    entry in its ``paragraphs`` collection with newline characters.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        The paragraph texts joined by ``"\\n"``.
    """
    document = Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def read_pdf_file(file_path):
    """Return the extracted text of a PDF, one page per line.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of every page joined by ``"\\n"``. A page whose
        ``extract_text()`` result is falsy contributes an empty string.
    """
    pdf = PdfReader(file_path)
    page_texts = []
    for page in pdf.pages:
        extracted = page.extract_text()
        # Substitute "" for a falsy result so the join never sees a non-string.
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
def split_arabic_text(text, chunk_size=500):
    """Split text into chunks made of whole sentences.

    Sentences are delimited by whitespace that follows a full stop, an
    Arabic question mark (U+061F) or an exclamation mark. Consecutive
    sentences are packed into the same chunk while the accumulated length
    (each sentence counted with one trailing separator space) stays within
    ``chunk_size``.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length in characters. A single
            sentence longer than this is never cut; it becomes a chunk of
            its own that exceeds the limit.

    Returns:
        A list of non-empty, whitespace-trimmed chunks. Empty or
        whitespace-only input yields an empty list.
    """
    # U+061F (Arabic question mark) is written as an escape so the pattern
    # cannot be corrupted if the file is re-saved with the wrong encoding.
    sentences = re.split(r'(?<=[.\u061F!])\s+', text)
    chunks = []
    current = ""
    for sentence in sentences:
        if not sentence:
            # Whitespace after the final terminator produces a trailing "".
            continue
        # Only flush when something has been accumulated; otherwise an
        # over-long first sentence would emit an empty chunk.
        if current and len(current) + len(sentence) > chunk_size:
            finished = current.strip()
            if finished:
                chunks.append(finished)
            current = ""
        current += sentence + " "
    remainder = current.strip()
    if remainder:
        chunks.append(remainder)
    return chunks
def process_documents(file_path):
    """Read a supported document and return its text split into chunks.

    The reader is chosen from the lower-cased file extension: ``.txt``,
    ``.docx`` or ``.pdf``. Line breaks in the extracted text are replaced
    with spaces and the result is trimmed before being chunked.

    Args:
        file_path: Path of the document to process.

    Returns:
        The list produced by ``split_arabic_text`` for the document text,
        or an empty list when the extension is not one of the supported ones.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Guard clause: unsupported file types produce no chunks.
    if suffix not in (".txt", ".docx", ".pdf"):
        return []

    if suffix == ".txt":
        raw = read_txt_file(file_path)
    elif suffix == ".docx":
        raw = read_docx_file(file_path)
    else:
        raw = read_pdf_file(file_path)

    # Flatten line breaks so sentence splitting works on one continuous line.
    flattened = raw.replace('\n', ' ')
    flattened = flattened.replace('\r', ' ')
    return split_arabic_text(flattened.strip())