"""Load text from .txt, .docx and .pdf files and split it into chunks."""

import os
import re

from docx import Document
from PyPDF2 import PdfReader

# Split on whitespace that follows sentence-ending punctuation: ".", the
# Arabic question mark "؟", or "!". The lookbehind keeps the punctuation
# attached to the sentence it terminates.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.؟!])\s+')


def read_txt_file(file_path):
    """Return the full contents of the file at *file_path*, decoded as UTF-8."""
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()


def read_docx_file(file_path):
    """Return the text of every entry in the document's ``paragraphs``.

    Paragraph texts are joined with newlines.
    """
    doc = Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)


def read_pdf_file(file_path):
    """Return the extracted text of every page, joined with newlines.

    A page whose ``extract_text()`` result is falsy (``None`` or ``""``)
    contributes an empty string.
    """
    reader = PdfReader(file_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)


def split_arabic_text(text, chunk_size=500):
    """Split *text* into chunks made of whole sentences.

    Sentences are delimited by ".", "؟" or "!" followed by whitespace.
    Consecutive sentences are packed into one chunk, separated by single
    spaces, for as long as the pending chunk (counting one separator per
    sentence) plus the next sentence does not exceed *chunk_size*.

    A single sentence longer than *chunk_size* is not broken up; it becomes
    a chunk of its own, so chunks may exceed *chunk_size*.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings. Empty or whitespace-only input
        yields an empty list.
    """
    chunks = []
    pending = []  # sentences collected for the chunk being built
    pending_len = 0  # len of the pending sentences, +1 separator for each

    for sentence in _SENTENCE_BOUNDARY.split(text):
        # Blank pieces (empty input, trailing whitespace) carry no content
        # and would otherwise surface as empty chunks.
        if not sentence.strip():
            continue

        # Flush only when something is pending: an oversized first sentence
        # must not push an empty chunk ahead of itself.
        if pending and pending_len + len(sentence) > chunk_size:
            chunks.append(" ".join(pending).strip())
            pending = []
            pending_len = 0

        pending.append(sentence)
        pending_len += len(sentence) + 1

    if pending:
        chunks.append(" ".join(pending).strip())
    return chunks


def process_documents(file_path):
    """Read a document and return its text as sentence-aligned chunks.

    The reader is chosen from the file extension (case-insensitive):
    ``.txt``, ``.docx`` or ``.pdf``. Newlines and carriage returns are
    replaced by spaces before the text is chunked with the default
    chunk size of ``split_arabic_text``.

    Args:
        file_path: Path of the document to process.

    Returns:
        A list of text chunks, or an empty list when the extension is not
        one of the supported ones.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".txt":
        text = read_txt_file(file_path)
    elif ext == ".docx":
        text = read_docx_file(file_path)
    elif ext == ".pdf":
        text = read_pdf_file(file_path)
    else:
        # Unsupported formats are reported as "no chunks" rather than an error.
        return []

    clean_text = text.replace('\n', ' ').replace('\r', ' ').strip()
    return split_arabic_text(clean_text)