File size: 877 Bytes
fcd494c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
import os
from PyPDF2 import PdfReader
from docx import Document
def process_pdf(file_path):
    """Extract the text of a PDF and split it into paragraphs.

    Each page's text is terminated with a newline, the pages are
    concatenated in order, and the result is split on blank lines
    (two consecutive newlines).

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        list[str]: Paragraph chunks in document order. Whitespace-only
        chunks are dropped, so a PDF with no extractable text yields ``[]``.
    """
    reader = PdfReader(file_path)
    # Defensive: a falsy extraction result is treated as empty text so a
    # single page without text cannot break the concatenation.
    page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    # Join once instead of growing a string with += for every page.
    text = "".join(page_texts)
    # Split the text into paragraphs; skip blank chunks, as process_docx does.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
def process_docx(file_path):
    """Collect the non-blank paragraphs of a Word document.

    Args:
        file_path: Location of the ``.docx`` file, handed to ``Document``.

    Returns:
        list: Text of each paragraph in document order, skipping paragraphs
        whose text is empty or whitespace-only. Kept text is not stripped.
    """
    kept = []
    for paragraph in Document(file_path).paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return kept
def process_txt(file_path):
    """Read a UTF-8 text file and split it into paragraphs.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: Chunks separated by a blank line, in file order.
        Whitespace-only chunks are dropped, so an empty file yields ``[]``
        rather than ``['']``. Kept chunks are returned unstripped.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Skip blank chunks so the result matches what process_docx produces.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
def process_documents(file_path):
    """Split a document into paragraphs using the parser for its extension.

    The extension is compared case-insensitively. ``.pdf``, ``.docx`` and
    ``.txt`` are handled; any other extension produces an empty list.

    Args:
        file_path: Path to the document.

    Returns:
        Whatever the selected parser returns, or ``[]`` when the extension
        is not one of the supported ones.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()
    if suffix == '.pdf':
        return process_pdf(file_path)
    if suffix == '.docx':
        return process_docx(file_path)
    if suffix == '.txt':
        return process_txt(file_path)
    # Unsupported extension: nothing to parse.
    return []
|