File size: 877 Bytes
fcd494c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
from PyPDF2 import PdfReader
from docx import Document

def process_pdf(file_path):
    """Extract the text of a PDF file and split it into paragraphs.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of paragraph strings. The text of every page is followed by
        a newline, the combined text is split on double newlines, and
        chunks that are empty or whitespace-only are dropped (the same
        filtering process_docx applies to its paragraphs).
    """
    reader = PdfReader(file_path)
    # `or ''` keeps the join safe if a page yields no text
    # (NOTE(review): some PyPDF2 versions may return None here - confirm).
    text = "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
    # Split the text into paragraphs, skipping blank ones so an empty
    # document yields [] rather than [''].
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]

def process_docx(file_path):
    """Return the non-blank paragraphs of a .docx file.

    Args:
        file_path: Path of the Word document to open.

    Returns:
        A list with the text of each paragraph whose text is not empty
        after stripping whitespace. The text itself is returned unstripped.
    """
    kept = []
    for para in Document(file_path).paragraphs:
        content = para.text
        # Whitespace-only paragraphs are skipped.
        if content.strip():
            kept.append(content)
    return kept

def process_txt(file_path):
    """Read a UTF-8 text file and split it into paragraphs.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of paragraph strings obtained by splitting the file content
        on double newlines. Chunks that are empty or whitespace-only are
        dropped (the same filtering process_docx applies), so an empty
        file yields [] rather than [''].
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Runs of more than two newlines produce empty chunks; filter them out.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]

def process_documents(file_path):
    """Dispatch a file to the matching paragraph extractor by extension.

    Args:
        file_path: Path of the document; its extension is compared
            case-insensitively.

    Returns:
        The result of process_pdf, process_docx or process_txt for
        '.pdf', '.docx' and '.txt' files respectively, or an empty list
        for any other extension.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        return process_pdf(file_path)
    if suffix == '.docx':
        return process_docx(file_path)
    if suffix == '.txt':
        return process_txt(file_path)

    # Unsupported (or missing) extension: nothing to extract.
    return []