|
import os |
|
from PyPDF2 import PdfReader |
|
from docx import Document |
|
|
|
def process_pdf(file_path):
    """Extract the text of a PDF and split it into blank-line-separated chunks.

    The text of every page is extracted in page order, each page's text is
    followed by a single newline, and the combined string is split on
    double newlines.

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        A list of text chunks in document order. Chunks that are empty or
        contain only whitespace are omitted, so a PDF with no extractable
        text yields ``[]``. Kept chunks are returned unmodified (not
        stripped).
    """
    reader = PdfReader(file_path)

    # Build the text in one pass instead of repeated ``+=``. The trailing
    # "\n" per page is kept so the resulting chunks match the previous output.
    # ``or ""`` is defensive: if the extractor ever hands back None for a
    # page, treat it as empty text rather than failing on ``None + str``.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # Drop blank chunks, mirroring how process_docx skips blank paragraphs.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
|
|
|
def process_docx(file_path):
    """Collect the text of every non-blank paragraph in a .docx file.

    Args:
        file_path: Location of the document, passed straight to ``Document``.

    Returns:
        A list with the ``text`` of each paragraph, in document order,
        skipping paragraphs whose text is empty or whitespace-only. The kept
        texts are returned unmodified (not stripped).
    """
    kept = []
    for para in Document(file_path).paragraphs:
        content = para.text
        # Whitespace-only paragraphs carry no content; skip them.
        if not content.strip():
            continue
        kept.append(content)
    return kept
|
|
|
def process_txt(file_path):
    """Read a UTF-8 text file and split it into blank-line-separated chunks.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of the pieces of the file content found between double
        newlines, in order. Pieces that are empty or contain only whitespace
        are omitted, so an empty file yields ``[]`` rather than ``['']``.
        Kept pieces are returned unmodified (not stripped).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Drop blank chunks, mirroring how process_docx skips blank paragraphs;
    # runs of 3+ newlines and empty files would otherwise produce them.
    return [chunk for chunk in text.split('\n\n') if chunk.strip()]
|
|
|
def process_documents(file_path):
    """Dispatch a file to the matching extractor based on its extension.

    The extension is compared case-insensitively. ``.pdf`` files go to
    ``process_pdf``, ``.docx`` files to ``process_docx`` and ``.txt`` files
    to ``process_txt``.

    Args:
        file_path: Path of the document to process.

    Returns:
        Whatever the selected extractor returns, or an empty list when the
        extension is not one of the three supported ones.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        return process_pdf(file_path)
    if suffix == '.docx':
        return process_docx(file_path)
    if suffix == '.txt':
        return process_txt(file_path)

    # Unsupported (or missing) extension: nothing to extract.
    return []
|
|