translate_tl / utils /file_readers.py
asasasaasasa's picture
init
da8d2e4
raw
history blame
875 Bytes
# utils/file_readers.py
import docx
import PyPDF2
def read_txt(file_path):
    """Return the full contents of a UTF-8 text file as a string.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order
    mark (written by some Windows editors) is dropped instead of leaking
    into the returned text as ``'\\ufeff'``. Files without a BOM decode
    exactly as they would with plain ``utf-8``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return f.read()
def read_docx(file_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Only the entries of ``Document.paragraphs`` are read; their ``text``
    values are joined with newline characters in document order.

    Args:
        file_path: Path of the .docx file, passed to ``docx.Document``.

    Returns:
        A single string with each paragraph's text separated by ``'\\n'``
        (an empty string when there are no paragraphs).
    """
    document = docx.Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
def read_pdf(file_path):
    """Return the extracted text of every page of a PDF, concatenated.

    Each page's ``extract_text()`` result is appended in page order with
    no separator between pages; pages that yield an empty or ``None``
    result are skipped.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The concatenated page text, or an empty string when no page
        yields any text.
    """
    with open(file_path, 'rb') as pdf_stream:
        pdf = PyPDF2.PdfReader(pdf_stream)
        # The generator is consumed inside the ``with`` so every page is
        # extracted while the underlying file is still open.
        extracted = (page.extract_text() for page in pdf.pages)
        return ''.join(chunk for chunk in extracted if chunk)
def read_file(file_path):
    """Read a file with the reader matching its extension.

    Dispatches on the path's suffix: ``.txt`` -> ``read_txt``,
    ``.docx`` -> ``read_docx``, ``.pdf`` -> ``read_pdf``. Matching is
    case-insensitive, so ``REPORT.PDF`` is handled like ``report.pdf``,
    and ``os.PathLike`` objects such as ``pathlib.Path`` are accepted as
    well as plain strings.

    Args:
        file_path: Path of the file to read (``str`` or ``os.PathLike``).

    Returns:
        The text returned by the selected reader, or an empty string when
        the extension is not one of the supported types.
    """
    import os

    # Normalise only for the extension check; the readers receive the
    # caller's original path object untouched.
    suffix_probe = os.fspath(file_path).lower()

    if suffix_probe.endswith('.txt'):
        return read_txt(file_path)
    if suffix_probe.endswith('.docx'):
        return read_docx(file_path)
    if suffix_probe.endswith('.pdf'):
        return read_pdf(file_path)
    # Unsupported types keep the original best-effort contract.
    return ""