Spaces:
Sleeping
Sleeping
| # utils/file_readers.py | |
| import docx | |
| import PyPDF2 | |
def read_txt(file_path):
    """Return the entire contents of a UTF-8 encoded text file.

    Args:
        file_path: Path of the text file to open.

    Returns:
        The decoded contents of the file as a single string. A leading
        UTF-8 byte-order mark, if the file has one, is not included.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8', but also consumes
    # a leading byte-order mark instead of returning it as '\ufeff' at the
    # start of the text.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return f.read()
def read_docx(file_path):
    """Return the paragraph text of a .docx document, one paragraph per line.

    Args:
        file_path: Path passed to ``docx.Document`` to open the document.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``, in
        order, joined with newline characters.
    """
    document = docx.Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
def read_pdf(file_path):
    """Return the extractable text of a PDF file, one page after another.

    Args:
        file_path: Path of the PDF file to open (read in binary mode).

    Returns:
        The text extracted from each page, in page order, joined with
        newline characters. Pages for which extraction yields nothing are
        skipped; the result is an empty string when no page yields text.
    """
    page_texts = []
    with open(file_path, 'rb') as f:
        # Extraction happens inside the ``with`` block so the reader still
        # has an open file handle while pages are being processed.
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            # Skip pages whose extraction result is falsy (None or "").
            if page_text:
                page_texts.append(page_text)
    # Join with a newline so the last word of one page cannot fuse with the
    # first word of the next; extracted page text is not guaranteed to end
    # with a line break.
    return '\n'.join(page_texts)
def read_file(file_path):
    """Read a file's text using the reader that matches its extension.

    Dispatches to ``read_txt`` for ``.txt``, ``read_docx`` for ``.docx`` and
    ``read_pdf`` for ``.pdf``. The extension comparison ignores letter case,
    so names such as ``REPORT.PDF`` or ``Notes.Txt`` are recognised.

    Args:
        file_path: Path of the file to read, as a string.

    Returns:
        The text returned by the matching reader, or an empty string when
        the extension is not one of the supported types.
    """
    # Match on a lower-cased copy only; the original path is what gets
    # passed to the reader, since file names can be case-sensitive on disk.
    lowered = file_path.lower()
    if lowered.endswith('.txt'):
        return read_txt(file_path)
    if lowered.endswith('.docx'):
        return read_docx(file_path)
    if lowered.endswith('.pdf'):
        return read_pdf(file_path)
    # Unsupported types deliberately yield an empty string rather than raising.
    return ""