import os

# The two parser libraries are imported defensively so that a deployment which
# only handles one of the formats can still import this module.  A missing
# library is reported with a clear ImportError at the point of use instead.
try:
    from docx import Document
except ImportError:
    Document = None

try:
    import PyPDF2
except ImportError:
    PyPDF2 = None


class ChatbotDataLoader:
    """Extract plain text from .docx and .pdf files.

    Files can be loaded one at a time with ``load_file`` or in bulk with
    ``load_directory``.  The loader keeps no state between calls.
    """

    # Single source of truth for the extensions the loader understands.
    # Matching against it is always done on a lower-cased name.
    SUPPORTED_EXTENSIONS = (".docx", ".pdf")

    def __init__(self):
        """Create a loader; there is nothing to configure."""

    def read_docx(self, file_path):
        """Return the text of a .docx file, one paragraph per line.

        Only ``doc.paragraphs`` is read.
        NOTE(review): text held elsewhere in the document (e.g. tables) is
        presumably not included -- verify against the python-docx docs.

        Args:
            file_path: Path of the .docx file to open.

        Returns:
            The paragraph texts joined with ``"\\n"``.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if Document is None:
            raise ImportError("python-docx is required to read .docx files")
        doc = Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)

    def read_pdf(self, file_path):
        """Return the text of every page of a .pdf file, concatenated.

        Args:
            file_path: Path of the .pdf file to open.

        Returns:
            The extracted page texts joined with no separator.

        Raises:
            ImportError: If PyPDF2 is not installed.
        """
        if PyPDF2 is None:
            raise ImportError("PyPDF2 is required to read .pdf files")
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # Extraction happens while the file is still open.  ``or ""``
            # keeps the join safe if a page yields no text.
            # NOTE(review): some PyPDF2 releases may return None for such
            # pages -- confirm for the pinned version.
            return "".join(page.extract_text() or "" for page in reader.pages)

    def load_file(self, file_path):
        """Read a .docx or .pdf file, chosen by its file extension.

        The extension is matched case-insensitively, so ``REPORT.PDF`` is
        handled like ``report.pdf``.

        Args:
            file_path: Path of the file, as a ``str`` or ``os.PathLike``.

        Returns:
            The extracted text of the file.

        Raises:
            ValueError: If the extension is neither .docx nor .pdf.
        """
        # fspath lets pathlib.Path objects through; str input is unchanged.
        lowered = os.fspath(file_path).lower()
        if lowered.endswith(".docx"):
            return self.read_docx(file_path)
        if lowered.endswith(".pdf"):
            return self.read_pdf(file_path)
        raise ValueError(f"Unsupported file type: {file_path}")

    def load_directory(self, dir_path):
        """Load every .docx and .pdf file found under a directory tree.

        The tree is walked recursively with ``os.walk``.  A file that cannot
        be read is reported on stdout and skipped, so one bad document does
        not abort the whole load.

        Args:
            dir_path: Root directory to walk.

        Returns:
            A dict mapping each successfully loaded file's path to its text.

        Raises:
            ImportError: If the parser library for an encountered file type
                is not installed.
        """
        file_contents = {}
        for root, _, files in os.walk(dir_path):
            for name in files:
                if not name.lower().endswith(self.SUPPORTED_EXTENSIONS):
                    continue
                file_path = os.path.join(root, name)
                try:
                    file_contents[file_path] = self.load_file(file_path)
                except ImportError:
                    # A missing parser library is a setup problem, not a bad
                    # file; surface it instead of printing it once per file.
                    raise
                except Exception as e:
                    print(f"Failed to load {file_path}: {str(e)}")
        return file_contents