# File: emailresponding/utils/loaders.py
# Last commit: "first changes" (c10dac6), khalidsabri
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, UnstructuredHTMLLoader, BSHTMLLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
class DocumentLoader:
    """
    A class to handle loading of documents from a data directory.

    The directory is read with ``PyPDFDirectoryLoader``; the loader instance
    used for the most recent load attempt is kept on ``self.loader``.
    """

    def __init__(self, data_path):
        """
        Initializes the document loader with a given data path.

        :param data_path: Path to the data directory.
        """
        self.data_path = data_path
        # Populated by load_documents(); None until a load has been attempted.
        self.loader = None

    def load_documents(self) -> list:
        """
        Loads documents from the specified data path.

        :return: Non-empty list of loaded documents.
        :raises FileNotFoundError: If ``self.data_path`` does not exist.
        :raises RuntimeError: If the loader fails for any reason, or if it
            returns no documents. The underlying exception is available as
            ``__cause__``.
        """
        if not os.path.exists(self.data_path):
            raise FileNotFoundError(f"The specified path '{self.data_path}' does not exist.")

        try:
            # Assuming PDF files in a directory
            self.loader = PyPDFDirectoryLoader(self.data_path)
            documents = self.loader.load()
            if not documents:
                # Raised inside the try on purpose: callers have always seen
                # the "no documents" case as a RuntimeError, so keep wrapping it.
                raise ValueError("No documents found in the specified path.")
        except Exception as e:
            # Chain explicitly so the original failure is preserved as the cause.
            raise RuntimeError(f"Error loading documents: {e}") from e

        return documents