import os

from smolagents import Tool


class SimpleTool(Tool):
    """Tool that extracts the text of every PDF file found in a folder.

    The text of each file is prefixed with a ``### File: <name>`` header and
    the per-file sections are joined with blank lines into a single string.
    """

    name = "pdf_extraction"
    description = """Reads and extracts the text from all PDF files in the given folder and returns the combined text."""
    inputs = {
        "path": {"type": "string", "description": "Folder location of PDF files", "default": "pdfs", "nullable": True}
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        """Initialise the tool and resolve the PDF reader class.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        super().__init__(*args, **kwargs)
        # Imported lazily so the module can be loaded without pypdf installed;
        # the dependency is only required once the tool is instantiated.
        try:
            from pypdf import PdfReader
        except ImportError as err:
            raise ImportError(
                "You must install package `pypdf` to run this tool: for instance, run `pip install pypdf`."
            ) from err
        self.reader_class = PdfReader

    def forward(self, path: str = "pdfs") -> str:
        """Extract and combine the text of all PDF files in ``path``.

        Args:
            path: Folder containing the PDF files. ``None`` is treated as the
                default folder ``"pdfs"`` because the input is declared
                nullable.

        Returns:
            The combined text, one ``### File: <name>`` section per PDF in
            sorted file-name order. Problems (missing folder, path that is
            not a folder, no PDFs, unreadable file) are reported as messages
            in the returned string rather than raised.
        """
        if path is None:
            path = "pdfs"

        if not os.path.exists(path):
            return f"Error: The folder '{path}' does not exist."
        # An existing regular file would make os.listdir raise; report it
        # in the same style as the other error conditions instead.
        if not os.path.isdir(path):
            return f"Error: '{path}' is not a folder."

        # Sort for a deterministic result (os.listdir order is arbitrary) and
        # match the extension case-insensitively so "*.PDF" is not skipped.
        pdf_files = sorted(
            entry for entry in os.listdir(path) if entry.lower().endswith(".pdf")
        )
        if not pdf_files:
            return f"No PDF files found in the folder '{path}'."

        sections = []
        for pdf_file in pdf_files:
            pdf_path = os.path.join(path, pdf_file)
            try:
                reader = self.reader_class(pdf_path)
                # `or ""` guards against a page yielding no text object.
                file_text = "".join(
                    page.extract_text() or "" for page in reader.pages
                )
            except Exception as e:
                # Best effort: one unreadable file must not abort the rest,
                # so the failure is recorded in that file's section.
                sections.append(f"### File: {pdf_file}\nError reading file: {e}")
            else:
                sections.append(f"### File: {pdf_file}\n{file_text.strip()}")

        return "\n\n".join(sections)