# matterattetatte's picture
# Update tool.py
# 9862511 verified
from smolagents import Tool
class SimpleTool(Tool):
    """Tool that extracts the text of every PDF file found in a folder.

    The reader class is resolved once in ``__init__`` (so a missing ``pypdf``
    install fails early) and stored on ``self.reader_class``; ``forward``
    instantiates it once per PDF file.
    """

    name = "pdf_extraction"
    description = """Reads and extracts the text from all PDF files in the given folder and returns the combined text."""
    inputs = {
        "path": {
            "type": "string",
            "description": "Folder location of PDF files",
            "default": "pdfs",
            "nullable": True,
        }
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        """Initialise the tool and bind the ``pypdf`` reader class.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        super().__init__(*args, **kwargs)
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "You must install package `pypdf` to run this tool: for instance, run `pip install pypdf`."
            ) from exc
        self.reader_class = PdfReader

    def forward(self, path: str = "pdfs") -> str:
        """Extract and combine the text of all PDF files directly inside ``path``.

        Args:
            path: Folder to scan (not recursive). ``None`` is treated as the
                default ``"pdfs"`` because the input schema marks it nullable.

        Returns:
            One ``### File: <name>`` section per PDF, in sorted filename
            order, separated by blank lines. A file that cannot be read
            contributes an ``Error reading file: ...`` section instead of
            aborting the whole run. If the folder is missing, is not a
            folder, or holds no PDFs, a short explanatory message is returned.
        """
        # Imported locally: the module does not import `os` at the top, and
        # keeping imports inside the method keeps the tool self-contained.
        import os

        if path is None:
            path = "pdfs"

        # Ensure the folder exists
        if not os.path.exists(path):
            return f"Error: The folder '{path}' does not exist."
        # A regular file would otherwise make os.listdir raise.
        if not os.path.isdir(path):
            return f"Error: '{path}' is not a folder."

        # Find all PDF files in the folder; sorted so the output is
        # deterministic, case-insensitive so ".PDF" files are not skipped.
        pdf_files = sorted(
            entry for entry in os.listdir(path) if entry.lower().endswith(".pdf")
        )
        if not pdf_files:
            return f"No PDF files found in the folder '{path}'."

        combined_text = []
        # Iterate over each PDF file and extract its text
        for pdf_file in pdf_files:
            pdf_path = os.path.join(path, pdf_file)
            try:
                reader = self.reader_class(pdf_path)
                # `or ""` tolerates a page whose extraction yields nothing.
                file_text = "".join(
                    page.extract_text() or "" for page in reader.pages
                )
                combined_text.append(f"### File: {pdf_file}\n{file_text.strip()}")
            except Exception as e:
                # Deliberate best-effort: report the failure for this file
                # and continue with the remaining ones.
                combined_text.append(
                    f"### File: {pdf_file}\nError reading file: {str(e)}"
                )

        # Return all combined results
        return "\n\n".join(combined_text)