File size: 1,831 Bytes
6e4eef0
 
 
e13ea1d
 
 
 
 
9862511
6e4eef0
e13ea1d
 
 
 
 
 
 
 
 
0c5dccc
e13ea1d
 
 
 
042710c
e13ea1d
 
 
 
042710c
e13ea1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from smolagents import Tool

class SimpleTool(Tool):
    """Tool that extracts the text of every PDF file found in a folder.

    The class attributes below (``name``, ``description``, ``inputs``,
    ``output_type``) are the tool metadata declared for the ``Tool`` base
    class. ``forward`` does the actual work and always returns a string:
    either the combined text or a human-readable error/status message.
    """

    name = "pdf_extraction"
    description = """Reads and extracts the text from all PDF files in the given folder and returns the combined text."""
    inputs = {
        "path": { "type": "string", "description": "Folder location of PDF files", "default": "pdfs", "nullable": True }
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        """Initialise the base tool and resolve the ``pypdf`` reader class.

        Raises:
            ImportError: If the ``pypdf`` package is not installed.
        """
        super().__init__(*args, **kwargs)
        # Imported lazily so the dependency is only required when the tool
        # is actually instantiated.
        try:
            from pypdf import PdfReader
        except ImportError as err:
            raise ImportError(
                "You must install package `pypdf` to run this tool: for instance, run `pip install pypdf`."
            ) from err
        self.reader_class = PdfReader

    def forward(self, path: str = "pdfs") -> str:
        """Extract and combine the text of all PDF files in ``path``.

        Args:
            path: Folder containing the PDF files. ``None`` is accepted
                (the input is declared nullable) and falls back to
                ``"pdfs"``.

        Returns:
            One section per PDF, each starting with ``### File: <name>``
            and separated by blank lines. Files are processed in sorted
            name order. If a file cannot be read, its section contains the
            error message instead of text. If the folder is missing or
            holds no PDFs, a short explanatory message is returned.
        """
        # Local import: `os` is not imported at file level, and keeping it
        # here makes the method self-contained.
        import os

        # The input schema marks `path` as nullable, so honour the default.
        if path is None:
            path = "pdfs"

        # `isdir` (rather than `exists`) so that a regular file passed as
        # `path` is reported cleanly instead of crashing in `os.listdir`.
        if not os.path.isdir(path):
            return f"Error: The folder '{path}' does not exist."

        # Case-insensitive extension match; sorted because `os.listdir`
        # returns entries in arbitrary order.
        pdf_files = sorted(
            entry for entry in os.listdir(path) if entry.lower().endswith(".pdf")
        )
        if not pdf_files:
            return f"No PDF files found in the folder '{path}'."

        sections = []
        for pdf_file in pdf_files:
            pdf_path = os.path.join(path, pdf_file)
            try:
                reader = self.reader_class(pdf_path)
                # `or ""` guards against a page yielding no text object.
                file_text = "".join(
                    page.extract_text() or "" for page in reader.pages
                )
            except Exception as e:
                # Best effort: one unreadable file must not abort the rest,
                # so the failure is reported inline in that file's section.
                sections.append(f"### File: {pdf_file}\nError reading file: {e}")
            else:
                sections.append(f"### File: {pdf_file}\n{file_text.strip()}")

        # Return all combined results
        return "\n\n".join(sections)