jarvis_gaia_agent / tools /file_parser.py
onisj's picture
feat(tools): add more tool to extend the functionaily of jarvis
751d628
import logging
import os
import pandas as pd
import PyPDF2
import speech_recognition as sr
import re
from langchain_core.tools import StructuredTool
from pydantic import BaseModel, Field
from typing import Optional
logger = logging.getLogger(__name__)
class FileParserInput(BaseModel):
task_id: str = Field(description="Task identifier")
file_type: str = Field(description="File extension (e.g., pdf, csv)")
file_path: str = Field(description="Path to the file")
query: Optional[str] = Field(description="Query related to the file", default=None)
async def file_parser_func(task_id: str, file_type: str, file_path: str, query: Optional[str] = None) -> str:
"""
Parse a file based on task_id, file_type, file_path, and query context.
Args:
task_id (str): Task identifier.
file_type (str): File extension (e.g., 'xlsx', 'mp3', 'pdf').
file_path (str): Path to the file.
query (Optional[str]): Question context to guide parsing (e.g., for specific data extraction).
Returns:
str: Parsed content or error message.
"""
try:
if not os.path.exists(file_path):
logger.warning(f"File not found: {file_path}")
return "File not found"
logger.info(f"Parsing file: {file_path} for task {task_id}")
if file_type in ["xlsx", "xls"]:
df = pd.read_excel(file_path, engine="openpyxl")
if query and ("sum" in query.lower() or "total" in query.lower()):
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
if numerical_cols.empty:
return "No numerical data found"
if "food" in query.lower():
food_rows = df[df.apply(lambda x: "food" in str(x).lower(), axis=1)]
if not food_rows.empty and numerical_cols[0] in food_rows:
total = food_rows[numerical_cols[0]].sum()
return f"{total:.2f}"
total = df[numerical_cols[0]].sum()
return f"{total:.2f}"
return df.to_string(index=False)
elif file_type == "csv":
df = pd.read_csv(file_path)
if query and ("sum" in query.lower() or "total" in query.lower()):
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
if numerical_cols.empty:
return "No numerical data found"
total = df[numerical_cols[0]].sum()
return f"{total:.2f}"
return df.to_string(index=False)
elif file_type == "pdf":
with open(file_path, "rb") as f:
reader = PyPDF2.PdfReader(f)
text = "".join(page.extract_text() or "" for page in reader.pages)
if query and "page number" in query.lower():
pages = re.findall(r'\b\d+\b', text)
return ", ".join(sorted(pages, key=int)) if pages else "No page numbers found"
return text.strip() or "No text extracted"
elif file_type == "txt":
with open(file_path, "r", encoding="utf-8") as f:
text = f.read()
if query and "page number" in query.lower():
pages = re.findall(r'\b\d+\b', text)
return ", ".join(sorted(pages, key=int)) if pages else "No page numbers found"
return text.strip()
elif file_type == "mp3":
recognizer = sr.Recognizer()
with sr.AudioFile(file_path) as source:
audio = recognizer.record(source)
try:
text = recognizer.recognize_google(audio)
logger.debug(f"Transcribed audio: {text}")
if query and "page number" in query.lower():
pages = re.findall(r'\b\d+\b', text)
return ", ".join(sorted(pages, key=int)) if pages else "No page numbers provided"
return text
except sr.UnknownValueError:
logger.error("Could not understand audio")
return "No text transcribed from audio"
except Exception as e:
logger.error(f"Audio parsing failed: {e}")
return "Error transcribing audio"
else:
logger.warning(f"Unsupported file type: {file_type}")
return f"Unsupported file type: {file_type}"
except Exception as e:
logger.error(f"Error parsing file for task {task_id}: {e}")
return f"Error: {str(e)}"
file_parser_tool = StructuredTool.from_function(
func=file_parser_func,
name="file_parser_tool",
args_schema=FileParserInput,
coroutine=file_parser_func
)