import logging
import os
import re
from typing import Optional

import pandas as pd
import PyPDF2
import speech_recognition as sr
from langchain_core.tools import StructuredTool
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)

# Column dtypes considered "numerical" when a total is requested.
_NUMERIC_DTYPES = ["float64", "int64"]

# Extensions routed to the speech-recognition branch.
# NOTE(review): sr.AudioFile only decodes WAV / AIFF / FLAC. "mp3" is kept for
# backward compatibility, but an MP3 file will fail inside AudioFile and come
# back as an "Error: ..." string unless it is converted first. Doing that
# conversion needs an extra decoder (e.g. ffmpeg), which this module does not
# depend on.
_AUDIO_TYPES = frozenset({"mp3", "wav", "flac", "aiff", "aif"})

# Stand-alone runs of digits; used for the "page number" queries.
_INTEGER_RE = re.compile(r'\b\d+\b')


# Argument schema exposed to the LLM through the tool below.
# (Documented with comments rather than a class docstring on purpose: a
# docstring on a pydantic model becomes part of the generated JSON schema.)
class FileParserInput(BaseModel):
    task_id: str = Field(description="Task identifier")
    file_type: str = Field(description="File extension (e.g., pdf, csv)")
    file_path: str = Field(description="Path to the file")
    query: Optional[str] = Field(description="Query related to the file", default=None)


def _wants_total(query: Optional[str]) -> bool:
    """Return True when the query asks for a sum/total."""
    if not query:
        return False
    lowered = query.lower()
    return "sum" in lowered or "total" in lowered


def _wants_page_numbers(query: Optional[str]) -> bool:
    """Return True when the query asks about page numbers."""
    return bool(query) and "page number" in query.lower()


def _format_page_numbers(text: str, empty_message: str) -> str:
    """Return every integer found in *text*, numerically sorted and comma-joined.

    *empty_message* is returned when the text contains no integers.
    """
    numbers = _INTEGER_RE.findall(text)
    if not numbers:
        return empty_message
    return ", ".join(sorted(numbers, key=int))


def _rows_mentioning(df: pd.DataFrame, keyword: str) -> pd.Series:
    """Return a boolean mask of rows where any cell *value* contains *keyword*.

    Matching is case-insensitive and looks at cell values only, so column
    headers and repr decoration never produce a match.
    """
    needle = keyword.lower()
    mask = pd.Series(False, index=df.index)
    for _, column in df.items():
        hits = column.astype(str).str.lower().str.contains(needle, regex=False, na=False)
        mask = mask | hits.astype(bool)
    return mask


def _parse_excel(file_path: str, file_type: str, query: Optional[str]) -> str:
    """Render an Excel sheet as text, or total its first numerical column."""
    # openpyxl only understands the OOXML formats; for legacy ".xls" let
    # pandas choose the engine instead of forcing one that cannot read it.
    engine = "openpyxl" if file_type == "xlsx" else None
    df = pd.read_excel(file_path, engine=engine)
    if not _wants_total(query):
        return df.to_string(index=False)

    numerical_cols = df.select_dtypes(include=_NUMERIC_DTYPES).columns
    if numerical_cols.empty:
        return "No numerical data found"
    target = numerical_cols[0]

    if "food" in query.lower():
        food_mask = _rows_mentioning(df, "food")
        # Fall through to the overall total when no row mentions food.
        if food_mask.any():
            total = df.loc[food_mask, target].sum()
            return f"{total:.2f}"

    total = df[target].sum()
    return f"{total:.2f}"


def _parse_csv(file_path: str, query: Optional[str]) -> str:
    """Render a CSV file as text, or total its first numerical column."""
    df = pd.read_csv(file_path)
    if not _wants_total(query):
        return df.to_string(index=False)

    numerical_cols = df.select_dtypes(include=_NUMERIC_DTYPES).columns
    if numerical_cols.empty:
        return "No numerical data found"
    total = df[numerical_cols[0]].sum()
    return f"{total:.2f}"


def _parse_pdf(file_path: str, query: Optional[str]) -> str:
    """Extract the text of every PDF page, or the integers it contains."""
    with open(file_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        # Separate pages with a newline so the last token of one page cannot
        # fuse with the first token of the next (e.g. "12" + "3" -> "123").
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
    if _wants_page_numbers(query):
        return _format_page_numbers(text, "No page numbers found")
    return text.strip() or "No text extracted"


def _parse_text(file_path: str, query: Optional[str]) -> str:
    """Return the contents of a UTF-8 text file, or the integers it contains."""
    with open(file_path, "r", encoding="utf-8") as handle:
        text = handle.read()
    if _wants_page_numbers(query):
        return _format_page_numbers(text, "No page numbers found")
    return text.strip()


def _parse_audio(file_path: str, query: Optional[str]) -> str:
    """Transcribe an audio file with Google's recogniser (network call)."""
    recognizer = sr.Recognizer()
    # A failure to open/decode the file propagates to the caller's handler.
    with sr.AudioFile(file_path) as source:
        audio = recognizer.record(source)
    try:
        text = recognizer.recognize_google(audio)
        logger.debug(f"Transcribed audio: {text}")
        if _wants_page_numbers(query):
            return _format_page_numbers(text, "No page numbers provided")
        return text
    except sr.UnknownValueError:
        logger.error("Could not understand audio")
        return "No text transcribed from audio"
    except Exception as e:
        logger.error(f"Audio parsing failed: {e}")
        return "Error transcribing audio"


# Entry point used by the tool below.
#
# Behaviour summary:
#   * A missing file returns "File not found".
#   * ``file_type`` is matched case-insensitively and a leading "." is ignored.
#   * xlsx/xls/csv: the table as text, or, when the query contains "sum" or
#     "total", the sum of the first float64/int64 column formatted with two
#     decimals. For Excel, a query containing "food" restricts the sum to
#     rows with a cell containing "food", when any such row exists.
#   * pdf/txt/audio: the extracted text, or, when the query contains
#     "page number", the comma-separated sorted integers found in it.
#   * Any unexpected exception is logged and returned as "Error: <message>";
#     the function never raises.
#
# The docstring is intentionally left as-is: LangChain uses it as the tool
# description shown to the model, so it is runtime text, not just a comment.
#
# NOTE(review): the body contains no awaits; all parsing (including the
# network call made for audio) runs synchronously inside the coroutine.
async def file_parser_func(task_id: str, file_type: str, file_path: str, query: Optional[str] = None) -> str:
    """
    Parse a file based on task_id, file_type, file_path, and query context.

    Args:
        task_id (str): Task identifier.
        file_type (str): File extension (e.g., 'xlsx', 'mp3', 'pdf').
        file_path (str): Path to the file.
        query (Optional[str]): Question context to guide parsing (e.g., for specific data extraction).

    Returns:
        str: Parsed content or error message.
    """
    try:
        if not os.path.exists(file_path):
            logger.warning(f"File not found: {file_path}")
            return "File not found"

        logger.info(f"Parsing file: {file_path} for task {task_id}")

        # Accept "PDF", ".pdf", " pdf " ...; a missing type becomes "" and
        # ends up in the unsupported branch below.
        kind = (file_type or "").strip().lower().lstrip(".")

        if kind in ("xlsx", "xls"):
            return _parse_excel(file_path, kind, query)
        if kind == "csv":
            return _parse_csv(file_path, query)
        if kind == "pdf":
            return _parse_pdf(file_path, query)
        if kind == "txt":
            return _parse_text(file_path, query)
        if kind in _AUDIO_TYPES:
            return _parse_audio(file_path, query)

        logger.warning(f"Unsupported file type: {file_type}")
        return f"Unsupported file type: {file_type}"
    except Exception as e:
        # Top-level boundary: keep the traceback in the log, but hand the
        # caller a plain string as before.
        logger.exception(f"Error parsing file for task {task_id}: {e}")
        return f"Error: {str(e)}"


# NOTE(review): ``func`` is given the coroutine function as well, so invoking
# the tool synchronously yields an un-awaited coroutine rather than a string.
# Left unchanged because callers are not visible from this file; use the async
# entry points (e.g. ``ainvoke``) with this tool.
file_parser_tool = StructuredTool.from_function(
    func=file_parser_func,
    name="file_parser_tool",
    args_schema=FileParserInput,
    coroutine=file_parser_func
)