File size: 8,430 Bytes
4fbcf68
 
 
 
 
 
 
 
 
 
 
4f3a783
 
 
4fbcf68
 
 
 
 
 
 
 
 
 
 
 
 
 
bdba660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4fbcf68
 
 
 
 
 
 
 
 
 
 
 
 
bdba660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4fbcf68
 
 
 
 
 
 
 
 
 
 
 
 
 
48e6960
4fbcf68
 
d3c4ed6
 
da0456f
d3c4ed6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48e6960
d3c4ed6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9a9af1
ea86f3d
962369b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4fbcf68
 
 
 
 
 
 
48e6960
4fbcf68
 
 
 
 
 
 
48e6960
4fbcf68
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219

import os
from typing import Dict, List, Any
from pydantic_ai import Agent
from pydantic_ai.models.gemini import GeminiModel
from pydantic_ai.providers.google_gla import GoogleGLAProvider
from pydantic_ai import RunContext
from pydantic import BaseModel
from google.api_core.exceptions import ResourceExhausted  # Import the exception for quota exhaustion
from csv_service import get_csv_basic_info
from orchestrator_functions import csv_chart, csv_chat
from dotenv import load_dotenv

load_dotenv()


# Load all API keys from the environment variable
GEMINI_API_KEYS = os.getenv("GEMINI_API_KEYS", "").split(",")  # Expecting a comma-separated list of keys

# Function to initialize the model with a specific API key
def initialize_model(api_key: str) -> GeminiModel:
    return GeminiModel(
        'gemini-2.0-flash',
        provider=GoogleGLAProvider(api_key=api_key)
    )

# Define the tools
async def generate_csv_answer(csv_url: str, user_questions: List[str]) -> Any:
    """
    This function generates answers for the given user questions using the CSV URL.
    It uses the csv_chat function to process each question and return the answers.
    
    Args:
        csv_url (str): The URL of the CSV file.
        user_questions (List[str]): A list of user questions.
        
    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the question and answer for each question.
        
    Example:
        [
            {"question": "What is the average age of the customers?", "answer": "The average age is 35."},
            {"question": "What is the most common gender?", "answer": "The most common gender is Male."}
        ]
    """
    
    print("LLM using the csv chat function....")
    print("CSV URL:", csv_url)
    print("User question:", user_questions)

    # Create an array to accumulate the answers
    answers = []
    # Loop through the user questions and generate answers for each
    for question in user_questions:
        answer = await csv_chat(csv_url, question)
        answers.append(dict(question=question, answer=answer))
    return answers

async def generate_chart(csv_url: str, user_questions: List[str]) -> Any:
    
    """
    This function generates charts for the given user questions using the CSV URL.
    It uses the csv_chart function to process each question and return the chart URLs.
    It returns a list of dictionaries containing the question and chart URL for each question.
    Args:
        csv_url (str): The URL of the CSV file.
        user_questions (List[str]): A list of user questions.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the question and chart URL for each question.

    Example:
        [
            {"question": "What is the average age of the customers?", "chart_url": "https://example.com/chart1.png"},
            {"question": "What is the most common gender?", "chart_url": "https://example.com/chart2.png"}
        ]
    """
    
    print("LLM using the csv chart function....")
    print("CSV URL:", csv_url)
    print("User question:", user_questions)

    # Create an array to accumulate the charts
    charts = []
    # Loop through the user questions and generate charts for each
    for question in user_questions:
        chart = await csv_chart(csv_url, question)
        charts.append(dict(question=question, image_url=chart))
    
    return charts

# Function to create an agent with a specific CSV URL
def create_agent(csv_url: str, api_key: str, conversation_history: List) -> Agent:
    csv_metadata = get_csv_basic_info(csv_url)
    
    system_prompt = f"""
# Role: Expert Data Analysis Assistant
# Personality & Origin: You are exclusively the CSV Document Analysis Assistant, created by the chatcsvandpdf team. Your sole purpose is to assist users with CSV-related tasks—analyzing, interpreting, and processing data.

## Capabilities:
- Break complex queries into simpler sub-tasks

## Instruction Framework:
1. QUERY PROCESSING:
   - If request contains multiple questions:
     a) Decompose into logical sub-questions
     b) Process sequentially
     c) Combine results coherently

2. DATA HANDLING:
   - Always verify CSV structure matches the request
   - Handle missing/ambiguous data by:
     a) Asking clarifying questions OR
     b) Making reasonable assumptions (state them clearly)

3. VISUALIZATION STANDARDS:
   - Format images as: `![Description](direct-url)`
   - Include axis labels and titles
   - Use appropriate chart types

4. COMMUNICATION PROTOCOL:
   - Friendly, professional tone
   - Explain technical terms
   - Summarize key findings
   - Highlight limitations/caveats

5. TOOL USAGE:
   - Can process statistical operations
   - Supports visualization libraries

## Current Context:
- Working with CSV_URL: {csv_url}
- Dataset overview: {csv_metadata}
- Your conversation history: {conversation_history}
- Output format: Markdown compatible

## Response Template:
1. Confirm understanding of request
2. Outline analysis approach
3. Present results with visualizations (if applicable)
4. Provide interpretation
5. Offer next-step suggestions
"""
    
    # system_prompt = (
    # "You are a data analyst. "
    # "You have all the tools you need to answer any question. "
    # "If the user asks for multiple answers or charts, break the question into several well-defined questions. "
    # "Pass the CSV URL or file path along with the questions to the tools to generate the answer. "
    # "The tools are actually LLMs with Python code execution capabilities. "
    # "Modify the query if needed to make it simpler for the LLM to understand. "
    # "Answer in a friendly and helpful manner. "
    # "**Format images** in Markdown: `![alt_text](direct-image-url)`. "
    # f"Your CSV URL is {csv_url}. "
    # f"Your CSV metadata is {csv_metadata}."
    # )

    
    # system_prompt = (
    #     "You are a data analyst assistant with limited tool capabilities. "
    #     "Available tools can only handle simple data queries: "
    #     "- Count rows/columns\n- Calculate basic stats (avg, sum, min/max)\n"
    #     "- Create simple visualizations (pie charts, bar graphs)\n"
    #     "- Show column names/types\n\n"
        
    #     "Query Handling Rules:\n"
    #     "1. If query is complex, ambiguous, or exceeds tool capabilities:\n"
    #     "   - Break into simpler sub-questions\n"
    #     "   - Ask for clarification\n"
    #     "   - Rephrase to nearest simple query\n"
    #     "2. For 'full report' requests:\n"
    #     "   - Outline possible analysis steps\n"
    #     "   - Ask user to select one component at a time\n\n"
        
    #     "Examples:\n"
    #     "- Bad query: 'Show me everything'\n"
    #     "  Response: 'I can show row count (10), columns (5: Name, Age...), "
    #     "or a pie chart of categories. Which would you like?'\n"
    #     "- Bad query: 'Analyze trends'\n"
    #     "  Response: 'For trend analysis, I can show monthly averages or "
    #     "year-over-year comparisons. Please specify time period and metric.'\n\n"
        
    #     "Current CSV Context:\n"
    #     f"- URL: {csv_url}\n"
    #     f"- Metadata: {csv_metadata}\n\n"
        
    #     "Always format images as: ![Chart Description](direct-image-url)"
    # )
    
    return Agent(
        model=initialize_model(api_key),
        deps_type=str,
        tools=[generate_csv_answer, generate_chart],
        system_prompt=system_prompt
    )

def csv_orchestrator_chat(csv_url: str, user_question: str, conversation_history: List) -> str:
    print("CSV URL:", csv_url)
    print("User questions:", user_question)

    # Iterate through all API keys
    for api_key in GEMINI_API_KEYS:
        try:
            print(f"Attempting with API key: {api_key}")
            agent = create_agent(csv_url, api_key, conversation_history)
            result = agent.run_sync(user_question)
            print("Orchestrator Result:", result.data)
            return result.data
        except ResourceExhausted or Exception as e:
            print(f"Quota exhausted for API key: {api_key}. Switching to the next key.")
            continue  # Move to the next key
        except Exception as e:
            print(f"Error with API key {api_key}: {e}")
            continue  # Move to the next key

    # If all keys are exhausted or fail
    print("All API keys have been exhausted or failed.")
    return None