# agent-leaderboard / data_loader.py
# Pratik Bhavsar - working draft (10ad72f)
# raw | history | blame | 4.98 kB
import pandas as pd
def load_data(
    path: str = "results.csv",
    *,
    input_weight: float = 0.75,
    output_weight: float = 0.25,
) -> pd.DataFrame:
    """Load the results CSV and add a blended input/output cost column.

    Reads the CSV at *path*, drops every row that contains at least one
    missing value, and adds an ``"IO Cost"`` column computed as a weighted
    sum of the per-million-token input and output prices.

    Args:
        path: Location of the CSV to read (anything ``pandas.read_csv``
            accepts). Defaults to ``"results.csv"``, resolved against the
            current working directory, which is the file this function read
            before the parameter existed.
        input_weight: Weight applied to ``"Input cost per million token"``.
        output_weight: Weight applied to ``"Output cost per million token"``.
            The defaults (0.75 / 0.25) give the 3:1 input-to-output blend.

    Returns:
        The loaded frame, without rows containing missing values and with
        the additional ``"IO Cost"`` column.

    Raises:
        FileNotFoundError: If *path* does not exist.
        KeyError: If either cost column is absent from the CSV.
    """
    # dropna() with no arguments removes a row if ANY of its columns is
    # missing, not only the cost columns.
    df = pd.read_csv(path).dropna()

    # Add combined I/O cost column with 3:1 ratio (by default).
    df["IO Cost"] = (
        df["Input cost per million token"] * input_weight
        + df["Output cost per million token"] * output_weight
    )
    return df
# categories.py
# Category label -> list of score names grouped under that label.
# NOTE(review): the values look like column names of the results data
# (e.g. "Model Avg"); confirm against the code that consumes this mapping.
# Values are kept as lists and the key order is preserved.
CATEGORIES = {
    # Aggregate scores
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    # Function-calling scenarios
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    # Edge cases
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": [
        "tau_long_context",
        "BFCL_v3_multi_turn_long_context",
    ],
    "Missing func": [
        "xlam_tool_miss",
        "BFCL_v3_multi_turn_miss_func",
    ],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}
# Markdown text: a top-level heading, a six-row "Category | Finding |
# Implications" table, and a closing note. The string begins and ends with a
# newline because the content starts on the line after the opening quotes.
# NOTE(review): not referenced in the visible part of this file; presumably
# rendered by the leaderboard UI - confirm against the importing module.
INSIGHTS = """
# Key Insights from Agent Leaderboard
| Category | Finding | Implications |
|----------|---------|--------------|
| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
**Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
"""
# Markdown text describing how the leaderboard is evaluated: an overview, the
# tool-selection criteria, and a table of the datasets behind each category.
#
# The first table row gives "100 + 100" samples, i.e. two datasets, so it
# names both single-call datasets that CATEGORIES["Single func call"] groups
# together (the "200 + 50" row follows the same one-count-per-dataset form).
#
# NOTE(review): the table and CATEGORIES still disagree elsewhere - confirm
# which side is authoritative before changing either:
#   - in CATEGORIES but not in the table: xlam_tool_miss,
#     BFCL_v3_multi_turn_long_context
#   - in the table but not in CATEGORIES:
#     BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call
METHODOLOGY = """
# Methodology
## Overview
The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
## Tool Selection Quality Metric
Models are evaluated on their ability to:
- Correctly identify when tools are needed
- Select the appropriate tool for the task
- Handle cases where no suitable tool exists
- Maintain context across multiple interactions
## Dataset Structure
| Type | Samples | Category | Dataset Name | Purpose |
|------|---------|-----------|--------------|----------|
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call, xlam_multiple_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
"""