Spaces:

galileo-ai
/

agent-leaderboard

Running on CPU Upgrade

agent-leaderboard / data_loader.py

Pratik Bhavsar

working draft

10ad72f 6 months ago

4.98 kB

	import pandas as pd


	def load_data(path: str = "results.csv") -> pd.DataFrame:
	    """Load the results CSV and add a blended input/output cost column.

	    Reads the CSV at ``path``, drops every row that contains at least one
	    missing value, and adds an ``"IO Cost"`` column computed as
	    ``0.75 * input cost + 0.25 * output cost`` (a 3:1 input-to-output
	    weighting).

	    Args:
	        path: Location of the results CSV. Defaults to ``"results.csv"``
	            (relative to the current working directory), which is the file
	            this function read before the parameter was introduced.

	    Returns:
	        The cleaned DataFrame, including the new ``"IO Cost"`` column.

	    Raises:
	        FileNotFoundError: If ``path`` does not exist.
	        KeyError: If either cost column is absent from the CSV.
	    """
	    # 3:1 ratio between input and output cost, normalised to sum to 1.
	    input_weight, output_weight = 0.75, 0.25

	    # dropna() returns a new frame, so adding a column below is safe.
	    df = pd.read_csv(path).dropna()

	    df["IO Cost"] = (
	        df["Input cost per million token"] * input_weight
	        + df["Output cost per million token"] * output_weight
	    )
	    return df


	# categories.py
	# Category label -> list of score names grouped under that label.
	# NOTE(review): the names look like results.csv column headers; confirm
	# against the code that consumes this mapping. Insertion order is kept
	# exactly as before in case consumers rely on it.
	CATEGORIES = {
	    # Aggregate scores
	    "Overall": ["Model Avg"],
	    "Overall single turn": ["single turn perf"],
	    "Overall multi turn": ["multi turn perf"],
	    # Function-calling groups
	    "Single func call": [
	        "xlam_single_tool_single_call",
	        "xlam_multiple_tool_single_call",
	    ],
	    "Multiple func call": [
	        "xlam_multiple_tool_multiple_call",
	        "xlam_single_tool_multiple_call",
	        "BFCL_v3_multi_turn_base_multi_func_call",
	    ],
	    # Remaining groups
	    "Irrelevant query": [
	        "BFCL_v3_irrelevance",
	    ],
	    "Long context": [
	        "tau_long_context",
	        "BFCL_v3_multi_turn_long_context",
	    ],
	    "Missing func": [
	        "xlam_tool_miss",
	        "BFCL_v3_multi_turn_miss_func",
	    ],
	    "Missing params": [
	        "BFCL_v3_multi_turn_miss_param",
	    ],
	    "Composite": [
	        "BFCL_v3_multi_turn_composite",
	    ],
	}

	# Markdown text summarising the key findings as a three-column table.
	# Table pipes are written as plain "|": a backslash before the pipe is not
	# a valid Python escape sequence (it triggers an invalid-escape warning)
	# and the backslash would stay in the string, turning the table cells into
	# literal pipe characters when the Markdown is rendered.
	INSIGHTS = """
	# Key Insights from Agent Leaderboard

	| Category | Finding | Implications |
	|----------|---------|--------------|
	| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
	| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
	| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
	| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
	| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
	| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |

	Note: Findings based on comprehensive evaluation across multiple tasks and scenarios.
	"""

	# Markdown text describing the evaluation approach, the tool-selection
	# metric, and the datasets (as a five-column table).
	# Table pipes are written as plain "|": a backslash before the pipe is not
	# a valid Python escape sequence (it triggers an invalid-escape warning)
	# and the backslash would stay in the string, breaking the rendered table.
	METHODOLOGY = """
	# Methodology

	## Overview
	The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
	The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.

	## Tool Selection Quality Metric
	Models are evaluated on their ability to:
	- Correctly identify when tools are needed
	- Select the appropriate tool for the task
	- Handle cases where no suitable tool exists
	- Maintain context across multiple interactions

	## Dataset Structure
	| Type | Samples | Category | Dataset Name | Purpose |
	|------|---------|-----------|--------------|----------|
	| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
	| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
	| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
	| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
	| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
	| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
	| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
	| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
	| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
	"""