Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import pandas as pd | |
| from glob import glob | |
| import numpy as np | |
| from pathlib import Path | |
# Names of the available datasets: one entry per ``datasets/*.parquet`` file,
# reduced to the bare file stem. Order is whatever ``glob`` yields.
DATASETS = [Path(parquet_file).stem for parquet_file in glob("datasets/*.parquet")]

# Score steps 0.0, 0.1, ..., 1.0; rounding to two decimals removes the float
# noise that ``np.arange`` accumulates (e.g. 0.30000000000000004).
SCORES = [round(float(step), 2) for step in np.arange(0, 1.1, 0.1)]
def load_data(path="results.csv", input_weight=0.75):
    """Load the results table and add a blended "IO Cost" column.

    Rows containing any missing value are dropped before the cost is computed.

    Args:
        path: CSV file to read. Defaults to ``results.csv`` relative to the
            current working directory.
        input_weight: Share of the blended cost attributed to input tokens,
            in ``[0, 1]``. Output tokens receive ``1 - input_weight``. The
            default of 0.75 is the 3:1 input-to-output ratio.

    Returns:
        A DataFrame holding the complete rows of the CSV plus an "IO Cost"
        column.

    Raises:
        ValueError: If ``input_weight`` lies outside ``[0, 1]``.
    """
    if not 0 <= input_weight <= 1:
        raise ValueError(f"input_weight must be within [0, 1], got {input_weight!r}")

    # dropna() returns a new frame, so adding a column below does not touch a view.
    df = pd.read_csv(path).dropna()

    # Blend the per-million-token prices into a single comparable cost figure.
    df["IO Cost"] = (
        df["Input cost per million token"] * input_weight
        + df["Output cost per million token"] * (1 - input_weight)
    )
    return df
| # categories.py | |
# Maps each leaderboard category label to the list of score keys grouped under it.
# NOTE(review): the values look like column names of the results table
# ("Model Avg", per-dataset names) - confirm against the code that reads this dict.
| CATEGORIES = { | |
| "Overall": ["Model Avg"], | |
| "Overall single turn": ["single turn perf"], | |
| "Overall multi turn": ["multi turn perf"], | |
| "Single func call": [ | |
| "xlam_single_tool_single_call", | |
| "xlam_multiple_tool_single_call", | |
| ], | |
| "Multiple func call": [ | |
| "xlam_multiple_tool_multiple_call", | |
| "xlam_single_tool_multiple_call", | |
| "BFCL_v3_multi_turn_base_multi_func_call", | |
| ], | |
| "Irrelevant query": ["BFCL_v3_irrelevance"], | |
| "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"], | |
| "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"], | |
| "Missing params": ["BFCL_v3_multi_turn_miss_param"], | |
| "Composite": ["BFCL_v3_multi_turn_composite"], | |
| } | |
# Markdown for the methodology text: overview, the tool selection quality
# metric, and a table of the evaluation datasets.
# The "Single Function Call" single-turn row lists both datasets behind its
# "100 + 100" sample count, matching CATEGORIES["Single func call"].
METHODOLOGY = """# Methodology
## Overview
The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
## Tool Selection Quality Metric
Models are evaluated on their ability to:
- Correctly identify when tools are needed
- Select the appropriate tool for the task
- Handle cases where no suitable tool exists
- Maintain context across multiple interactions
## Dataset Structure
| Type | Samples | Category | Dataset Name | Purpose |
|------|---------|-----------|--------------|----------|
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call, xlam_multiple_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
"""
# Markdown for the insights text: a heading, a three-column table
# (Category / Finding / Implications) and a closing note.
# NOTE(review): in this view no blank line separates the last table row from
# the "**Note:**" line; some Markdown renderers would fold it into the table -
# confirm the real file has a blank line there.
| INSIGHTS = """ | |
| # Key Insights from Agent Leaderboard | |
| | Category | Finding | Implications | | |
| |----------|---------|--------------| | |
| | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing | | |
| | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end | | |
| | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption | | |
| | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement | | |
| | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions | | |
| | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases | | |
| **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios. | |
| """ | |
# Stylesheet (CSS text) for the chat view: flex container, chat panel and
# per-role message bubbles, metrics and tool-info panels, WebKit scrollbar,
# and title / h3 heading rules.
# NOTE(review): the ::-webkit-scrollbar and h3 rules are unscoped, so they apply
# wherever this stylesheet is loaded - confirm that is intended.
| chat_css = """ | |
| /* Container styles */ | |
| .container { | |
| display: flex; | |
| gap: 1.5rem; | |
| height: calc(100vh - 100px); | |
| padding: 1rem; | |
| } | |
| /* Chat panel styles */ | |
| .chat-panel { | |
| flex: 2; | |
| background: #1a1f2c; | |
| border-radius: 1rem; | |
| padding: 1rem; | |
| overflow-y: auto; | |
| max-height: calc(100vh - 120px); | |
| } | |
| /* Message styles */ | |
| .message { | |
| padding: 1.2rem; | |
| margin: 0.8rem; | |
| border-radius: 1rem; | |
| font-family: monospace; | |
| box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); | |
| } | |
| .system { | |
| background: linear-gradient(135deg, #8e44ad, #9b59b6); | |
| } | |
| .user { | |
| background: linear-gradient(135deg, #2c3e50, #3498db); | |
| margin-left: 2rem; | |
| } | |
| .assistant { | |
| background: linear-gradient(135deg, #27ae60, #2ecc71); | |
| margin-right: 2rem; | |
| } | |
| .role-badge { | |
| display: inline-block; | |
| padding: 0.3rem 0.8rem; | |
| border-radius: 0.5rem; | |
| font-weight: bold; | |
| margin-bottom: 0.8rem; | |
| font-size: 0.9rem; | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| } | |
| .system-role { | |
| background-color: #8e44ad; | |
| color: white; | |
| } | |
| .user-role { | |
| background-color: #3498db; | |
| color: white; | |
| } | |
| .assistant-role { | |
| background-color: #27ae60; | |
| color: white; | |
| } | |
| .content { | |
| white-space: pre-wrap; | |
| word-break: break-word; | |
| color: #f5f6fa; | |
| line-height: 1.5; | |
| } | |
| /* Metrics panel styles */ | |
| .metrics-panel { | |
| flex: 1; | |
| display: flex; | |
| flex-direction: column; | |
| gap: 2rem; | |
| padding: 1.5rem; | |
| background: #1a1f2c; | |
| border-radius: 1rem; | |
| } | |
| .metric-section { | |
| background: #1E293B; | |
| padding: 1.5rem; | |
| border-radius: 1rem; | |
| } | |
| .score-section { | |
| text-align: center; | |
| } | |
| .score-display { | |
| font-size: 3rem; | |
| font-weight: bold; | |
| color: #4ADE80; | |
| line-height: 1; | |
| margin: 0.5rem 0; | |
| } | |
| .explanation-text { | |
| color: #E2E8F0; | |
| line-height: 1.6; | |
| font-size: 0.95rem; | |
| } | |
| /* Tool info panel styles */ | |
| .tool-info-panel { | |
| background: #1a1f2c; | |
| padding: 1.5rem; | |
| border-radius: 1rem; | |
| color: #f5f6fa; | |
| } | |
| .tool-section { | |
| margin-bottom: 1.5rem; | |
| } | |
| .tool-name { | |
| font-size: 1.2rem; | |
| color: #4ADE80; | |
| font-weight: bold; | |
| margin-bottom: 0.5rem; | |
| } | |
| .tool-description { | |
| color: #E2E8F0; | |
| line-height: 1.6; | |
| margin-bottom: 1rem; | |
| } | |
| .tool-parameters .parameter { | |
| margin: 0.5rem 0; | |
| padding: 0.5rem; | |
| background: rgba(255, 255, 255, 0.05); | |
| border-radius: 0.5rem; | |
| } | |
| .param-name { | |
| color: #63B3ED; | |
| font-weight: bold; | |
| margin-right: 0.5rem; | |
| } | |
| .tool-examples .example { | |
| margin: 0.5rem 0; | |
| padding: 0.5rem; | |
| background: rgba(255, 255, 255, 0.05); | |
| border-radius: 0.5rem; | |
| font-family: monospace; | |
| } | |
| /* Custom scrollbar */ | |
| ::-webkit-scrollbar { | |
| width: 8px; | |
| } | |
| ::-webkit-scrollbar-track { | |
| background: rgba(255, 255, 255, 0.1); | |
| border-radius: 4px; | |
| } | |
| ::-webkit-scrollbar-thumb { | |
| background: linear-gradient(45deg, #3498db, #2ecc71); | |
| border-radius: 4px; | |
| } | |
| /* Title styles */ | |
| .title { | |
| color: #63B3ED; | |
| font-size: 2rem; | |
| font-weight: bold; | |
| text-align: center; | |
| margin-bottom: 1.5rem; | |
| padding: 1rem; | |
| } | |
| /* Headers */ | |
| h3 { | |
| color: #63B3ED; | |
| margin: 0 0 1rem 0; | |
| font-size: 1.1rem; | |
| font-weight: 500; | |
| letter-spacing: 0.05em; | |
| } | |
| """ | |
| # Updated header and cards with theme awareness | |
# HTML fragment: a <style> block defining light/dark CSS variables (selected via
# prefers-color-scheme) plus the header and feature-card classes, followed by
# the page title, subtitle and three action links (Blog, GitHub, Dataset).
# NOTE(review): <div class="header-wrapper"> is opened here but never closed in
# this string, while CARDS ends with one surplus </div> - presumably the two
# are concatenated; verify against the caller. All three links use href="#".
| HEADER_CONTENT = """ | |
| <style> | |
| @media (prefers-color-scheme: dark) { | |
| :root { | |
| --bg-primary: rgb(17, 17, 27); | |
| --bg-secondary: rgba(30, 30, 45, 0.95); | |
| --bg-hover: rgba(40, 40, 55, 0.95); | |
| --text-primary: #ffffff; | |
| --text-secondary: #94a3b8; | |
| --text-tertiary: #e2e8f0; | |
| --border-color: rgba(255, 255, 255, 0.1); | |
| --border-hover: rgba(255, 255, 255, 0.2); | |
| --card-bg: rgba(17, 17, 27, 0.6); | |
| --accent-color: #4F46E5; | |
| --accent-bg: rgba(79, 70, 229, 0.1); | |
| } | |
| } | |
| @media (prefers-color-scheme: light) { | |
| :root { | |
| --bg-primary: rgb(255, 255, 255); | |
| --bg-secondary: rgba(243, 244, 246, 0.95); | |
| --bg-hover: rgba(229, 231, 235, 0.95); | |
| --text-primary: #000000; | |
| --text-secondary: #4b5563; | |
| --text-tertiary: #1f2937; | |
| --border-color: rgba(0, 0, 0, 0.1); | |
| --border-hover: rgba(0, 0, 0, 0.2); | |
| --card-bg: rgba(249, 250, 251, 0.6); | |
| --accent-color: #4F46E5; | |
| --accent-bg: rgba(79, 70, 229, 0.1); | |
| } | |
| } | |
| .header-wrapper { | |
| padding: 3rem 2rem; | |
| background: var(--bg-primary); | |
| border-radius: 16px; | |
| display: flex; | |
| flex-direction: column; | |
| align-items: center; | |
| text-align: center; | |
| } | |
| .header-wrapper a { | |
| color: var(--text-primary) !important; | |
| text-decoration: none !important; | |
| } | |
| .description { | |
| color: var(--text-primary); | |
| font-size: 1.1rem; | |
| line-height: 1.6; | |
| max-width: 800px; | |
| margin: 0 auto 2rem; | |
| text-align: center; | |
| } | |
| .actions { | |
| display: flex; | |
| gap: 1rem; | |
| justify-content: center; | |
| margin-bottom: 2rem; | |
| color: var(--text-primary); | |
| } | |
| .action-button { | |
| display: flex; | |
| align-items: center; | |
| gap: 0.5rem; | |
| padding: 0.75rem 1.5rem; | |
| background: var(--bg-secondary); | |
| border: 1px solid var(--border-color); | |
| border-radius: 100px; | |
| color: var(--text-primary) !important; | |
| text-decoration: none !important; | |
| font-size: 0.95rem; | |
| transition: all 0.2s ease; | |
| } | |
| .action-button:hover { | |
| background: var(--bg-hover); | |
| border-color: var(--border-hover); | |
| color: var(--text-primary) !important; | |
| } | |
| .update-info { | |
| color: var(--text-secondary); | |
| font-size: 0.9rem; | |
| margin-bottom: 3rem; | |
| } | |
| .features-grid { | |
| display: grid; | |
| grid-template-columns: repeat(3, 1fr); | |
| gap: 1.5rem; | |
| width: 100%; | |
| max-width: 1200px; | |
| } | |
| .feature-card { | |
| background: var(--card-bg); | |
| border: 1px solid var(--border-color); | |
| border-radius: 16px; | |
| padding: 2rem; | |
| text-align: left; | |
| } | |
| .feature-icon { | |
| background: var(--accent-bg); | |
| width: 40px; | |
| height: 40px; | |
| border-radius: 12px; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| margin-bottom: 1.5rem; | |
| } | |
| .feature-title { | |
| color: var(--text-primary); | |
| font-size: 1.25rem; | |
| font-weight: 600; | |
| margin-bottom: 1rem; | |
| } | |
| .feature-description { | |
| color: var(--text-secondary); | |
| font-size: 0.95rem; | |
| margin-bottom: 1.5rem; | |
| } | |
| .feature-list { | |
| list-style: none; | |
| padding: 0; | |
| margin: 0; | |
| display: flex; | |
| flex-direction: column; | |
| gap: 0.75rem; | |
| } | |
| .feature-list li { | |
| color: var(--text-tertiary); | |
| font-size: 0.95rem; | |
| display: flex; | |
| align-items: center; | |
| gap: 0.5rem; | |
| } | |
| .feature-list li::before { | |
| content: ''; | |
| width: 6px; | |
| height: 6px; | |
| background: var(--accent-color); | |
| border-radius: 50%; | |
| flex-shrink: 0; | |
| } | |
| /* Force all links to match theme */ | |
| .header-wrapper a:link, | |
| .header-wrapper a:visited, | |
| .header-wrapper a:hover, | |
| .header-wrapper a:active { | |
| color: var(--text-primary) !important; | |
| } | |
| /* Title specific styles */ | |
| .main-title { | |
| color: var(--text-primary); | |
| font-size: 48px; | |
| font-weight: 700; | |
| margin: 40px 0; | |
| text-align: center; | |
| } | |
| .subtitle { | |
| color: var(--text-secondary); | |
| margin-bottom: 2rem; | |
| } | |
| </style> | |
| <div class="header-wrapper"> | |
| <h1 class="main-title">Agent Leaderboard</h1> | |
| <h2 class="subtitle">Comprehensive multi-benchmark evaluation for tool calling</h2> | |
| <div class="actions"> | |
| <a href="#" class="action-button"> | |
| <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | |
| <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/> | |
| <line x1="8" y1="12" x2="16" y2="12"/> | |
| </svg> | |
| Blog | |
| </a> | |
| <a href="#" class="action-button"> | |
| <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | |
| <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/> | |
| </svg> | |
| GitHub | |
| </a> | |
| <a href="#" class="action-button"> | |
| <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> | |
| <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/> | |
| <polyline points="7 10 12 15 17 10"/> | |
| <line x1="12" y1="15" x2="12" y2="3"/> | |
| </svg> | |
| Dataset | |
| </a> | |
| </div> | |
| """ | |
# HTML fragment with the three feature cards ("Make Better Decisions",
# "360° Domain Evaluation", "Updated Periodically"); it relies on the
# .features-grid / .feature-* classes and the --accent-color CSS variable.
# The degree sign in the second card title was mis-decoded ("Β°") and is
# restored here.
# NOTE(review): the string ends with one more </div> than it opens -
# presumably it closes the wrapper opened by HEADER_CONTENT; verify at the
# call site.
CARDS = """
<div class="features-grid">
<div class="feature-card">
<div class="feature-icon">
<svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
<path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
</svg>
</div>
<h3 class="feature-title">Make Better Decisions</h3>
<ul class="feature-list">
<li>Cost-effectiveness analysis</li>
<li>Business impact metrics</li>
<li>Vendor strategy insights</li>
</ul>
</div>
<div class="feature-card">
<div class="feature-icon">
<svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
<path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
</svg>
</div>
<h3 class="feature-title">360° Domain Evaluation</h3>
<ul class="feature-list">
<li>Cross-domain evaluation</li>
<li>Real-world use cases</li>
<li>Edge case evaluation</li>
</ul>
</div>
<div class="feature-card">
<div class="feature-icon">
<svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
<path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
</svg>
</div>
<h3 class="feature-title">Updated Periodically</h3>
<ul class="feature-list">
<li>11 private models evaluated</li>
<li>5 open source models included</li>
<li>Monthly model additions</li>
</ul>
</div>
</div>
</div>
"""
# HTML fragment for the description panel: purpose blurb with a
# "Latest Update" badge, "What We Evaluate" and "Key Results" grids, and a
# usage hint. Styling is inline and uses the --bg-secondary, --text-primary,
# --text-secondary and --accent-color CSS variables.
# The emoji had been mis-decoded (UTF-8 bytes read in a Greek single-byte
# code page) and are restored here.
# NOTE(review): the glyphs for "What We Evaluate", "Single/Multi-turn
# Interactions", "Key Results" and "Accuracy Performance" had lost their
# distinguishing bytes; the emoji chosen for those four are best guesses -
# confirm against the intended design.
DESCRIPTION_HTML = """
<div style="
background: var(--bg-secondary, rgba(30, 30, 45, 0.95));
border-radius: 12px;
padding: 24px;
margin: 16px 0;
">
<div style="
display: flex;
flex-direction: column;
gap: 16px;
">
<div style="
color: var(--text-primary);
font-size: 1.1rem;
font-weight: 500;
display: flex;
align-items: center;
gap: 8px;
">
🎯 Purpose
<span style="
background: var(--accent-color, #4F46E5);
color: white;
padding: 4px 12px;
border-radius: 100px;
font-size: 0.9rem;
">Latest Update: Feb 2025</span>
</div>
<p style="
color: var(--text-secondary);
margin: 0;
line-height: 1.6;
">
Welcome to the AI Agent Tool Calling Leaderboard! This comprehensive benchmark evaluates
language models' ability to effectively utilize tools and functions in complex scenarios.
</p>
<div style="
color: var(--text-primary);
font-size: 1.1rem;
font-weight: 500;
margin-top: 8px;
">
🔍 What We Evaluate
</div>
<div style="
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 16px;
color: var(--text-secondary);
">
<div style="display: flex; gap: 8px; align-items: center;">
🔄 Single/Multi-turn Interactions
</div>
<div style="display: flex; gap: 8px; align-items: center;">
🧩 Function Composition
</div>
<div style="display: flex; gap: 8px; align-items: center;">
⚡ Error Handling
</div>
</div>
<div style="
color: var(--text-primary);
font-size: 1.1rem;
font-weight: 500;
margin-top: 8px;
">
📊 Key Results
</div>
<div style="
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 16px;
color: var(--text-secondary);
">
<div style="display: flex; gap: 8px; align-items: center;">
✅ Accuracy Performance
</div>
<div style="display: flex; gap: 8px; align-items: center;">
💰 Open Vs Closed Source
</div>
<div style="display: flex; gap: 8px; align-items: center;">
⚖️ Overall Effectiveness
</div>
</div>
<div style="
border-left: 4px solid var(--accent-color, #4F46E5);
padding-left: 12px;
margin-top: 8px;
color: var(--text-secondary);
font-style: italic;
">
💡 Use the filters below to explore different aspects of the evaluation and compare model performance across various dimensions.
</div>
</div>
</div>
"""