"""
Agent Leaderboard v1 - Main leaderboard interface
Updated implementation with LLM Type support and optimized radar charts
"""
import base64
import math
import re
from datetime import datetime
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
# Import components and styles from modular files
from components.leaderboard_components import (
get_chart_colors, get_rank_badge, get_type_badge,
get_metric_tooltip, get_responsive_styles, get_faq_section
)
from styles.leaderboard_styles import get_leaderboard_css
ASSET_ICON_PATH = Path("krew_icon.png")
KREW_ICON_BASE64 = ""
if ASSET_ICON_PATH.exists():
KREW_ICON_BASE64 = base64.b64encode(ASSET_ICON_PATH.read_bytes()).decode("utf-8")
CSV_PATH = Path("combined_evaluation_summary.csv")
if CSV_PATH.exists():
EVALUATION_DATE = datetime.fromtimestamp(CSV_PATH.stat().st_mtime).strftime("%Y-%m-%d")
else:
EVALUATION_DATE = datetime.today().strftime("%Y-%m-%d")
def create_leaderboard_v2_tab():
"""Create the main leaderboard v1 tab with interactive table"""
token_to_cost_factor = 2e-6 # Rough cost per token ($2 per 1M tokens)
tokens_per_turn = 1000 # Approximate tokens exchanged per turn for scaling
level_ids = [f"L{i}" for i in range(1, 8)]
level_tsq_sources = {
"L1": "L1_ArgAcc",
"L2": "L2_SelectAcc",
"L3": "L3_PSM",
"L4": "L4_Coverage",
"L5": "L5_AdaptiveRoutingScore",
"L6": "L6_EffScore",
"L7": "L7_ContextRetention",
}
def load_leaderboard_data():
"""Load and prepare the leaderboard data"""
df = pd.read_csv(CSV_PATH)
# Clean and prepare data
numeric_candidate_cols = [col for col in df.columns if col not in ('Model', 'Vendor', 'LLM Type')]
for col in numeric_candidate_cols:
df[col] = pd.to_numeric(df[col], errors='coerce')
# Derive per-level helper columns for cost and turns
sr_columns = []
tsq_columns = []
duration_columns = []
cost_columns = []
turns_columns = []
for level in level_ids:
sr_col = f"{level}_SR"
if sr_col in df.columns:
sr_columns.append(sr_col)
df[sr_col] = df[sr_col].round(3)
tsq_source = level_tsq_sources.get(level)
if tsq_source and tsq_source in df.columns:
tsq_columns.append(tsq_source)
duration_col = f"{level}_Avg_Exec_Time"
if duration_col in df.columns:
duration_columns.append(duration_col)
token_col = f"{level}_Avg_Tokens"
if token_col in df.columns:
cost_col = f"{level}_Avg_Cost"
turns_col = f"{level}_Avg_Turns"
df[cost_col] = df[token_col] * token_to_cost_factor
df[turns_col] = df[token_col] / tokens_per_turn
cost_columns.append(cost_col)
turns_columns.append(turns_col)
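# Worked example with the constants above: a level averaging 4,200 tokens
# maps to cost = 4200 * 2e-6 = $0.0084 and turns = 4200 / 1000 = 4.2
# (illustrative figures; the $2/M-token rate is a rough blended estimate).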
if sr_columns:
df['Avg AC'] = df[sr_columns].mean(axis=1)
if tsq_columns:
df['Avg TSQ'] = df[tsq_columns].mean(axis=1)
if cost_columns:
df['Avg Total Cost'] = df[cost_columns].mean(axis=1)
if duration_columns:
df['Avg Session Duration'] = df[duration_columns].mean(axis=1)
if turns_columns:
df['Avg Turns'] = df[turns_columns].mean(axis=1)
# Derive core capability metrics for radar visualization
if sr_columns:
df['Overall Success'] = df[sr_columns].mean(axis=1)
execution_cols = [col for col in ['L1_CallEM', 'L1_ArgAcc', 'L2_SelectAcc'] if col in df.columns]
if execution_cols:
df['Execution Accuracy'] = df[execution_cols].mean(axis=1)
reasoning_cols = [col for col in ['L3_ProvAcc', 'L3_PSM', 'L4_Coverage'] if col in df.columns]
if reasoning_cols:
df['Complex Reasoning'] = df[reasoning_cols].mean(axis=1)
robustness_cols = [col for col in ['L5_AdaptiveRoutingScore', 'L5_FallbackSR'] if col in df.columns]
if robustness_cols:
df['Robustness'] = df[robustness_cols].mean(axis=1)
# Tolerate a possible 'ReuseRage' typo in the CSV header alongside the intended 'ReuseRate'.
context_cols = [col for col in ['L6_ReuseRate', 'L6_ReuseRage', 'L6_EffScore', 'L7_ContextRetention'] if col in df.columns]
if context_cols:
df['Context & Efficiency'] = df[context_cols].mean(axis=1)
epr_cols = [f"L{i}_EPR_CVR" for i in range(1, 8) if f"L{i}_EPR_CVR" in df.columns]
if epr_cols:
df['Call Validity'] = df[epr_cols].mean(axis=1)
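# Each capability above is an unweighted mean of whichever source columns
# exist in the CSV; missing columns are simply skipped, so a model lacking a
# level averages over fewer components rather than being penalized with zeros.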
# Use LLM Type from CSV directly, with mapping to display names
if 'LLM Type' in df.columns:
# Clean the LLM Type column to remove any whitespace
df['LLM Type'] = df['LLM Type'].astype(str).str.strip()
# Map LLM Type to Model Type
def map_llm_type(llm_type):
if llm_type.upper() == "OSS":
return "Open source"
else:
return "Proprietary"
df['Model Type'] = df['LLM Type'].apply(map_llm_type)
else:
# Fallback to vendor mapping if LLM Type column doesn't exist
vendor_model_type_map = {
"OpenAI": "Proprietary",
"Anthropic": "Proprietary",
"Google": "Proprietary",
"Microsoft": "Proprietary",
"Mistral": "Proprietary",
"Databricks": "Open source",
"Meta": "Open source",
"Alibaba": "Open source",
"알리바바": "Open source", # Korean name for Alibaba
"Kakao": "Open source",
"SKT": "Open source",
"KT": "Open source",
"xAI": "Proprietary",
}
df['Model Type'] = df['Vendor'].map(vendor_model_type_map).fillna('Proprietary')
# Round numeric columns for better display
round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy',
'Complex Reasoning', 'Robustness', 'Context & Efficiency', 'Call Validity']
round_one_cols = ['Avg Session Duration', 'Avg Turns']
for col in round_three_cols:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce').round(3)
for col in round_one_cols:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce').round(1)
if cost_columns:
df[cost_columns] = df[cost_columns].apply(pd.to_numeric, errors='coerce').round(3)
if turns_columns:
df[turns_columns] = df[turns_columns].apply(pd.to_numeric, errors='coerce').round(2)
if duration_columns:
df[duration_columns] = df[duration_columns].apply(pd.to_numeric, errors='coerce').round(2)
# Fill NaN values appropriately
df = df.fillna('')
return df
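# Hedged usage sketch (column names follow the derivations above):
#   df = load_leaderboard_data()
#   print(df[['Model', 'Avg AC', 'Avg TSQ', 'Avg Total Cost']].head())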
def build_static_radar_chart(values, labels):
    """Render a small static radar chart as inline SVG."""
    if not values or all(v == 0 for v in values):
        return '<div class="mini-radar-empty">No metric data available yet</div>'
    # Minimal reconstruction of the lost SVG body: draw each 0-1 value as a
    # vertex of a filled polygon on a radial grid (axis labels omitted here).
    size, center, radius = 200, 100.0, 80.0
    points = []
    for i, value in enumerate(values):
        angle = -math.pi / 2 + 2 * math.pi * i / len(values)
        points.append(
            f"{center + radius * value * math.cos(angle):.1f},"
            f"{center + radius * value * math.sin(angle):.1f}"
        )
    polygon = " ".join(points)
    title = ", ".join(labels)
    return (
        f'<svg class="mini-radar" viewBox="0 0 {size} {size}" role="img" aria-label="{title}">'
        f'<circle cx="{center}" cy="{center}" r="{radius}" fill="none" stroke="rgba(245,246,247,0.2)" />'
        f'<polygon points="{polygon}" fill="rgba(255,210,30,0.25)" stroke="#ffd21e" stroke-width="2" />'
        '</svg>'
    )
model_type_lookup = {
"OSS": "Open source",
"API": "Proprietary"
}
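# --- Reconstructed helpers -------------------------------------------------
# `sr_column_map`, `overall_sort_column`, `default_level`, `level_details`,
# `resolve_level`, and `update_leaderboard_title` are used throughout this
# tab; the definitions below are a minimal hedged sketch (level titles taken
# from the stage list above, descriptions left blank). `generate_html_table`,
# `sort_order`, and `leaderboard_table` are expected to be defined elsewhere
# in the module and are not sketched here.
sr_column_map = {level: f"{level}_SR" for level in level_ids}
overall_sort_column = "Overall Success"
level_details = {
    "ALL": ("All Levels", "Average SR across L1-L7"),
    "L1": ("Single Tool Call", ""),
    "L2": ("Tool Selection", ""),
    "L3": ("Sequential Tool Reasoning", ""),
    "L4": ("Parallel Tool Reasoning", ""),
    "L5": ("Error Handling & Robustness", ""),
    "L6": ("Efficient Tool Utilization", ""),
    "L7": ("Long-Context Memory", ""),
}
default_level = "ALL"
def resolve_level(level_filter):
    """Map a level option such as 'L1' to its key; anything else is 'ALL'."""
    match = re.match(r"\s*(L[1-7])", str(level_filter))
    return match.group(1) if match else "ALL"
def update_leaderboard_title(level_filter):
    """Build the section-title HTML for the currently selected level."""
    level_title, level_description = level_details.get(
        level_filter, ("All Levels", "Average SR across L1-L7")
    )
    return f"<h2>Agent Leaderboard · {level_title}</h2><p>{level_description}</p>"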
def apply_filters(df, level_filter, model_type_filter, sort_order, sort_by="Overall Success"):
"""Apply shared filters and sorting to the leaderboard dataframe."""
filtered_df = df.copy()
level_key = resolve_level(level_filter)
highlight_column = None
if model_type_filter != "All":
mapped_type = model_type_lookup.get(model_type_filter, model_type_filter)
filtered_df = filtered_df[filtered_df['Model Type'] == mapped_type]
actual_sort_column = sort_by if sort_by in filtered_df.columns else None
if not actual_sort_column:
if level_key == "ALL":
actual_sort_column = overall_sort_column if overall_sort_column in filtered_df.columns else None
else:
actual_sort_column = sr_column_map.get(level_key)
if level_key in sr_column_map:
highlight_column = sr_column_map[level_key]
elif level_key == "ALL" and overall_sort_column in filtered_df.columns:
highlight_column = overall_sort_column
if actual_sort_column and actual_sort_column in filtered_df.columns:
ascending = (sort_order == "Ascending")
filtered_df = filtered_df.sort_values(by=actual_sort_column, ascending=ascending, na_position='last')
return filtered_df, level_key, highlight_column
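# e.g. apply_filters(df, "L3", "OSS", "Descending", sort_by="L3_SR") keeps
# open-source rows, sorts them by L3_SR, and returns "L3_SR" as the column
# to highlight in the rendered table.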
def filter_and_sort_data(level_filter, model_type_filter, sort_by, sort_order):
"""Filter and sort the leaderboard data"""
df = load_leaderboard_data()
filtered_df, level_key, highlight_column = apply_filters(df, level_filter, model_type_filter, sort_order, sort_by)
# Generate HTML table
return generate_html_table(filtered_df, highlight_column)
# Load initial data
initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending")
initial_df = load_leaderboard_data() # Load raw data for model selector
if not initial_df.empty:
overall_success_numeric = pd.to_numeric(initial_df.get('Overall Success'), errors='coerce')
if overall_success_numeric.notna().any():
initial_df = initial_df.assign(**{'Overall Success': overall_success_numeric}).sort_values(
'Overall Success', ascending=False, na_position='last'
)
else:
initial_df = initial_df.sort_values('Model')
initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else []
initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else []
initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models)
initial_level_metric_level = level_ids[0] if level_ids else None
initial_level_model_choices = initial_df['Model'].tolist() if len(initial_df) > 0 else []
initial_level_model_values = initial_level_model_choices[:5]
initial_level_metric_chart = create_level_metric_chart(
initial_df,
initial_level_metric_level,
initial_level_model_values
) if initial_level_metric_level else create_empty_level_metric_chart("No level metrics available")
# Load custom CSS and responsive styles
custom_css = get_leaderboard_css() + get_responsive_styles() + """
"""
gr.HTML(custom_css)
# Header styles and navigation
gr.HTML("""
""")
gr.HTML("
")
gr.Image(
value="banner_wide.png",
show_label=False,
interactive=False,
type="filepath",
elem_id="hero-banner"
)
gr.HTML("
")
gr.HTML("""
Hugging Face KREW Ko-AgentBench
Agent benchmark optimized for real Korean usage.
""")
# Links section below title
gr.HTML("""
""")
# Section 1: Task Design by Stage
gr.HTML("""
We analyzed agent capabilities across seven stages—from simple tool calls to long-context retention and robustness.
Single Turn
80%
- L1: Single Tool Call
- L2: Tool Selection
- L3: Sequential Tool Reasoning
- L4: Parallel Tool Reasoning
- L5: Error Handling & Robustness
Multi Turn
20%
- L6: Efficient Tool Utilization
- L7: Long-Context Memory
""")
# Section 2: Core Scenario Design
gr.HTML("""
We built realistic scenarios—such as appointment booking and blog review search—by integrating APIs widely used in Korea including Naver Maps, Kakao services, and local websites.
⌄
""")
# Section 3: Key Evaluation Criteria
gr.HTML("""
Cache-based Iterative Evaluation
- Improved handling of failed API responses
- Addresses chronic benchmark issues such as mismatched response attributes
- Ensures benchmark consistency and reliability
Robustness Testing
- Evaluates recognition and response strategies for intentional failure scenarios (e.g., discontinued products)
- Surfaces models that remain stable in real-world deployments
Level-specific Precision Metrics
- Evaluates each phase of problem solving, including tool selection, parameter setup, and data flow
- Quantitatively identifies model strengths and weaknesses
""")
# Metrics overview cards removed per updated design
# Domain filter section with enhanced styling
gr.HTML("""
""")
level_options = list(level_details.keys())
# Main leaderboard table with dynamic title and integrated controls
leaderboard_title = gr.HTML(update_leaderboard_title(default_level))
# Integrated controls within leaderboard section - stacked vertically
gr.HTML("
Select Task Level
")
domain_filter = gr.Radio(
choices=level_options,
value=default_level,
label="",
interactive=True,
container=False,
elem_classes=["domain-radio", "inline-radio"]
)
gr.HTML("
🔍 Filters & Sorting
")
with gr.Row():
with gr.Column(scale=1):
gr.HTML("
Model Access")
model_type_filter = gr.Radio(
choices=["All", "OSS", "API"],
value="All",
label="",
elem_classes=["domain-radio", "inline-radio"],
container=False
)
with gr.Column(scale=1):
gr.HTML("
You can select up to five models.")
model_selector = gr.Dropdown(
choices=initial_df['Model'].tolist()[:10],
value=initial_df['Model'].tolist()[:5],
multiselect=True,
label="",
info=None,
container=False,
)
# Radar chart plot - wrapped in centered container
gr.HTML('')
radar_chart = gr.Plot(
label="",
value=create_domain_radar_chart(
load_leaderboard_data(),
initial_df['Model'].tolist()[:5]
),
elem_classes=["radar-chart", "plot-container"]
)
gr.HTML('')
gr.HTML("")
# Define generate_performance_card function before using it
def generate_performance_card(model_name):
"""Generate HTML for the model performance card"""
if not model_name:
return """
Please select a model to generate its performance card
"""
# Get model data
df = load_leaderboard_data()
model_data = df[df['Model'] == model_name]
if model_data.empty:
return """
Model not found in the database
"""
row = model_data.iloc[0]
# Get overall rank based on overall success
df_with_success = df.copy()
df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce')
df_with_success = df_with_success[df_with_success['Overall Success'].notna()]
df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True)
try:
rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1
except (IndexError, KeyError):
rank = 'N/A'
# Format values
def format_value(val, decimals=3, prefix='', suffix=''):
if pd.isna(val) or val == '':
return 'N/A'
return f"{prefix}{float(val):.{decimals}f}{suffix}"
def format_score(value):
if pd.isna(value) or value == '':
return 'N/A'
return f"{float(value):.3f}"
radar_metrics = [
("Execution Accuracy", row.get('Execution Accuracy')),
("Complex Reasoning", row.get('Complex Reasoning')),
("Robustness", row.get('Robustness')),
("Context & Efficiency", row.get('Context & Efficiency')),
("Overall Success", row.get('Overall Success')),
("Validity", row.get('Call Validity')),
]
radar_values = []
radar_labels = []
for label, value in radar_metrics:
if pd.isna(value) or value == '':
radar_values.append(0.0)
else:
try:
radar_values.append(max(0.0, min(1.0, float(value))))
except (TypeError, ValueError):
radar_values.append(0.0)
radar_labels.append(label)
mini_radar_html = build_static_radar_chart(radar_values, radar_labels)
level_blocks = []
for level in level_ids:
sr_col = sr_column_map.get(level)
level_blocks.append((level, row.get(sr_col, '')))
evaluation_date = EVALUATION_DATE
icon_html = ""
if KREW_ICON_BASE64:
icon_html = f'

'
else:
icon_html = '
🤖
'
card_html = f"""
"""
return card_html
# MODEL PERFORMANCE CARD SECTION
gr.HTML("""
Model Performance Card
Explore detailed performance cards that visualize the six core capability metrics alongside per-level SR across L1–L7.
※ Ranks are determined by the average SR across L1–L7.
""")
with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
gr.HTML("""
Choose a model to generate its analysis card.
""")
card_model_selector = gr.Dropdown(
choices=initial_df['Model'].tolist(),
value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None,
label="",
info=None,
container=False,
# elem_classes=["model-dropdown"]
)
download_card_btn = gr.Button(
"Download as PNG",
elem_id="download-card-btn-en",
elem_classes=["pill-button"]
)
gr.HTML("""
""")
# Card display area - generate initial card
initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None
initial_card_html = generate_performance_card(initial_model) if initial_model else ""
card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html-en")
gr.HTML("""
""")
# Level metric breakdown section
gr.HTML("""
Level-specific Metrics
Compare model scores with each Ko-AgentBench level's dedicated metrics for deeper insights.
""")
with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
level_metric_selector = gr.Dropdown(
choices=level_ids,
value=level_ids[0] if level_ids else None,
multiselect=False,
label="",
info=None,
container=False,
elem_classes=["level-dropdown"]
)
level_model_selector = gr.Dropdown(
choices=initial_level_model_choices,
value=initial_level_model_values,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
)
gr.HTML('
')
level_metric_chart = gr.Plot(
label="",
value=initial_level_metric_chart,
elem_classes=["level-metric-plot", "plot-container"]
)
gr.HTML("""
""")
# # Heatmap section
# gr.HTML("""
#
#
#
#     Comprehensive Performance Heatmap
#
#     See each model's L1–L7 SR scores at a glance.
#
#
# """)
# heatmap_chart = gr.Plot(
# label="",
# value=initial_heatmap,
# elem_classes=["heatmap-plot", "plot-container"]
# )
# gr.HTML("""
#
#
# """)
# Update functions
def get_optimal_sort_order(sort_by_value):
"""Return the optimal sort order for a given metric"""
# Metrics where higher is better (descending)
descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids]
# Metrics where lower is better (ascending)
ascending_metrics = []
if sort_by_value in descending_metrics:
return "Descending"
elif sort_by_value in ascending_metrics:
return "Ascending"
else:
return "Descending" # Default fallback
def update_table(level_filter, model_type_filter, sort_order):
title_html = update_leaderboard_title(level_filter)
sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success")
table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order)
return title_html, table_html
def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
# Get filtered dataframe
df = load_leaderboard_data()
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
# Update model selector choices based on filtered data
available_models_all = filtered_df['Model'].tolist()
available_models = available_models_all[:15] # Top 15 from filtered results
# If selected models are not in available models, reset to top 5
if selected_models:
valid_selected = [m for m in selected_models if m in available_models]
# Check if more than 5 models are selected and show alert
if len(valid_selected) > 5:
gr.Warning("You can select up to 5 models.")
# Remove the last selected item (6th item) instead of keeping first 5
valid_selected = valid_selected[:-1]
if not valid_selected:
valid_selected = available_models[:5]
else:
valid_selected = available_models[:5]
# Create radar chart
chart = create_domain_radar_chart(filtered_df, valid_selected)
# Prepare heatmap order prioritizing selected models
# Level metric chart
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
available_level_models = available_models_all
if level_selected_models:
valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
if not valid_level_models:
valid_level_models = available_level_models[:5]
else:
valid_level_models = available_level_models[:5]
level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
return (
gr.Dropdown(
choices=available_models,
value=valid_selected,
multiselect=True,
label="",
info=None,
container=False,
# elem_classes=["model-dropdown"]
),
chart,
gr.Dropdown(
choices=available_level_models,
value=valid_level_models,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
),
level_metric_fig,
)
def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
# Get filtered dataframe
df = load_leaderboard_data()
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
available_models_all = filtered_df['Model'].tolist()
if selected_models:
valid_selected = [m for m in selected_models if m in available_models_all]
# Check if more than 5 models are selected and show alert
if len(valid_selected) > 5:
# JavaScript alert for exceeding 5 models
gr.Warning("You can select up to 5 models.")
# Remove the last selected item (6th item) instead of keeping first 5
valid_selected = valid_selected[:-1]
if not valid_selected:
valid_selected = available_models_all[:5]
else:
valid_selected = available_models_all[:5]
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
available_level_models = available_models_all
if level_selected_models:
valid_level_models = [m for m in level_selected_models if m in available_level_models][:5]
if not valid_level_models:
valid_level_models = available_level_models[:5]
else:
valid_level_models = available_level_models[:5]
level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
return (
gr.Dropdown(
choices=available_models_all[:15],
value=valid_selected,
multiselect=True,
label="",
info=None,
container=False,
),
create_domain_radar_chart(filtered_df, valid_selected),
gr.Dropdown(
choices=available_level_models,
value=valid_level_models,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
),
level_metric_fig,
)
def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
df = load_leaderboard_data()
sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success")
filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
available_models = filtered_df['Model'].tolist()
if level_selected_models:
valid_level_models = [m for m in level_selected_models if m in available_models]
# Check if more than 5 models are selected and show alert
if len(valid_level_models) > 5:
gr.Warning("You can select up to 5 models.")
# Remove the last selected item (6th item) instead of keeping first 5
valid_level_models = valid_level_models[:-1]
if not valid_level_models:
valid_level_models = available_models[:5]
else:
valid_level_models = available_models[:5]
effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None)
level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics")
return (
gr.Dropdown(
choices=available_models,
value=valid_level_models,
multiselect=True,
label="",
info=None,
container=False,
elem_classes=["model-dropdown", "level-model-dropdown"]
),
level_chart,
)
# Update table when filters change
filter_inputs = [domain_filter, model_type_filter, sort_order]
for input_component in filter_inputs:
input_component.change(
fn=update_table,
inputs=filter_inputs,
outputs=[leaderboard_title, leaderboard_table]
)
# Also update radar chart when filters change
input_component.change(
fn=update_radar_chart,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
)
# Update radar chart when model selection changes
model_selector.change(
fn=update_radar_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart]
)
level_metric_selector.change(
fn=update_level_metric_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[level_model_selector, level_metric_chart]
)
level_model_selector.change(
fn=update_level_metric_only,
inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
outputs=[level_model_selector, level_metric_chart]
)
# Add custom CSS for the performance card
gr.HTML("""
""")
# Wire up the card generator to selection change
card_model_selector.change(
fn=generate_performance_card,
inputs=[card_model_selector],
outputs=[card_display]
)
# Wire up download button with html2canvas capture
download_card_btn.click(
fn=None,
js="""
async () => {
const ensureHtml2Canvas = () => new Promise((resolve, reject) => {
if (window.html2canvas) {
resolve(window.html2canvas);
return;
}
const existing = document.querySelector('script[data-html2canvas]');
if (existing) {
existing.addEventListener('load', () => resolve(window.html2canvas));
existing.addEventListener('error', reject);
return;
}
const script = document.createElement('script');
script.src = 'https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js';
script.async = true;
script.dataset.html2canvas = 'true';
script.onload = () => resolve(window.html2canvas);
script.onerror = () => reject(new Error('Failed to load html2canvas'));
document.head.appendChild(script);
});
const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));
await pause(60);
const container = document.getElementById('performance-card-html-en');
const card = container?.querySelector('.performance-card');
if (!container || !card) {
alert('Performance card not found. Please select a model first.');
return;
}
const btn = document.getElementById('download-card-btn-en');
const originalText = btn?.textContent || '';
if (btn) {
btn.textContent = 'Generating...';
btn.disabled = true;
}
try {
const html2canvasLib = await ensureHtml2Canvas();
if (!html2canvasLib) {
throw new Error('html2canvas unavailable');
}
const canvas = await html2canvasLib(card, {
backgroundColor: '#01091A',
scale: 2,
logging: false,
useCORS: true
});
if (!canvas || !canvas.width || !canvas.height) {
throw new Error('Captured canvas is empty');
}
const link = document.createElement('a');
const modelName = card.querySelector('.card-model-name')?.textContent || 'model';
const timestamp = new Date().toISOString().slice(0, 10);
const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`;
link.download = fileName;
const dataUrl = canvas.toDataURL('image/png');
if (!dataUrl || dataUrl === 'data:,' || dataUrl.length <= 'data:image/png;base64,'.length) {
throw new Error('Failed to generate PNG data');
}
link.href = dataUrl;
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
} catch (error) {
console.error('Error capturing card:', error);
alert('Failed to capture performance card. Please try again.');
} finally {
if (btn) {
btn.textContent = originalText;
btn.disabled = false;
}
}
}
"""
)
# Also update card when filters change to keep model selector in sync
for input_component in filter_inputs:
def update_dropdown_and_card(*args):
filtered_df, _, _ = apply_filters(
load_leaderboard_data(),
args[0],
args[1],
args[2],
"Overall Success" if args[0] == "ALL" else sr_column_map.get(resolve_level(args[0]), "Overall Success")
)
choices = filtered_df['Model'].tolist()
# Select first model from filtered list
value = choices[0] if choices else None
return gr.Dropdown(
choices=choices,
value=value,
label="",
info=None,
container=False,
# elem_classes=["model-dropdown"]
)
input_component.change(
fn=update_dropdown_and_card,
inputs=filter_inputs,
outputs=[card_model_selector]
)
return leaderboard_table
def create_leaderboard_v2_interface():
"""Create the complete leaderboard v1 interface"""
return create_leaderboard_v2_tab()
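# Hedged usage sketch (mounting and launch details assumed):
#   with gr.Blocks(css=get_leaderboard_css()) as demo:
#       create_leaderboard_v2_interface()
#   demo.launch()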
def create_domain_radar_chart(df, selected_models=None, max_models=5):
"""Visualize six core capability metrics on a radar chart."""
df = df.copy()
metrics_info = [
{"column": "Overall Success", "label": "Overall Success", "description": "Average SR across L1-L7"},
{"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"},
{"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"},
{"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"},
{"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"},
{"column": "Call Validity", "label": "Call Validity", "description": "Average EPR_CVR across levels"},
]
required_columns = [m["column"] for m in metrics_info]
if df.empty or not any(col in df.columns for col in required_columns):
return create_empty_radar_chart("Not enough data to build the capability radar")
# Default model selection
if not selected_models:
if "Overall Success" in df.columns:
top_models = df.sort_values("Overall Success", ascending=False)
else:
top_models = df
selected_models = top_models['Model'].head(max_models).tolist()
selected_models = selected_models[:max_models]
# Ensure metric columns are numeric
for metric in metrics_info:
col = metric["column"]
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors='coerce')
fig = go.Figure()
angle_labels = [m["label"] for m in metrics_info]
palette = [
{'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'},
{'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'},
{'fill': 'rgba(161, 98, 7, 0.22)', 'line': '#A16207'},
{'fill': 'rgba(220, 38, 38, 0.20)', 'line': '#DC2626'},
{'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'},
]
for idx, model_name in enumerate(selected_models):
model_data = df[df['Model'] == model_name]
if model_data.empty:
continue
row = model_data.iloc[0]
values = []
tooltips = []
for metric in metrics_info:
col = metric["column"]
value = row[col] if col in row else float('nan')
if pd.isna(value) or value == '':
value = 0
values.append(float(value))
tooltips.append(metric["description"])
if not values:
continue
values_loop = values + [values[0]]
angles_loop = angle_labels + [angle_labels[0]]
tooltips_loop = tooltips + [tooltips[0]]
colors = palette[idx % len(palette)]
fig.add_trace(
go.Scatterpolar(
r=values_loop,
theta=angles_loop,
fill='toself',
fillcolor=colors['fill'],
line=dict(color=colors['line'], width=3),
marker=dict(
size=10,
color=colors['line'],
symbol='circle',
line=dict(width=2, color='#01091A')
),
name=model_name,
customdata=tooltips_loop,
mode="lines+markers",
hovertemplate="
%{fullData.name}" +
"
%{theta}" +
"
%{customdata}" +
"
%{r:.3f}" +
"
",
hoverlabel=dict(
bgcolor="rgba(1, 9, 26, 0.95)",
bordercolor=colors['line'],
font=dict(color="white", size=12, family="'Geist', sans-serif")
)
)
)
tick_vals = [i / 5 for i in range(6)]
tick_text = [f"{val:.2f}" for val in tick_vals]
fig.update_layout(
polar=dict(
bgcolor='rgba(245, 246, 247, 0.03)',
radialaxis=dict(
visible=True,
range=[0, 1],
showline=True,
linewidth=2,
linecolor='rgba(245, 246, 247, 0.2)',
gridcolor='rgba(245, 246, 247, 0.1)',
gridwidth=1,
tickvals=tick_vals,
ticktext=tick_text,
tickfont=dict(
size=11,
color='white',
family="'Geist Mono', monospace"
)
),
angularaxis=dict(
showline=True,
linewidth=2,
linecolor='rgba(245, 246, 247, 0.2)',
gridcolor='rgba(245, 246, 247, 0.08)',
tickfont=dict(
size=13,
family="'Geist', sans-serif",
color='white',
weight=600
),
rotation=90,
direction="clockwise",
),
),
showlegend=True,
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.15,
xanchor="center",
x=0.5,
font=dict(size=12, family="'Geist', sans-serif", color='white'),
bgcolor='rgba(1, 9, 26, 0.8)',
bordercolor='rgba(245, 246, 247, 0.2)',
borderwidth=1,
itemsizing='constant',
itemwidth=30
),
title=dict(
text="
Core Capability Radar",
x=0.5,
y=0.97,
font=dict(
size=22,
family="'Geist', sans-serif",
color="white",
weight=700
),
),
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=800,
width=900,
margin=dict(t=30, b=50, l=10, r=10),
autosize=True,
annotations=[]
)
return fig
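# Usage note: when `selected_models` is omitted, the radar defaults to the top
# `max_models` rows by Overall Success:
#   fig = create_domain_radar_chart(df)  # df: the prepared leaderboard frame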
def create_performance_heatmap(df, ordered_models=None, max_models=12):
"""Render a heatmap of SR scores across task levels for selected models."""
df = df.copy()
level_sequence = [f"L{i}" for i in range(1, 8)]
sr_columns = []
for level in level_sequence:
col = f"{level}_SR"
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors="coerce")
sr_columns.append((level, col))
if df.empty or not sr_columns:
return create_empty_heatmap("Not enough SR data to render the heatmap")
df = df.drop_duplicates(subset=["Model"])
if df.empty:
return create_empty_heatmap("No models available to render the heatmap")
sort_column = "Overall Success" if "Overall Success" in df.columns else sr_columns[0]
df = df.sort_values(sort_column, ascending=False)
if ordered_models:
ordered_models = [m for m in ordered_models if m in df["Model"].tolist()]
else:
ordered_models = df["Model"].tolist()
if not ordered_models:
return create_empty_heatmap("No models available to render the heatmap")
ordered_models = ordered_models[:max_models]
heatmap_df = df.set_index("Model").reindex(ordered_models)
level_labels = []
z_matrix = []
has_values = False
for level, col in sr_columns:
if col not in heatmap_df.columns:
continue
label = f"{level} · SR"
level_labels.append(label)
row_values = []
for model in ordered_models:
value = heatmap_df.at[model, col] if model in heatmap_df.index else None
if pd.isna(value):
row_values.append(None)
else:
val = float(value)
row_values.append(val)
has_values = True
z_matrix.append(row_values)
if not level_labels or not has_values:
return create_empty_heatmap("Not enough SR data to render the heatmap")
colorscale = [
[0.0, "#0A0A0A"],
[0.25, "#1A1411"],
[0.5, "#332818"],
[0.75, "#B8660A"],
[1.0, "#FFD21E"],
]
fig = go.Figure()
fig.add_trace(
go.Heatmap(
z=z_matrix,
x=ordered_models,
y=level_labels,
colorscale=colorscale,
zmin=0,
zmax=1,
hovertemplate="
%{y}%{x}SR · %{z:.3f}
",
colorbar=dict(
title="Success Rate",
titlefont=dict(color="white", family="'Geist', sans-serif", size=12),
tickfont=dict(color="white", family="'Geist', sans-serif", size=10),
thickness=12,
len=0.7,
outlinecolor="rgba(255, 255, 255, 0.1)",
bgcolor="rgba(1, 9, 26, 0.75)"
),
showscale=True
)
)
annotations = []
for y_idx, level in enumerate(level_labels):
for x_idx, model in enumerate(ordered_models):
value = z_matrix[y_idx][x_idx]
if value is None:
continue
font_color = "#0B1120" if value >= 0.6 else "#F8FAFC"
annotations.append(
dict(
x=model,
y=level,
text=f"{value:.3f}",
showarrow=False,
font=dict(
family="'Geist Mono', monospace",
size=11,
color=font_color
)
)
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
margin=dict(t=80, b=90, l=110, r=160),
height=520,
width=1450,
font=dict(family="'Geist', sans-serif", color="white"),
xaxis=dict(
tickangle=-25,
showgrid=False,
ticks="",
tickfont=dict(size=11, family="'Geist', sans-serif", color="white")
),
yaxis=dict(
showgrid=False,
ticks="",
tickfont=dict(size=12, family="'Geist', sans-serif", color="white")
),
annotations=annotations,
title=dict(
text="
Comprehensive Performance Heatmap",
x=0.5,
y=0.98,
font=dict(
size=20,
family="'Geist', sans-serif",
color="white",
weight=700
),
)
)
fig.update_xaxes(side="bottom")
return fig
def create_empty_heatmap(message):
"""Render an empty state for the heatmap with a centered message."""
fig = go.Figure()
fig.add_annotation(
text=f"🗺️ {message}",
xref="paper", yref="paper",
x=0.5, y=0.5,
xanchor='center', yanchor='middle',
font=dict(
size=18,
color="white",
family="'Geist', sans-serif"
),
showarrow=False,
bgcolor="rgba(245, 246, 247, 0.05)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
borderpad=20
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=520,
# width=1450,
autosize=True,
margin=dict(t=80, b=80, l=80, r=160),
title=dict(
text="
Comprehensive Performance Heatmap",
x=0.5,
y=0.98,
font=dict(
size=20,
family="'Geist', sans-serif",
color="white",
weight=700
),
)
)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
return fig
def create_level_metric_chart(df, level, selected_models=None, max_models=5):
"""Render a grouped horizontal bar chart showing per-model scores for a level's metrics."""
if not level:
return create_empty_level_metric_chart("Select a level to view its metrics")
df = df.copy()
level_prefix = f"{level}_"
level_columns = [col for col in df.columns if col.startswith(level_prefix)]
metric_columns = []
for col in level_columns:
metric_suffix = col[len(level_prefix):]
metric_key_lower = metric_suffix.lower()
if "cost" in metric_key_lower:
continue
numeric_series = pd.to_numeric(df[col], errors='coerce')
valid_values = numeric_series.dropna()
if valid_values.empty:
continue
if (valid_values < 0).any() or (valid_values > 1.05).any():
continue
df[col] = numeric_series
metric_columns.append(col)
if not metric_columns:
return create_empty_level_metric_chart("This level has no 0-1 metrics to visualize")
df = df.drop_duplicates(subset=['Model'])
if df.empty:
return create_empty_level_metric_chart("No models available to render level metrics")
if selected_models:
model_order = [m for m in selected_models if m in df['Model'].tolist()]
else:
sort_col = 'Overall Success' if 'Overall Success' in df.columns else metric_columns[0]
model_order = df.sort_values(sort_col, ascending=False)['Model'].tolist()
if not model_order:
model_order = df['Model'].tolist()
model_order = model_order[:max_models]
df_models = df[df['Model'].isin(model_order)].set_index('Model')
if df_models.empty:
return create_empty_level_metric_chart("No matching models for selected filters")
def prettify_metric_name(metric_key):
    """Turn a raw column such as 'L3_PSM' into a readable label."""
    raw = metric_key[len(level_prefix):]
    text = raw.replace('_', ' ')
    # Split camelCase while keeping acronym runs (SR, PSM, EPR, ...) intact.
    text = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', text)
    text = re.sub(r'(?<=[A-Z])(?=[A-Z][a-z])', ' ', text)
    text = text.replace('Avg', 'Average')
    # Single-token fixups for acronyms that would otherwise lose their casing.
    replacements = {'Sr': 'SR', 'Ac': 'AC', 'Tsq': 'TSQ', 'Cvr': 'CVR', 'Psm': 'PSM'}
    words = [word if word.isupper() else word.title() for word in text.split()]
    words = [replacements.get(word, word) for word in words]
    return ' '.join(words)
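# prettify_metric_name examples: "L6_EffScore" -> "Eff Score",
# "L1_EPR_CVR" -> "EPR CVR", "L5_AdaptiveRoutingScore" -> "Adaptive Routing Score"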
metric_labels = []
for col in metric_columns:
label = prettify_metric_name(col)
if label in metric_labels:
suffix = 2
while f"{label} ({suffix})" in metric_labels:
suffix += 1
label = f"{label} ({suffix})"
metric_labels.append(label)
model_palette = [
'#ffd21e',
'#FF8A3C',
'#A16207',
'#DC2626',
'#F8FAFC',
'#38BDF8',
]
fig = go.Figure()
max_value = 0
for idx, model in enumerate(model_order):
values = []
for col in metric_columns:
value = df_models.at[model, col] if (model in df_models.index and col in df_models.columns) else float('nan')
if pd.notna(value):
values.append(float(value))
max_value = max(max_value, float(value))
else:
values.append(None)
color = model_palette[idx % len(model_palette)]
fig.add_trace(
go.Bar(
name=model,
y=metric_labels,
x=values,
orientation='h',
marker=dict(color=color, line=dict(color='rgba(1,9,26,0.8)', width=1)),
hovertemplate="
%{y}Model ·
%{fullData.name}Score · %{x:.3f}
",
)
)
plot_height = max(360, 140 + 48 * len(metric_labels))
if max_value <= 0:
x_range = [0, 1]
else:
x_range = [0, max_value * 1.05]
fig.update_layout(
barmode='group',
bargap=0.25,
bargroupgap=0.18,
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=plot_height,
# width=1450,
autosize=True,
margin=dict(t=90, b=80, l=220, r=160),
legend=dict(
orientation="h",
yanchor="bottom",
y=1.02,
xanchor="right",
x=1,
bgcolor='rgba(1, 9, 26, 0.75)',
bordercolor='rgba(245, 246, 247, 0.2)',
borderwidth=1,
font=dict(size=11, family="'Geist', sans-serif", color='white')
),
xaxis=dict(
title=dict(text=f"
{level} Metric Score", font=dict(size=14, color="white")),
tickfont=dict(size=11, color="white"),
gridcolor='rgba(245, 246, 247, 0.08)',
zerolinecolor='rgba(245, 246, 247, 0.18)',
range=x_range
),
yaxis=dict(
tickfont=dict(size=13, color="white"),
automargin=True
),
title=dict(
text=f"
{level} Metric Breakdown",
x=0.5,
y=0.98,
font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
)
)
return fig
def create_empty_level_metric_chart(message):
fig = go.Figure()
fig.add_annotation(
text=f"🧭 {message}",
xref="paper", yref="paper",
x=0.5, y=0.5,
xanchor='center', yanchor='middle',
font=dict(size=18, color="white", family="'Geist', sans-serif"),
showarrow=False,
bgcolor="rgba(245, 246, 247, 0.05)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
borderpad=20
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=420,
width=1450,
margin=dict(t=80, b=60, l=80, r=120),
title=dict(
text="
Level Metric Breakdown",
x=0.5,
y=0.98,
font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700)
)
)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
return fig
def create_empty_radar_chart(message):
"""Create an empty radar chart with a message"""
fig = go.Figure()
fig.add_annotation(
text=f"📊 {message}",
xref="paper", yref="paper",
x=0.5, y=0.5,
xanchor='center', yanchor='middle',
font=dict(
size=18,
color="white",
family="'Geist', sans-serif"
),
showarrow=False,
bgcolor="rgba(245, 246, 247, 0.05)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
borderpad=20
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=1450,
width=1450,
margin=dict(t=100, b=80, l=80, r=200),
title=dict(
text="
Core Capability Radar",
x=0.5,
y=0.97,
font=dict(
size=22,
family="'Geist', sans-serif",
color="white",
weight=700
),
),
)
return fig
# NEW VISUALIZATION FUNCTIONS
def create_cost_performance_scatter(df, metric="Avg AC"):
"""Create scatter plot showing cost vs performance efficiency"""
# Filter out models without cost or performance data
df_filtered = df[(df['Avg Total Cost'] != '') & (df[metric] != '')].copy()
label_map = {
'Proprietary': 'API',
'Open source': 'OSS'
}
if df_filtered.empty:
return create_empty_chart("No data available for cost-performance analysis")
# Convert to numeric
df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce')
df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce')
df_filtered['Avg Turns'] = pd.to_numeric(df_filtered['Avg Turns'], errors='coerce')
# Create color mapping for model type
color_map = {
'Proprietary': '#1098F7', # Airglow Blue for Proprietary
'Open source': '#58BC82' # Green for Open source
}
df_filtered['Color'] = df_filtered['Model Type'].map(color_map).fillna('#F5F6F7')
fig = go.Figure()
# Add scatter points
for model_type in df_filtered['Model Type'].unique():
df_type = df_filtered[df_filtered['Model Type'] == model_type]
legend_name = label_map.get(model_type, model_type)
fig.add_trace(go.Scatter(
x=df_type[metric],
y=df_type['Avg Total Cost'],
mode='markers+text',
name=legend_name,
text=df_type['Model'],
textposition="top center",
textfont=dict(size=10, color='white'),
marker=dict(
size=df_type['Avg Turns'] * 3, # Size based on number of turns
color=color_map.get(model_type, '#F5F6F7'),
opacity=0.8,
line=dict(width=2, color='#01091A')
),
hovertemplate="
%{text}" +
f"{metric}: %{{x:.3f}}
" +
"Cost: $%{y:.3f}
" +
"Turns: %{marker.size:.1f}
" +
"
"
))
# Add quadrant lines
median_x = df_filtered[metric].median()
median_y = df_filtered['Avg Total Cost'].median()
fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)
# Add quadrant labels
fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance<br>Low Cost",
                   showarrow=False, xref="paper", yref="paper",
                   font=dict(size=12, color="white"), bgcolor="rgba(245, 246, 247, 0.1)")
fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
                   showarrow=False, xref="paper", yref="paper",
                   font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)")
metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"
fig.update_layout(
title=dict(
text=f"
Cost-Performance Efficiency: {metric_display}",
x=0.5,
y=0.97,
font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
),
xaxis=dict(
title=dict(
text=f"
{metric_display}",
font=dict(size=16, color="white")
),
tickfont=dict(size=12, color="white"),
gridcolor="rgba(245, 246, 247, 0.1)",
zerolinecolor="rgba(245, 246, 247, 0.2)"
),
yaxis=dict(
title=dict(
text="
Average Session Cost ($)",
font=dict(size=16, color="white")
),
tickfont=dict(size=12, color="white"),
gridcolor="rgba(245, 246, 247, 0.1)",
zerolinecolor="rgba(245, 246, 247, 0.2)"
),
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=900,
width=1450,
showlegend=True,
legend=dict(
orientation="h",
yanchor="bottom",
y=1.02,
xanchor="right",
x=1,
font=dict(size=12, family="'Geist', sans-serif", color='white'),
bgcolor='rgba(1, 9, 26, 0.8)',
bordercolor='rgba(245, 246, 247, 0.2)',
borderwidth=1
),
margin=dict(t=100, b=80, l=80, r=80)
)
return fig
def create_speed_accuracy_plot(df, metric="Avg AC"):
"""Create scatter plot showing speed vs accuracy trade-off"""
# Filter out models without duration or performance data
df_filtered = df[(df['Avg Session Duration'] != '') & (df[metric] != '')].copy()
if df_filtered.empty:
return create_empty_chart("No data available for speed-accuracy analysis")
# Convert to numeric
df_filtered['Avg Session Duration'] = pd.to_numeric(df_filtered['Avg Session Duration'], errors='coerce')
df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce')
# Create color scale based on cost
df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce')
fig = go.Figure()
# Add scatter trace
fig.add_trace(go.Scatter(
x=df_filtered[metric],
y=df_filtered['Avg Session Duration'],
mode='markers+text',
text=df_filtered['Model'],
textposition="top center",
textfont=dict(size=9, color='white'),
marker=dict(
size=12,
color=df_filtered['Avg Total Cost'],
colorscale=[[0, '#0A0A0A'], [0.5, '#B8660A'], [1, '#ffd21e']],
showscale=True,
colorbar=dict(
title=dict(
text="Cost ($)",
font=dict(color="white")
),
tickfont=dict(color="white"),
bgcolor="rgba(1, 9, 26, 0.8)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
x=1.02
),
line=dict(width=2, color='#01091A')
),
hovertemplate="
%{text}" +
f"{metric}: %{{x:.3f}}
" +
"Duration: %{y:.1f}s
" +
"Cost: $%{marker.color:.3f}
" +
"
"
))
# Add quadrant lines
median_x = df_filtered[metric].median()
median_y = df_filtered['Avg Session Duration'].median()
fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)
# Add quadrant labels
fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate",
showarrow=False, xref="paper", yref="paper",
font=dict(size=12, color="white", weight=600))
fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate",
showarrow=False, xref="paper", yref="paper",
font=dict(size=12, color="#ffd21e", weight=600))
metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"
fig.update_layout(
title=dict(
text=f"
Speed vs Accuracy Trade-off: {metric_display}",
x=0.5,
y=0.97,
font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
),
xaxis=dict(
title=dict(
text=f"
{metric_display}",
font=dict(size=16, color="white")
),
tickfont=dict(size=12, color="white"),
gridcolor="rgba(245, 246, 247, 0.1)",
zerolinecolor="rgba(245, 246, 247, 0.2)"
),
yaxis=dict(
title=dict(
text="
Average Session Duration (seconds)",
font=dict(size=16, color="white")
),
tickfont=dict(size=12, color="white"),
gridcolor="rgba(245, 246, 247, 0.1)",
zerolinecolor="rgba(245, 246, 247, 0.2)"
),
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=900,
width=1450,
margin=dict(t=100, b=80, l=80, r=120)
)
return fig
def create_domain_specialization_matrix(df, metric_type="AC"):
"""Create bubble chart showing domain specialization"""
domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
# Prepare data
data = []
for _, model in df.iterrows():
if model['Model'] == '':
continue
model_avg = pd.to_numeric(model[f'Avg {metric_type}'], errors='coerce')
if pd.isna(model_avg):
continue
for domain in domains:
domain_col = f'{domain} {metric_type}'
if domain_col in model and model[domain_col] != '':
domain_val = pd.to_numeric(model[domain_col], errors='coerce')
if not pd.isna(domain_val):
# Calculate specialization strength (deviation from model average)
specialization = domain_val - model_avg
data.append({
'Model': model['Model'],
'Domain': domain,
'Performance': domain_val,
'Specialization': specialization,
'Model Type': model['Model Type']
})
if not data:
return create_empty_chart("No domain specialization data available")
df_plot = pd.DataFrame(data)
# Create bubble chart
fig = go.Figure()
# Color based on specialization strength
fig.add_trace(go.Scatter(
x=df_plot['Domain'],
y=df_plot['Model'],
mode='markers',
marker=dict(
size=df_plot['Performance'] * 30, # Size based on absolute performance
color=df_plot['Specialization'],
colorscale=[[0, '#B8660A'], [0.5, '#E6B800'], [1, '#ffd21e']],
showscale=True,
colorbar=dict(
title=dict(
text="Specialization
Strength",
font=dict(color="white")
),
tickfont=dict(color="white"),
bgcolor="rgba(1, 9, 26, 0.8)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1
),
line=dict(width=2, color='#01091A'),
opacity=0.8
),
text=[f"Performance: {p:.3f}
Specialization: {s:+.3f}"
for p, s in zip(df_plot['Performance'], df_plot['Specialization'])],
hovertemplate="
%{y}" +
"Domain: %{x}
" +
"%{text}
" +
"
"
))
metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"
fig.update_layout(
title=dict(
text=f"
Domain Specialization Matrix: {metric_display}",
x=0.5,
y=0.97,
font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
),
xaxis=dict(
title=dict(
text="
Business Domains",
font=dict(size=16, color="white")
),
tickfont=dict(size=13, color="white"),
gridcolor="rgba(245, 246, 247, 0.1)"
),
yaxis=dict(
title=dict(
text="
Models",
font=dict(size=16, color="white")
),
tickfont=dict(size=11, color="white"),
gridcolor="rgba(245, 246, 247, 0.1)"
),
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=1100,
width=1450,
margin=dict(t=100, b=80, l=220, r=120)
)
return fig
def create_performance_gap_analysis(df, metric_type="AC"):
"""Create range plot showing performance gaps by domain"""
domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
# Calculate min, max, median for each domain
gap_data = []
for domain in domains:
domain_col = f'{domain} {metric_type}'
if domain_col in df.columns:
domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
if len(domain_values) > 0:
gap_data.append({
'Domain': domain,
'Min': domain_values.min(),
'Max': domain_values.max(),
'Median': domain_values.median(),
'Q1': domain_values.quantile(0.25),
'Q3': domain_values.quantile(0.75),
'Gap': domain_values.max() - domain_values.min()
})
if not gap_data:
return create_empty_chart("No data available for gap analysis")
df_gap = pd.DataFrame(gap_data)
df_gap = df_gap.sort_values('Gap', ascending=True)
fig = go.Figure()
# Add range bars
for idx, row in df_gap.iterrows():
# Add full range line
fig.add_trace(go.Scatter(
x=[row['Min'], row['Max']],
y=[row['Domain'], row['Domain']],
mode='lines',
line=dict(color='#64748B', width=2),
showlegend=False,
hoverinfo='skip'
))
# Add IQR box
fig.add_trace(go.Scatter(
x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']],
y=[row['Domain'], row['Domain'], row['Domain'], row['Domain'], row['Domain']],
fill='toself',
fillcolor='rgba(255, 210, 30, 0.3)',
line=dict(color='#ffd21e', width=2),
showlegend=False,
hoverinfo='skip',
mode='lines'
))
# Add median marker
fig.add_trace(go.Scatter(
x=[row['Median']],
y=[row['Domain']],
mode='markers',
marker=dict(
size=12,
color='#ffd21e',
symbol='diamond',
line=dict(width=2, color='#01091A')
),
showlegend=False,
hovertemplate=f"
{row['Domain']}" +
f"Min: {row['Min']:.3f}
" +
f"Q1: {row['Q1']:.3f}
" +
f"Median: {row['Median']:.3f}
" +
f"Q3: {row['Q3']:.3f}
" +
f"Max: {row['Max']:.3f}
" +
f"Gap: {row['Gap']:.3f}
" +
"
"
))
# Add min/max points
for idx, row in df_gap.iterrows():
fig.add_trace(go.Scatter(
x=[row['Min'], row['Max']],
y=[row['Domain'], row['Domain']],
mode='markers',
marker=dict(size=8, color='white', line=dict(width=2, color='#01091A')),
showlegend=False,
hoverinfo='skip'
))
metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"
fig.update_layout(
title=dict(
text=f"
Performance Gap Analysis by Domain: {metric_display}",
x=0.5,
y=0.97,
font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
),
xaxis=dict(
title=dict(
text=f"
{metric_display} Score",
font=dict(size=16, color="white")
),
tickfont=dict(size=12, color="white"),
gridcolor="rgba(245, 246, 247, 0.1)",
range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
),
yaxis=dict(
title=dict(
text="
Business Domain",
font=dict(size=16, color="white")
),
tickfont=dict(size=13, color="white"),
gridcolor="rgba(245, 246, 247, 0.1)"
),
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=800,
width=1450,
margin=dict(t=100, b=80, l=140, r=80),
showlegend=False
)
# Add legend manually
fig.add_annotation(
text="◆ Median ━ IQR ─ Full Range",
xref="paper", yref="paper",
x=0.98, y=0.02,
xanchor='right', yanchor='bottom',
font=dict(size=12, color='white'),
showarrow=False
)
return fig
def create_empty_chart(message):
"""Create an empty chart with a message"""
fig = go.Figure()
fig.add_annotation(
text=f"📊 {message}",
xref="paper", yref="paper",
x=0.5, y=0.5,
xanchor='center', yanchor='middle',
font=dict(
size=18,
color="white",
family="'Geist', sans-serif"
),
showarrow=False,
bgcolor="rgba(245, 246, 247, 0.05)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
borderpad=20
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=700,
width=1450,
margin=dict(t=80, b=80, l=80, r=80)
)