agent-leaderboard / tabs /leaderboard_v2.py
Pratik Bhavsar
improved layout
83e2d7b
raw
history blame
156 kB
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
# Import components and styles from modular files
from components.leaderboard_components import (
get_chart_colors, get_rank_badge, get_type_badge,
get_output_type_badge, get_score_bar, get_metric_tooltip,
get_responsive_styles, get_faq_section, SORT_COLUMN_MAP
)
from components.prediction_components import create_ac_prediction_chart
from styles.leaderboard_styles import get_leaderboard_css
def create_leaderboard_v2_tab():
"""Create the main leaderboard v2 tab with interactive table"""
def load_leaderboard_data():
    """Read ``results_v2.csv`` and return it as a display-ready DataFrame.

    Whichever of the five overall-average metric columns exist in the file are
    coerced to numbers (entries that cannot be parsed become missing) and
    rounded to three decimals. Finally every missing cell in the frame is
    replaced with an empty string, so ``''`` is the "no data" marker in the
    returned frame.
    """
    leaderboard = pd.read_csv('results_v2.csv').copy()

    averaged_metrics = (
        'Avg AC',
        'Avg TSQ',
        'Avg Total Cost',
        'Avg Session Duration',
        'Avg Turns',
    )
    # Only touch the metric columns that are actually present in the CSV.
    for metric in (name for name in averaged_metrics if name in leaderboard.columns):
        as_numbers = pd.to_numeric(leaderboard[metric], errors='coerce')
        leaderboard[metric] = as_numbers.round(3)

    # From here on a blank string (not NaN) means "value missing".
    return leaderboard.fillna('')
def generate_html_table(filtered_df, domain_filter):
    """Render the leaderboard rows as a styled HTML table.

    Args:
        filtered_df: DataFrame whose rows are already filtered and ordered;
            the rank shown for a row is its 1-based position in this frame.
            Must provide ``Model``, ``Model Type`` and ``Vendor`` columns;
            ``Output Type`` is optional and defaults to ``'Normal'``.
        domain_filter: ``"All"`` to show the overall averages (``Avg AC``,
            ``Avg TSQ``, ``Avg Total Cost``, ``Avg Session Duration``,
            ``Avg Turns``, with ``Cost ($)``, ``Duration (s)`` and ``Turns`` as
            fallbacks when the ``Avg`` column is absent), otherwise a domain
            name whose ``"<domain> AC/TSQ/Cost/Duration/Turns"`` columns are
            shown.

    Returns:
        A string containing an inline ``<style>`` block followed by the table
        markup. Metric cells whose value is absent, blank, NaN or not numeric
        are rendered as ``-`` in both the "All" and the per-domain view.

    NOTE(review): ``Model`` and ``Vendor`` are interpolated without HTML
    escaping, as before; confirm the CSV never carries untrusted markup.
    """
    table_open = """
<style>
/* Dark theme table styling */
.v2-table-container {
background: var(--bg-card);
border-radius: 16px;
overflow: hidden;
border: 1px solid var(--border-subtle);
margin-top: 20px;
}
.v2-styled-table {
width: 100%;
border-collapse: collapse;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
background: var(--bg-card);
color: var(--text-primary);
}
.v2-styled-table thead {
position: sticky;
top: 0;
background: rgba(227, 84, 84, 0.1);
z-index: 1;
}
.v2-styled-table th {
padding: 14px 12px;
text-align: left;
font-weight: 600;
color: var(--text-primary);
border-bottom: 2px solid var(--accent-primary);
font-size: 13px;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.v2-styled-table td {
padding: 12px;
border-bottom: 1px solid var(--border-subtle);
color: var(--text-primary);
transition: all 0.2s ease;
}
.v2-styled-table tbody tr {
transition: all 0.3s ease;
}
.v2-styled-table tbody tr:hover {
background: rgba(227, 84, 84, 0.15) !important;
box-shadow: 0 0 20px rgba(227, 84, 84, 0.3), inset 0 0 20px rgba(227, 84, 84, 0.1);
transform: scale(1.01);
}
.v2-styled-table tbody tr:nth-child(even) {
background: var(--bg-secondary);
}
.model-name {
font-weight: 500;
color: var(--accent-primary);
transition: color 0.2s ease;
}
/* Keep model name color consistent on hover to emphasize row highlight */
.v2-styled-table tr:hover .model-name {
color: var(--accent-secondary);
}
.numeric-cell {
font-family: 'Geist Mono', monospace;
font-size: 13px;
text-align: center;
}
/* Score bar specific styling */
.score-cell {
min-width: 180px;
}
</style>
<div class="v2-table-container">
<table class="v2-styled-table">
<thead>
<tr>
<th style="width: 80px;">Rank</th>
<th>Model</th>
<th style="width: 120px;">Type</th>
<th style="width: 120px;">Output Type</th>
<th>Vendor</th>
<th style="width: 200px;" title="Action Completion (AC): Measures how well the agent accomplishes user goals and completes tasks successfully. Higher is better (0-1 scale).">
<span class="metric-header">Avg Action Completion <span class="info-icon">ⓘ</span></span>
</th>
<th style="width: 200px;" title="Tool Selection Quality (TSQ): Evaluates the accuracy of selecting the right tools and using them with correct parameters. Higher is better (0-1 scale).">
<span class="metric-header">Avg Tool Selection Quality <span class="info-icon">ⓘ</span></span>
</th>
<th title="Average cost per conversation session in USD, including all API calls and processing. Lower is better.">
<span class="metric-header">Avg Cost ($) <span class="info-icon">ⓘ</span></span>
</th>
<th title="Average time taken to complete a full conversation session from start to finish, measured in seconds. Lower is generally better.">
<span class="metric-header">Avg Duration (s) <span class="info-icon">ⓘ</span></span>
</th>
<th title="Average number of back-and-forth exchanges needed to complete a task. Lower typically indicates more efficient task completion.">
<span class="metric-header">Avg Turns <span class="info-icon">ⓘ</span></span>
</th>
</tr>
</thead>
<tbody>
"""
    table_close = """
</tbody>
</table>
</div>
"""

    def first_present(row, candidates):
        """Return the value of the first candidate column the row has, else ''."""
        for column in candidates:
            if column in row.index:
                return row[column]
        return ''

    def number_or_none(value):
        """Return value as a float, or None when it is blank, NaN or not numeric."""
        if isinstance(value, str) and not value.strip():
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        return None if pd.isna(number) else number

    def score_cell(value):
        """Render a score-bar cell, or a dash cell when there is no score."""
        number = number_or_none(value)
        if number is None:
            return '<td class="numeric-cell">-</td>'
        return f'<td class="score-cell">{get_score_bar(number)}</td>'

    def formatted(value, spec):
        """Format a numeric value with spec, or return '-' when it is missing."""
        number = number_or_none(value)
        return '-' if number is None else format(number, spec)

    # Resolve once which columns feed each metric; the per-row code is then
    # identical for the overall view and for a single domain.
    if domain_filter != "All":
        ac_cols = (f'{domain_filter} AC',)
        tsq_cols = (f'{domain_filter} TSQ',)
        cost_cols = (f'{domain_filter} Cost',)
        duration_cols = (f'{domain_filter} Duration',)
        turns_cols = (f'{domain_filter} Turns',)
    else:
        ac_cols = ('Avg AC',)
        tsq_cols = ('Avg TSQ',)
        cost_cols = ('Avg Total Cost', 'Cost ($)')
        duration_cols = ('Avg Session Duration', 'Duration (s)')
        turns_cols = ('Avg Turns', 'Turns')

    # Collect fragments and join once instead of growing one string per row.
    fragments = [table_open]
    for rank, (_, row) in enumerate(filtered_df.iterrows(), start=1):
        fragments.append(f"""
<tr>
<td>{get_rank_badge(rank)}</td>
<td class="model-name">{row['Model']}</td>
<td>{get_type_badge(row['Model Type'])}</td>
<td>{get_output_type_badge(row.get('Output Type', 'Normal'))}</td>
<td>{row['Vendor']}</td>
{score_cell(first_present(row, ac_cols))}
{score_cell(first_present(row, tsq_cols))}
<td class="numeric-cell">${formatted(first_present(row, cost_cols), '.3f')}</td>
<td class="numeric-cell">{formatted(first_present(row, duration_cols), '.1f')}</td>
<td class="numeric-cell">{formatted(first_present(row, turns_cols), '.1f')}</td>
</tr>
""")
    fragments.append(table_close)
    return ''.join(fragments)
def update_leaderboard_title(domain_filter):
    """Return the opening HTML of the leaderboard section, titled per domain.

    Args:
        domain_filter: Domain label, presumably the value of the domain
            selector (e.g. ``"🌐 All"`` or ``"🏦 Banking"``). A known emoji
            prefix decides the domain; failing that, the label's last word is
            used when it is a known domain name. Any other label is shown
            unchanged.

    Returns:
        An HTML fragment with the heading "Agent Leaderboard for <domain>".
        The fragment leaves the ``dark-container`` and ``dataframe-container``
        divs open; NOTE(review): they are presumably closed by markup emitted
        elsewhere in the tab - confirm.
    """
    # '🛡' is listed without its variation selector so both spellings match.
    known_domains = (
        ('🌐', "All"),
        ('🏦', "Banking"),
        ('🏥', "Healthcare"),
        ('🛡', "Insurance"),
        ('💰', "Investment"),
        ('📱', "Telecom"),
    )

    label = domain_filter.strip()
    last_word = label.split()[-1] if label else ''

    domain_filter_clean = domain_filter
    for emoji, name in known_domains:
        if label.startswith(emoji):
            domain_filter_clean = name
            break
    else:
        # No emoji matched (plain or differently encoded label): fall back to
        # the trailing domain name so the title never shows a raw label.
        for _, name in known_domains:
            if last_word == name:
                domain_filter_clean = name
                break

    return f"""
<div class="dark-container pulse" style="margin-bottom: 24px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">📈</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Agent Leaderboard for {domain_filter_clean}
</h3>
</div>
<div class="dataframe-container">
"""
def filter_and_sort_data(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Load the leaderboard, apply the selected filters and sort, return HTML.

    Args:
        domain_filter: Domain label such as ``"🌐 All"`` or ``"🏦 Banking"``.
            A known emoji prefix decides the domain; failing that, the label's
            last word is used when it is a known domain name.
        model_type_filter: ``"All"``, ``"Open Source"`` or ``"Proprietary"``;
            any other value applies no model-type filter.
        reasoning_filter: ``"All"``, ``"Reasoning"`` or ``"Normal"``; any other
            value applies no output-type filter.
        sort_by: Display name of the sort column, translated through
            ``SORT_COLUMN_MAP``; unmapped names are used as column names.
        sort_order: ``"Ascending"`` sorts ascending; anything else descending.

    Returns:
        The HTML produced by ``generate_html_table`` for the resulting rows.
    """
    # load_leaderboard_data() already returns a fresh frame on every call.
    filtered_df = load_leaderboard_data()

    # '🛡' is listed without its variation selector so both spellings match.
    known_domains = (
        ('🌐', "All"),
        ('🏦', "Banking"),
        ('🏥', "Healthcare"),
        ('🛡', "Insurance"),
        ('💰', "Investment"),
        ('📱', "Telecom"),
    )
    label = domain_filter.strip()
    last_word = label.split()[-1] if label else ''
    domain_filter_clean = domain_filter
    for emoji, name in known_domains:
        if label.startswith(emoji):
            domain_filter_clean = name
            break
    else:
        # No emoji matched: fall back to the trailing domain name so a plain
        # or differently encoded label still selects the right columns.
        for _, name in known_domains:
            if last_word == name:
                domain_filter_clean = name
                break

    # Domain filtering: keep only models that have an AC score for the domain
    # ('' marks a missing value in the loaded frame).
    if domain_filter_clean != "All":
        domain_col = f"{domain_filter_clean} AC"
        if domain_col in filtered_df.columns:
            filtered_df = filtered_df[filtered_df[domain_col] != '']

    # Model type filtering: the UI label differs in case from the data value.
    model_type_values = {"Open Source": 'Open source', "Proprietary": 'Proprietary'}
    wanted_type = model_type_values.get(model_type_filter)
    if wanted_type is not None:
        filtered_df = filtered_df[filtered_df['Model Type'] == wanted_type]

    # Reasoning filtering: the UI label equals the 'Output Type' value.
    if reasoning_filter in ("Reasoning", "Normal"):
        filtered_df = filtered_df[filtered_df['Output Type'] == reasoning_filter]

    # Map display name to actual column name using shared mapping.
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)

    # With a domain selected, sort by that domain's metric instead of the
    # overall average.
    if domain_filter_clean != "All":
        domain_suffixes = {
            "Avg AC": "AC",
            "Avg TSQ": "TSQ",
            "Avg Total Cost": "Cost",
            "Avg Session Duration": "Duration",
            "Avg Turns": "Turns",
        }
        suffix = domain_suffixes.get(actual_sort_column)
        if suffix is not None:
            actual_sort_column = f"{domain_filter_clean} {suffix}"

    if actual_sort_column and actual_sort_column in filtered_df.columns:
        ascending = (sort_order == "Ascending")
        as_numbers = pd.to_numeric(filtered_df[actual_sort_column], errors='coerce')
        if as_numbers.notna().any():
            # Missing metrics are '' in the frame; comparing them with floats
            # would raise, so sort on a numeric view where they become NaN and
            # are placed last.
            filtered_df = filtered_df.sort_values(
                by=actual_sort_column,
                ascending=ascending,
                na_position='last',
                key=lambda column: pd.to_numeric(column, errors='coerce'),
            )
        else:
            # Text column (or nothing numeric to compare): plain sort.
            filtered_df = filtered_df.sort_values(
                by=actual_sort_column, ascending=ascending, na_position='last'
            )

    # Generate HTML table
    return generate_html_table(filtered_df, domain_filter_clean)
# Load initial data
initial_table = filter_and_sort_data("🌐 All", "All", "All", "Avg AC", "Descending")
initial_df = load_leaderboard_data() # Load raw data for model selector
# Load custom CSS and responsive styles
custom_css = get_leaderboard_css() + get_responsive_styles() + """
<style>
/* Page-specific styles for leaderboard v2 */
/* Metric header styles with info icons */
.metric-header {
cursor: help;
display: inline-flex;
align-items: center;
gap: 6px;
}
.info-icon {
color: var(--accent-secondary);
font-size: 1em;
opacity: 0.8;
transition: opacity 0.2s ease;
font-weight: normal;
}
.metric-header:hover .info-icon {
opacity: 1;
}
/* Native tooltip styling */
.v2-styled-table th[title] {
cursor: help;
}
/* Custom tooltip using CSS only */
[data-tooltip] {
position: relative;
cursor: help;
}
[data-tooltip]::before {
content: attr(data-tooltip);
position: absolute;
bottom: 100%;
left: 50%;
transform: translateX(-50%);
background: rgba(26, 26, 46, 0.95);
color: #f5f6f7;
padding: 8px 12px;
border-radius: 6px;
font-size: 12px;
white-space: nowrap;
max-width: 300px;
z-index: 10000;
opacity: 0;
pointer-events: none;
transition: opacity 0.3s;
margin-bottom: 5px;
border: 1px solid rgba(16, 152, 247, 0.3);
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.8);
}
[data-tooltip]:hover::before {
opacity: 1;
}
/* Dark theme table styling */
.v2-table-container {
background: var(--bg-card);
border-radius: 16px;
overflow: visible; /* Changed from hidden to visible for tooltips */
border: 1px solid var(--border-subtle);
margin-top: 20px;
position: relative;
}
.v2-styled-table {
width: 100%;
border-collapse: collapse;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
background: var(--bg-card);
color: var(--text-primary);
}
.v2-styled-table thead {
position: sticky;
top: 0;
background: rgba(227, 84, 84, 0.1);
z-index: 1;
}
.v2-styled-table th {
padding: 14px 12px;
text-align: left;
font-weight: 600;
color: var(--text-primary);
border-bottom: 2px solid var(--accent-primary);
font-size: 14px;
text-transform: uppercase;
letter-spacing: 0.05em;
position: relative; /* Added for tooltip positioning */
}
.v2-styled-table td {
padding: 12px;
border-bottom: 1px solid var(--border-subtle);
color: var(--text-primary);
font-size: 14px;
transition: all 0.2s ease;
}
.v2-styled-table tbody tr {
transition: all 0.3s ease;
}
.v2-styled-table tbody tr:hover {
background: rgba(227, 84, 84, 0.15) !important;
box-shadow: 0 0 20px rgba(227, 84, 84, 0.3), inset 0 0 20px rgba(227, 84, 84, 0.1);
transform: scale(1.01);
}
.v2-styled-table tbody tr:nth-child(even) {
background: var(--bg-secondary);
}
.model-name {
font-weight: 500;
color: var(--accent-primary);
font-size: 14px;
transition: color 0.2s ease;
}
.v2-styled-table tr:hover .model-name {
color: var(--accent-secondary);
}
.numeric-cell {
font-family: 'Geist Mono', monospace;
font-size: 14px;
text-align: center;
}
.score-cell {
min-width: 180px;
}
</style>
<script>
// Function to update radio button styling
function updateRadioStyling() {
// Remove selected class from all labels first
document.querySelectorAll('.selected').forEach(function(label) {
label.classList.remove('selected');
});
// Apply selected class to checked radio buttons
document.querySelectorAll('input[type="radio"]:checked').forEach(function(input) {
var label = input.closest('label');
if (label) {
label.classList.add('selected');
// For domain radio buttons, apply special styling
if (label.closest('.domain-radio')) {
label.style.background = 'linear-gradient(145deg, rgba(227, 84, 84, 0.2), rgba(227, 84, 84, 0.1))';
label.style.borderColor = 'var(--accent-primary)';
label.style.transform = 'scale(1.05)';
label.style.fontWeight = '600';
}
}
});
}
// Wait for Gradio to initialize
function initializeRadioStyles() {
updateRadioStyling();
// Create observer to watch for changes
var observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
if (mutation.type === 'attributes' && mutation.attributeName === 'checked') {
updateRadioStyling();
}
});
});
// Observe all radio inputs
document.querySelectorAll('input[type="radio"]').forEach(function(radio) {
observer.observe(radio, { attributes: true });
});
}
// Try multiple initialization strategies
document.addEventListener('DOMContentLoaded', function() {
setTimeout(initializeRadioStyles, 100);
setTimeout(initializeRadioStyles, 500);
setTimeout(initializeRadioStyles, 1000);
});
// Also check when window loads
window.addEventListener('load', function() {
setTimeout(initializeRadioStyles, 100);
});
// Listen for Gradio's custom events
document.addEventListener('gradio:loaded', initializeRadioStyles);
</script>
"""
gr.HTML(custom_css)
# Header button above title
gr.HTML("""
<style>
/* Enhanced button styling with better gradio compatibility */
.custom-button-container {
text-align: center;
padding: 20px 0 10px 0;
margin-bottom: 10px;
}
.header-action-button {
display: inline-block !important;
padding: 14px 28px !important;
background: linear-gradient(135deg, #E35454 0%, #C84545 100%) !important;
color: #FFFFFF !important;
text-decoration: none !important;
border-radius: 16px !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 700 !important;
font-size: 1.1rem !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
border: none !important;
cursor: pointer !important;
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.4), 0 4px 12px rgba(0, 0, 0, 0.3) !important;
position: relative !important;
overflow: hidden !important;
text-shadow: 0 1px 2px rgba(0, 0, 0, 0.3) !important;
}
.header-action-button::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
transition: left 0.6s;
}
.header-action-button:hover::before {
left: 100%;
}
.header-action-button:hover {
transform: translateY(-3px) !important;
box-shadow: 0 12px 32px rgba(227, 84, 84, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important;
background: linear-gradient(135deg, #F46464 0%, #D84F4F 100%) !important;
color: #FFFFFF !important;
text-decoration: none !important;
}
.header-action-button:active {
transform: translateY(-1px) !important;
}
.action-button-icon {
font-size: 1.2rem !important;
margin-right: 8px !important;
filter: drop-shadow(0 0 8px rgba(255, 255, 255, 0.3));
}
/* Navigation buttons styling */
.nav-buttons-container {
display: flex;
justify-content: center;
align-items: center;
gap: 16px;
flex-wrap: wrap;
margin: 24px 0;
padding: 0 20px;
}
.nav-link-button {
display: inline-flex !important;
align-items: center !important;
gap: 8px !important;
padding: 12px 20px !important;
background: rgba(1, 9, 26, 0.8) !important;
color: #F5F6F7 !important;
text-decoration: none !important;
border-radius: 12px !important;
font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important;
font-weight: 600 !important;
font-size: 0.95rem !important;
transition: all 0.3s ease !important;
border: 2px solid rgba(245, 246, 247, 0.15) !important;
backdrop-filter: blur(10px) !important;
-webkit-backdrop-filter: blur(10px) !important;
position: relative !important;
overflow: hidden !important;
box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important;
}
.nav-link-button::before {
content: '';
position: absolute;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: linear-gradient(135deg, rgba(227, 84, 84, 0.1) 0%, rgba(16, 152, 247, 0.1) 100%);
opacity: 0;
transition: opacity 0.3s ease;
}
.nav-link-button:hover::before {
opacity: 1;
}
.nav-link-button:hover {
transform: translateY(-3px) scale(1.02) !important;
border-color: #E35454 !important;
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.3), 0 4px 12px rgba(0, 0, 0, 0.4) !important;
text-decoration: none !important;
color: #FFFFFF !important;
}
.nav-link-button.primary-nav {
background: linear-gradient(135deg, #1098F7 0%, #0A6BC4 100%) !important;
border-color: #1098F7 !important;
color: #FFFFFF !important;
font-weight: 700 !important;
}
.nav-link-button.primary-nav:hover {
background: linear-gradient(135deg, #2AA8FF 0%, #0550A0 100%) !important;
border-color: #2AA8FF !important;
box-shadow: 0 8px 24px rgba(16, 152, 247, 0.4), 0 4px 12px rgba(0, 0, 0, 0.4) !important;
color: #FFFFFF !important;
}
.nav-button-icon {
font-size: 1.1rem !important;
filter: drop-shadow(0 0 6px currentColor);
}
/* Responsive design */
@media (max-width: 768px) {
.nav-buttons-container {
gap: 12px;
padding: 0 10px;
}
.nav-link-button {
font-size: 0.85rem !important;
padding: 10px 16px !important;
}
.header-action-button {
font-size: 1rem !important;
padding: 12px 24px !important;
}
}
@media (max-width: 480px) {
.nav-buttons-container {
flex-direction: column;
gap: 8px;
}
.nav-link-button {
width: 200px;
justify-content: center;
}
}
</style>
<div class="custom-button-container">
<a href="https://app.galileo.ai/sign-up?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="header-action-button">
<span class="action-button-icon">πŸš€</span>Evaluate your GenAI for free
</a>
</div>
""")
gr.HTML("""
<div style="text-align: center; padding: 20px 0;">
<h1 style="font-size: 3rem; margin-bottom: 12px; color: var(--text-primary);
text-shadow: 0 0 20px rgba(227, 84, 84, 0.3); font-family: 'Geist', sans-serif; font-weight: 800;">
πŸš€ Galileo Agent Leaderboard v2
</h1>
<p style="color: var(--text-secondary); font-size: 1.2rem; margin-top: 0; font-family: 'Geist', sans-serif;">
Comprehensive performance metrics for LLM agents across business domains
</p>
</div>
""")
# Links section below title
gr.HTML("""
<div class="nav-buttons-container">
<a href="http://galileo.ai/blog/agent-leaderboard-v2" target="_blank" class="nav-link-button">
<span class="nav-button-icon">πŸ“</span>
Blog
</a>
<a href="https://galileo.ai/mastering-agents-ebook?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="nav-link-button">
<span class="nav-button-icon">πŸ“š</span>
eBook
</a>
<a href="https://github.com/rungalileo/agent-leaderboard" target="_blank" class="nav-link-button">
<span class="nav-button-icon">πŸ™</span>
GitHub
</a>
<a href="https://huggingface.co/datasets/galileo-ai/agent-leaderboard-v2" target="_blank" class="nav-link-button">
<span class="nav-button-icon">πŸ€—</span>
Dataset
</a>
<a href="https://huggingface.co/spaces/galileo-ai/agent-leaderboard/discussions/new" target="_blank" class="nav-link-button">
<span class="nav-button-icon">βž•</span>
Add Model
</a>
</div>
""")
# Metrics overview cards with insights
gr.HTML("""
<div style="margin-bottom: 40px;">
<!-- Ultra-modern metric cards with advanced styling -->
<style>
.insight-card {
background: linear-gradient(145deg, rgba(245, 246, 247, 0.03) 0%, rgba(227, 84, 84, 0.08) 100%);
border-radius: 16px;
padding: 20px;
position: relative;
border: 1px solid var(--border-subtle);
transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
overflow: hidden;
backdrop-filter: blur(20px);
-webkit-backdrop-filter: blur(20px);
}
.insight-card::before {
content: '';
position: absolute;
inset: 0;
border-radius: 24px;
padding: 1px;
background: linear-gradient(145deg, var(--border-subtle), var(--border-default));
-webkit-mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0);
-webkit-mask-composite: source-out;
mask-composite: subtract;
pointer-events: none;
}
.insight-card::after {
content: '';
position: absolute;
top: -100%;
left: -100%;
width: 300%;
height: 300%;
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
opacity: 0;
transition: opacity 0.6s ease, transform 0.6s ease;
pointer-events: none;
}
.insight-card:hover::after {
opacity: 0.15;
transform: translate(50%, 50%);
}
.insight-card:hover {
transform: translateY(-8px);
border-color: var(--accent-primary);
box-shadow:
0 24px 48px rgba(227, 84, 84, 0.2),
0 12px 24px rgba(0, 0, 0, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.1);
}
.insight-card.secondary-accent:hover {
border-color: var(--accent-primary);
}
.insight-card.secondary-accent::after {
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
}
.insight-card.tertiary-accent:hover {
border-color: var(--accent-primary);
}
.insight-card.tertiary-accent::after {
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
}
.card-header {
display: flex;
align-items: center;
gap: 8px;
margin-bottom: 12px;
}
.card-icon {
display: flex;
align-items: center;
justify-content: center;
font-size: 2rem;
margin-right: 8px;
}
.card-title {
flex: 1;
}
.card-label {
font-family: 'Geist Mono', monospace;
font-size: 0.7rem;
letter-spacing: 0.05em;
text-transform: uppercase;
color: var(--text-secondary);
margin-bottom: 2px;
}
.card-value {
font-family: 'Geist', sans-serif;
font-size: 1.1rem;
font-weight: 700;
color: var(--text-primary);
line-height: 1.1;
}
.insight-list {
list-style: none;
padding: 0;
margin: 0;
}
.insight-list li {
margin-bottom: 8px;
}
.insight-item {
display: flex;
align-items: center;
gap: 8px;
padding: 8px 10px;
background: rgba(245, 246, 247, 0.03);
border-radius: 8px;
border: 1px solid var(--border-subtle);
transition: all 0.3s ease;
}
.insight-item:hover {
background: rgba(227, 84, 84, 0.1);
border-color: var(--accent-primary);
transform: translateX(4px);
}
.insight-icon {
font-size: 1rem;
flex-shrink: 0;
}
.insight-text {
flex: 1;
font-size: 0.85rem;
line-height: 1.3;
color: var(--text-secondary);
}
.highlight {
color: var(--text-primary);
font-weight: 600;
}
.badge-row {
display: flex;
gap: 6px;
margin-top: 10px;
flex-wrap: wrap;
}
.badge {
padding: 4px 10px;
background: rgba(245, 246, 247, 0.05);
border: 1px solid var(--border-subtle);
border-radius: 16px;
font-size: 0.75rem;
color: var(--text-secondary);
transition: all 0.2s ease;
display: flex;
align-items: center;
gap: 4px;
}
.badge:hover {
background: rgba(227, 84, 84, 0.15);
border-color: var(--accent-primary);
color: var(--text-primary);
transform: scale(1.05);
}
.badge-icon {
font-size: 0.85rem;
}
@keyframes float {
0%, 100% { transform: translateY(0); }
50% { transform: translateY(-5px); }
}
.floating-icon {
animation: float 3s ease-in-out infinite;
}
/* Tertiary color for special elements */
.tertiary-color {
color: var(--accent-tertiary);
}
</style>
<!-- First row: Five key insight cards -->
<div class="insight-card-grid">
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
🎯
</div>
</div>
<div class="card-value">Task Completion</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Compare models based on their ability to complete real-world business tasks accurately and efficiently
</div>
</div>
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
πŸ’‘
</div>
</div>
<div class="card-value">Tool Selection</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Analyze how precisely models choose the right tools for each task and make optimal decisions
</div>
</div>
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
πŸ’°
</div>
</div>
<div class="card-value">Cost Efficiency</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Find models that deliver the best performance per dollar spent and optimize your ROI
</div>
</div>
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
πŸ›οΈ
</div>
</div>
<div class="card-value">Domain Coverage</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Banking, Healthcare, Insurance, Investment, and Telecom industries analyzed for specialized performance
</div>
</div>
<div class="insight-card">
<div class="card-header">
<div class="card-icon floating-icon" style="color: var(--accent-primary);">
πŸš€
</div>
</div>
<div class="card-value">Speed vs Accuracy</div>
<div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;">
Understand the trade-offs between response time and accuracy to find the right balance
</div>
</div>
</div>
<!-- Second row: Key features showcase -->
<div class="metric-card-grid" style="margin-top: 16px;">
<div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);">
<div class="card-value">Model Capabilities</div>
<div class="badge-row" style="margin-top: 16px;">
<div class="badge">
<span class="badge-icon">πŸ”“</span>
<span>Open Source</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ”’</span>
<span>Proprietary</span>
</div>
<div class="badge">
<span class="badge-icon">🧠</span>
<span>Reasoning</span>
</div>
</div>
</div>
<div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);">
<div class="card-value">Interactive Visualizations</div>
<div class="badge-row" style="margin-top: 16px;">
<div class="badge">
<span class="badge-icon">πŸ•ΈοΈ</span>
<span>Radar Charts</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ“Š</span>
<span>Heatmaps</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ“ˆ</span>
<span>Scatter Plots</span>
</div>
</div>
</div>
<div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);">
<div class="card-value">Real-World Performance</div>
<div class="badge-row" style="margin-top: 16px;">
<div class="badge">
<span class="badge-icon">πŸ’Ό</span>
<span>Business Tasks</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ”„</span>
<span>Multi-Turn</span>
</div>
<div class="badge">
<span class="badge-icon">πŸ“‹</span>
<span>Benchmarks</span>
</div>
</div>
</div>
</div>
</div>
""")
# Domain filter section with enhanced styling
gr.HTML("""
<style>
/* Enhanced domain selector styling */
.domain-selector-container {
background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);
border-radius: 20px;
padding: 32px;
margin-bottom: 32px;
border: 1px solid var(--border-subtle);
position: relative;
overflow: hidden;
box-shadow:
0 8px 32px rgba(0, 0, 0, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.05);
}
.domain-selector-container::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
opacity: 0.1;
animation: pulse 4s ease-in-out infinite;
}
.domain-header {
text-align: center;
margin-bottom: 28px;
position: relative;
z-index: 1;
}
.domain-title {
font-size: 2rem;
font-weight: 800;
background: linear-gradient(90deg, var(--accent-primary), var(--accent-secondary));
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 8px;
text-shadow: 0 0 30px var(--glow-primary);
}
.domain-subtitle {
color: var(--text-secondary);
font-size: 1.2rem;
font-family: 'Geist', sans-serif;
}
/* Custom radio button styling */
.domain-radio {
display: flex !important;
gap: 12px !important;
flex-wrap: wrap !important;
justify-content: center !important;
position: relative;
z-index: 1;
}
/* Gradio radio button wrapper */
.domain-radio .wrap {
display: flex !important;
gap: 12px !important;
flex-wrap: wrap !important;
justify-content: center !important;
width: 100% !important;
}
.domain-radio label,
.domain-radio .wrap > label {
flex: 1 !important;
min-width: 160px !important;
max-width: 200px !important;
padding: 16px 24px !important;
background: var(--bg-card) !important;
border: 2px solid var(--border-default) !important;
border-radius: 16px !important;
cursor: pointer !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
text-align: center !important;
position: relative !important;
overflow: hidden !important;
}
.domain-radio label::before {
content: '';
position: absolute;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: linear-gradient(145deg, transparent, var(--glow-primary));
opacity: 0;
transition: opacity 0.3s ease;
pointer-events: none;
}
.domain-radio label:hover {
transform: translateY(-2px) !important;
border-color: var(--accent-primary) !important;
box-shadow:
0 8px 24px rgba(227, 84, 84, 0.3),
inset 0 0 20px rgba(227, 84, 84, 0.1) !important;
}
.domain-radio label:hover::before {
opacity: 0.1;
}
.domain-radio input[type="radio"] {
display: none !important;
}
.domain-radio input[type="radio"]:checked + label,
.domain-radio .wrap > label:has(input[type="radio"]:checked),
.domain-radio label.selected {
background: transparent !important;
border-color: var(--accent-primary) !important;
color: var(--text-primary) !important;
font-weight: 600 !important;
transform: scale(1.05) !important;
box-shadow:
0 12px 32px rgba(227, 84, 84, 0.4),
0 0 60px rgba(227, 84, 84, 0.2) !important;
}
.domain-radio input[type="radio"]:checked + label::before {
opacity: 0.2;
}
/* Domain icons */
.domain-icon {
font-size: 1.5rem;
margin-bottom: 4px;
display: block;
filter: drop-shadow(0 0 10px currentColor);
}
.domain-name {
font-size: 0.95rem;
font-weight: 500;
margin-top: 4px;
}
/* Badge for domain counts */
.domain-count {
position: absolute;
top: 8px;
right: 8px;
background: var(--accent-primary);
color: white;
font-size: 0.75rem;
padding: 2px 8px;
border-radius: 12px;
font-weight: 600;
opacity: 0.8;
}
/* Filter radio buttons styling - smaller for better fit */
.filter-radio {
max-width: 100% !important;
}
.filter-radio .gr-row {
gap: 8px !important;
}
.filter-radio .gr-column {
min-width: 0 !important;
flex: 1 !important;
}
.filter-radio .gr-form {
min-width: 0 !important;
}
.filter-radio .gr-radio-group {
gap: 4px !important;
}
.filter-radio .domain-radio {
display: flex !important;
gap: 4px !important;
flex-wrap: nowrap !important;
justify-content: center !important;
}
.filter-radio .domain-radio label {
min-width: auto !important;
max-width: 120px !important;
padding: 8px 12px !important;
font-size: 0.8rem !important;
white-space: nowrap !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
}
/* Additional targeting for the specific filter components */
.filter-radio .gr-box {
padding: 8px !important;
}
.filter-radio .gr-radio {
gap: 4px !important;
}
.filter-radio .gr-input-label {
font-size: 0.85rem !important;
margin-bottom: 4px !important;
}
/* Force compact layout for the filters */
@media (max-width: 1400px) {
.filter-radio .domain-radio label {
padding: 6px 10px !important;
font-size: 0.75rem !important;
}
}
/* Compact filter row styling */
.compact-filter-row {
margin-bottom: 20px !important;
}
.compact-filter-row .gr-column {
padding: 0 8px !important;
}
.compact-filter-row .gr-box {
padding: 0 !important;
}
/* Compact radio button styling */
.compact-radio {
width: 100% !important;
}
.compact-radio > label {
font-size: 0.85rem !important;
margin-bottom: 8px !important;
font-weight: 600 !important;
color: var(--text-primary) !important;
display: block !important;
}
.compact-radio .wrap {
display: flex !important;
flex-wrap: nowrap !important;
gap: 4px !important;
justify-content: center !important;
}
.compact-radio .wrap > label {
display: inline-flex !important;
align-items: center !important;
justify-content: center !important;
padding: 6px 10px !important;
margin: 0 !important;
background: var(--bg-card) !important;
border: 1px solid var(--border-default) !important;
border-radius: 8px !important;
cursor: pointer !important;
transition: all 0.2s ease !important;
font-size: 0.75rem !important;
white-space: nowrap !important;
flex: 1 !important;
min-width: 0 !important;
overflow: hidden !important;
text-overflow: ellipsis !important;
}
.compact-radio .wrap > label:has(input[type="radio"]:checked) {
background: transparent !important;
border-color: var(--accent-primary) !important;
color: var(--text-primary) !important;
font-weight: 600 !important;
}
.compact-radio .wrap > label:hover {
background: rgba(227, 84, 84, 0.1) !important;
border-color: var(--accent-primary) !important;
transform: scale(1.02) !important;
}
.compact-radio input[type="radio"] {
display: none !important;
}
/* Target Gradio's data attributes for selected state */
.compact-radio label[data-selected="true"],
.compact-radio label[aria-checked="true"],
.domain-radio label[data-selected="true"],
.domain-radio label[aria-checked="true"] {
background: transparent !important;
border-color: var(--accent-primary) !important;
color: var(--text-primary) !important;
font-weight: 600 !important;
}
/* Sort by radio buttons */
.sort-by-radio .domain-radio {
display: flex !important;
gap: 10px !important;
flex-wrap: wrap !important;
justify-content: flex-start !important;
}
.sort-by-radio .domain-radio .wrap {
display: flex !important;
gap: 10px !important;
flex-wrap: wrap !important;
justify-content: flex-start !important;
width: 100% !important;
}
.sort-by-radio .domain-radio label,
.sort-by-radio .domain-radio .wrap > label {
min-width: 180px !important;
max-width: 220px !important;
padding: 12px 20px !important;
font-size: 0.95rem !important;
}
</style>
<div class="domain-selector-container">
<div class="domain-header">
<h2 class="domain-title">πŸ›οΈ Select Business Domain</h2>
<p class="domain-subtitle">Choose a domain to see specialized agent performance</p>
</div>
""")
# Domain selector. Each entry is (filter key, icon, display name).
domain_choices = [
    ("All", "🌐", "All Domains"),
    ("Banking", "🏦", "Banking"),
    ("Healthcare", "πŸ₯", "Healthcare"),
    ("Insurance", "πŸ›‘οΈ", "Insurance"),
    ("Investment", "πŸ’°", "Investment"),
    ("Telecom", "πŸ“±", "Telecom")
]
# Radio labels are "<icon> <key>"; the callbacks identify the domain by
# its icon prefix, so the labels are built from the table above.
domain_radio_labels = [f"{icon} {key}" for key, icon, _display_name in domain_choices]
with gr.Row():
    domain_filter = gr.Radio(
        label="",
        choices=domain_radio_labels,
        value=domain_radio_labels[0],  # the "All" entry is selected initially
        elem_classes=["domain-radio"],
        interactive=True,
    )
gr.HTML("""
</div>
""")
# Filter controls with enhanced styling
gr.HTML("""
<div class="dark-container" style="margin-bottom: 24px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">πŸ”</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Filters & Sorting
</h3>
</div>
""")
# First row: three equally weighted columns, one compact radio group each.
# Every group starts on its first choice.
compact_radio_specs = [
    ("πŸ”“ Model Access", ["All", "Open Source", "Proprietary"]),
    ("🧠 Output Type", ["All", "Reasoning", "Normal"]),
    ("πŸ”„ Sort Order", ["Descending", "Ascending"]),
]
compact_radios = []
with gr.Row(elem_classes=["compact-filter-row"]):
    for radio_label, radio_choices in compact_radio_specs:
        with gr.Column(scale=1):
            compact_radios.append(
                gr.Radio(
                    choices=radio_choices,
                    value=radio_choices[0],
                    label=radio_label,
                    elem_classes=["compact-radio"],
                )
            )
model_type_filter, reasoning_filter, sort_order = compact_radios

# Second row: heading plus the metric the table is sorted by.
sort_by_heading_html = """<div style="margin-top: 20px; margin-bottom: 10px;">
<h4 style="color: var(--text-primary); font-size: 1.1rem; font-weight: 600; margin: 0;">πŸ“Š Sort By</h4>
</div>"""
gr.HTML(sort_by_heading_html)
# NOTE(review): this opening tag and the closing one below are separate
# HTML components; confirm they really wrap the radio in the rendered DOM,
# otherwise the `.sort-by-radio ...` CSS rules cannot match.
gr.HTML('<div class="sort-by-radio">')
sort_metric_labels = [
    "Avg Action Completion",
    "Avg Tool Selection Quality",
    "Avg Session Cost",
    "Avg Session Duration",
    "Avg Turns",
]
sort_by = gr.Radio(
    label="",
    choices=sort_metric_labels,
    value=sort_metric_labels[0],
    elem_classes=["domain-radio"],
)
gr.HTML('</div>')
gr.HTML("</div>")
# Main leaderboard table with dynamic title
leaderboard_title = gr.HTML("""
<div class="dark-container pulse" style="margin-bottom: 24px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">πŸ“ˆ</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Agent Leaderboard for All
</h3>
</div>
<div class="dataframe-container">
""")
leaderboard_table = gr.HTML(initial_table)
gr.HTML("""
</div>
</div>""")
# Evaluate Your Agents Button
gr.HTML("""
<div style="text-align: center; margin-top: 32px; margin-bottom: 32px;">
<a href="https://app.galileo.ai/sign-up?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank"
style="display: inline-flex; align-items: center; gap: 12px; padding: 16px 40px;
background: linear-gradient(135deg, #E35454 0%, #F06B6B 100%);
border: none;
border-radius: 16px;
color: white;
text-decoration: none;
font-size: 1.1rem;
font-family: 'Geist', sans-serif;
font-weight: 600;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.35), 0 2px 8px rgba(0, 0, 0, 0.1);
transform: translateY(0);">
<span style="font-size: 1.3rem;">πŸš€</span>
<span>Evaluate Your Agents</span>
<span style="font-size: 0.9rem;">β†’</span>
</a>
</div>
<style>
.dataframe-container + div a:hover {
background: linear-gradient(135deg, #D94444 0%, #E05555 100%) !important;
transform: translateY(-3px) !important;
box-shadow: 0 12px 32px rgba(227, 84, 84, 0.45), 0 4px 12px rgba(0, 0, 0, 0.15) !important;
}
</style>
""")
# Column Info Section
gr.HTML("""
<div class="dark-container" style="margin-top: 24px; margin-bottom: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">πŸ“‹</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Column Explanations
</h3>
</div>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin-top: 20px;">
<!-- Performance Metrics -->
<div class="info-box" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.03) 100%);">
<h4 style="color: var(--accent-primary); margin-top: 0; margin-bottom: 16px; font-size: 1.1rem; font-family: 'Geist', sans-serif; font-weight: 600; display: flex; align-items: center; gap: 8px;">
<span style="font-size: 1.3rem;">🎯</span>
Performance Metrics
</h4>
<div style="space-y: 12px;">
<div style="margin-bottom: 12px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
πŸ“Š Action Completion
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4; margin-bottom: 6px;">
Measures how well the agent accomplishes user goals and completes tasks successfully.
</div>
<a href="https://v2docs.galileo.ai/concepts/metrics/agentic/action-completion" target="_blank"
style="color: var(--accent-primary); text-decoration: none; font-size: 0.85rem; display: inline-flex; align-items: center; gap: 4px;">
πŸ“– Learn more about Action Completion
<span style="font-size: 0.7rem;">β†—</span>
</a>
</div>
<div style="border-top: 1px solid var(--border-subtle); padding-top: 12px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
πŸ› οΈ Tool Selection Quality
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4; margin-bottom: 6px;">
Evaluates the accuracy of selecting the right tools and using them with correct parameters.
</div>
<a href="https://v2docs.galileo.ai/concepts/metrics/agentic/tool-selection-quality" target="_blank"
style="color: var(--accent-primary); text-decoration: none; font-size: 0.85rem; display: inline-flex; align-items: center; gap: 4px;">
πŸ“– Learn more about Tool Selection Quality
<span style="font-size: 0.7rem;">β†—</span>
</a>
</div>
</div>
</div>
<!-- Session-Level Metrics -->
<div class="info-box" style="background: linear-gradient(145deg, rgba(16, 152, 247, 0.05) 0%, rgba(245, 246, 247, 0.03) 100%);">
<h4 style="color: var(--accent-secondary); margin-top: 0; margin-bottom: 16px; font-size: 1.1rem; font-family: 'Geist', sans-serif; font-weight: 600; display: flex; align-items: center; gap: 8px;">
<span style="font-size: 1.3rem;">πŸ“ˆ</span>
Session-Level Metrics
</h4>
<div style="space-y: 10px;">
<div style="margin-bottom: 10px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
πŸ’° Avg Cost ($)
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;">
Average cost per conversation session, including all API calls and processing.
</div>
</div>
<div style="margin-bottom: 10px; border-top: 1px solid var(--border-subtle); padding-top: 10px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
⚑ Avg Duration (s)
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;">
Average time taken to complete a full conversation session from start to finish.
</div>
</div>
<div style="border-top: 1px solid var(--border-subtle); padding-top: 10px;">
<div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;">
πŸ’¬ Avg Turns
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;">
Average number of back-and-forth exchanges needed to complete a task.
</div>
</div>
</div>
</div>
</div>
<!-- Additional Notes -->
<div style="margin-top: 24px; padding: 16px; background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle); border-radius: 12px;">
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;">
<span style="font-size: 1.1rem;">πŸ’‘</span>
<span style="font-weight: 600; color: var(--text-primary); font-size: 0.95rem;">Default Sorting</span>
</div>
<div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;">
The table is sorted by <strong style="color: var(--text-primary);">Action Completion</strong> in descending order by default, showing the best-performing models first. You can change the sorting using the filters above.
</div>
</div>
</div>
""")
# AI Agent Reliability Prediction Section
gr.HTML("""
<div class="dark-container" style="margin-bottom: 24px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">πŸ“ˆ</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Enterprise Readiness Prediction
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
When will AI agents reach 99% reliability for enterprise deployment?
</p>
""")
# Seed the prediction section with the unfiltered view ("All" domains and
# "All" model types); the callbacks replace it when those filters change.
initial_prediction = create_ac_prediction_chart(
    load_leaderboard_data(),
    domain_filter="All",
    model_type_filter="All",
)
(
    initial_prediction_chart,
    initial_date_99,
    initial_months_to_99,
    initial_best_ac,
) = initial_prediction
# Add dynamic insights section with visual cards
def generate_insight_html(date_99, months_to_99, domain_filter="All", model_type_filter="All", current_best_ac=None):
    """Generate insight HTML with visual cards/badges for the prediction section.

    Args:
        date_99: Projected date (anything with ``strftime``) at which the 99%
            AC threshold is reached, or a falsy value when no projection exists.
        months_to_99: Months remaining until ``date_99``; ``None`` when no
            projection exists, ``<= 0`` when the threshold is already met.
        domain_filter: Domain radio label. Either an emoji-prefixed label as
            produced by the domain radio, or a plain name such as ``"All"``.
        model_type_filter: Model access filter; ``"All"`` means unfiltered.
        current_best_ac: Best AC score as a fraction (0-1), or ``None`` when
            unknown.

    Returns:
        One HTML string: the "Key Predictions" cards (months remaining > 0),
        the "Enterprise Ready" banner (months remaining <= 0), or the
        "Insufficient Data" notice (no projection).
    """
    # Clean up filter names: the radio labels carry an emoji prefix.
    emoji_to_domain = (
        ('🌐', "All Domains"),
        ('🏦', "Banking"),
        ('πŸ₯', "Healthcare"),
        ('πŸ›‘οΈ', "Insurance"),
        ('πŸ’°', "Investment"),
        ('πŸ“±', "Telecom"),
    )
    domain_clean = domain_filter
    for emoji, domain_name in emoji_to_domain:
        if domain_filter.startswith(emoji):
            domain_clean = domain_name
            break
    # The initial render passes the plain value "All" (no emoji). Treat it as
    # the unfiltered view, otherwise a bogus "All" filter badge is shown.
    if domain_clean == "All":
        domain_clean = "All Domains"

    is_filtered = domain_clean != "All Domains" or model_type_filter != "All"

    filter_badge = ""
    if is_filtered:
        parts = []
        if domain_clean != "All Domains":
            parts.append(domain_clean)
        if model_type_filter != "All":
            parts.append(f"{model_type_filter} Models")
        filter_badge = f"""
<span style="
display: inline-flex;
align-items: center;
gap: 4px;
padding: 4px 10px;
background: rgba(16, 152, 247, 0.15);
border: 1px solid rgba(16, 152, 247, 0.3);
border-radius: 20px;
font-size: 0.85rem;
font-weight: 600;
color: var(--accent-primary);
margin-left: 12px;
">
<span style="font-size: 0.9rem;">πŸ”</span>
{' β€’ '.join(parts)}
</span>
"""

    # No projection available. `months_to_99` is compared against None rather
    # than tested for truthiness so that 0 ("threshold reached now") is not
    # mistaken for missing data.
    if not date_99 or months_to_99 is None:
        if is_filtered:
            insufficient_detail = 'More models need to be evaluated in this category to generate reliable predictions.'
        else:
            insufficient_detail = 'As more models are released and evaluated, our predictions will become more accurate.'
        return f"""
<div style="margin-bottom: 24px;">
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 16px;">
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-weight: 700;">
Prediction Status
</h3>
{filter_badge}
</div>
<div style="
padding: 20px;
background: rgba(255, 193, 7, 0.08);
border: 1px solid rgba(255, 193, 7, 0.2);
border-radius: 16px;
display: flex;
align-items: center;
gap: 16px;
">
<span style="font-size: 2.5rem;">πŸ“Š</span>
<div>
<h4 style="margin: 0 0 8px 0; color: var(--text-primary); font-size: 1.1rem; font-weight: 700;">
Insufficient Data for Predictions
</h4>
<p style="color: var(--text-secondary); margin: 0; font-size: 0.95rem; line-height: 1.5;">
{insufficient_detail}
</p>
</div>
</div>
</div>
"""

    if months_to_99 > 0:
        # Calculate percentage progress (100% progress == 99% AC).
        if current_best_ac is not None:
            # Scale from 0-99% AC to 0-100% progress
            current_progress = min(100, int((current_best_ac / 0.99) * 100))
            best_ac_display = f"{current_best_ac*100:.0f}%"
        else:
            current_progress = 85  # Fallback if not provided
            # Without a score there is nothing to format; previously this
            # path raised TypeError on `None * 100`.
            best_ac_display = "N/A"
        return f"""
<div style="margin-bottom: 24px;">
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 16px;">
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-weight: 700;">
Key Predictions
</h3>
{filter_badge}
</div>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 16px;">
<!-- Enterprise Ready Date Card -->
<div style="
padding: 20px;
background: linear-gradient(135deg, rgba(16, 152, 247, 0.1) 0%, rgba(16, 152, 247, 0.05) 100%);
border: 1px solid rgba(16, 152, 247, 0.2);
border-radius: 16px;
position: relative;
overflow: hidden;
">
<div style="
position: absolute;
top: -20px;
right: -20px;
width: 60px;
height: 60px;
background: radial-gradient(circle, rgba(16, 152, 247, 0.2) 0%, transparent 70%);
border-radius: 50%;
"></div>
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;">
<span style="font-size: 1.8rem;">πŸ“…</span>
<span style="color: var(--text-secondary); font-size: 0.9rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Target Date</span>
</div>
<div style="font-size: 1.8rem; font-weight: 800; color: var(--text-primary); margin-bottom: 4px;">
{date_99.strftime('%b %Y')}
</div>
<div style="color: var(--text-secondary); font-size: 0.95rem;">
99% AC Threshold
</div>
</div>
<!-- Time Remaining Card -->
<div style="
padding: 20px;
background: linear-gradient(135deg, rgba(227, 84, 84, 0.1) 0%, rgba(227, 84, 84, 0.05) 100%);
border: 1px solid rgba(227, 84, 84, 0.2);
border-radius: 16px;
position: relative;
overflow: hidden;
">
<div style="
position: absolute;
top: -20px;
right: -20px;
width: 60px;
height: 60px;
background: radial-gradient(circle, rgba(227, 84, 84, 0.2) 0%, transparent 70%);
border-radius: 50%;
"></div>
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;">
<span style="font-size: 1.8rem;">⏱️</span>
<span style="color: var(--text-secondary); font-size: 0.9rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Timeline</span>
</div>
<div style="font-size: 1.8rem; font-weight: 800; color: var(--text-primary); margin-bottom: 4px;">
~{months_to_99:.0f}
</div>
<div style="color: var(--text-secondary); font-size: 0.95rem;">
Months to 99% AC
</div>
</div>
<!-- Current Performance Card -->
<div style="
padding: 20px;
background: linear-gradient(135deg, rgba(40, 167, 69, 0.1) 0%, rgba(40, 167, 69, 0.05) 100%);
border: 1px solid rgba(40, 167, 69, 0.2);
border-radius: 16px;
position: relative;
overflow: hidden;
">
<div style="
position: absolute;
top: -20px;
right: -20px;
width: 60px;
height: 60px;
background: radial-gradient(circle, rgba(40, 167, 69, 0.2) 0%, transparent 70%);
border-radius: 50%;
"></div>
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;">
<span style="font-size: 1.8rem;">πŸ“ˆ</span>
<span style="color: var(--text-secondary); font-size: 0.9rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Current Best</span>
</div>
<div style="font-size: 1.8rem; font-weight: 800; color: var(--text-primary); margin-bottom: 4px;">
{best_ac_display}
</div>
<div style="color: var(--text-secondary); font-size: 0.95rem;">
AC Score Achieved
</div>
</div>
</div>
<!-- Progress Bar -->
<div style="margin-top: 20px; padding: 16px; background: rgba(255, 255, 255, 0.02); border: 1px solid var(--border-subtle); border-radius: 12px;">
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
<span style="color: var(--text-secondary); font-size: 0.9rem; font-weight: 600;">PROGRESS TO ENTERPRISE READY</span>
<span style="color: var(--accent-primary); font-size: 0.9rem; font-weight: 700;">{current_progress}%</span>
</div>
<div style="width: 100%; height: 8px; background: rgba(255, 255, 255, 0.1); border-radius: 4px; overflow: hidden;">
<div style="width: {current_progress}%; height: 100%; background: linear-gradient(90deg, var(--accent-primary) 0%, var(--accent-secondary) 100%); border-radius: 4px; transition: width 0.3s ease;"></div>
</div>
</div>
</div>
"""

    # Zero or negative months remaining: the threshold has been reached.
    return f"""
<div style="margin-bottom: 24px;">
<div style="display: flex; align-items: center; gap: 8px; margin-bottom: 16px;">
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-weight: 700;">
Achievement Status
</h3>
{filter_badge}
</div>
<div style="
padding: 24px;
background: linear-gradient(135deg, rgba(40, 167, 69, 0.15) 0%, rgba(40, 167, 69, 0.05) 100%);
border: 2px solid rgba(40, 167, 69, 0.3);
border-radius: 16px;
text-align: center;
">
<span style="font-size: 3rem; margin-bottom: 12px; display: block;">πŸŽ‰</span>
<h2 style="margin: 0 0 12px 0; color: var(--text-primary); font-size: 1.8rem; font-weight: 800;">
Enterprise Ready!
</h2>
<p style="color: var(--text-secondary); font-size: 1.1rem; margin: 0 0 20px 0;">
Models have achieved enterprise-grade reliability
</p>
<div style="display: flex; justify-content: center; gap: 12px; flex-wrap: wrap;">
<span style="
display: inline-flex;
align-items: center;
gap: 6px;
padding: 8px 16px;
background: rgba(40, 167, 69, 0.1);
border: 1px solid rgba(40, 167, 69, 0.3);
border-radius: 24px;
font-size: 0.95rem;
font-weight: 600;
color: #28a745;
">
βœ“ 99% AC Achieved
</span>
<span style="
display: inline-flex;
align-items: center;
gap: 6px;
padding: 8px 16px;
background: rgba(16, 152, 247, 0.1);
border: 1px solid rgba(16, 152, 247, 0.3);
border-radius: 24px;
font-size: 0.95rem;
font-weight: 600;
color: var(--accent-primary);
">
⚑ Ready for Scaling
</span>
<span style="
display: inline-flex;
align-items: center;
gap: 6px;
padding: 8px 16px;
background: rgba(227, 84, 84, 0.1);
border: 1px solid rgba(227, 84, 84, 0.3);
border-radius: 24px;
font-size: 0.95rem;
font-weight: 600;
color: var(--accent-secondary);
">
πŸ›‘οΈ Focus on Guardrails
</span>
</div>
</div>
</div>
"""
# Insight cards are rendered ABOVE the chart, seeded with the unfiltered
# ("All" / "All") prediction computed earlier.
initial_insights_html = generate_insight_html(
    initial_date_99,
    initial_months_to_99,
    "All",
    "All",
    initial_best_ac,
)
prediction_insights = gr.HTML(initial_insights_html)

# Prediction chart; it is an output of the filter callbacks, so it is
# kept in a variable.
gr.HTML('<div class="chart-container">')
prediction_plot = gr.Plot(
    value=initial_prediction_chart,
    label="",
    elem_classes=["prediction-chart", "plot-container"],
)
gr.HTML('</div>')
# Add methodology note
gr.HTML("""
<div style="
margin-top: 20px;
margin-bottom: 24px;
padding: 16px;
background: rgba(16, 152, 247, 0.05);
border-left: 3px solid var(--accent-primary);
border-radius: 8px;
">
<div style="display: flex; align-items: flex-start; gap: 12px;">
<span style="font-size: 1.2rem; color: var(--accent-primary); margin-top: 2px;">πŸ“Š</span>
<div>
<h4 style="margin: 0 0 8px 0; color: var(--text-primary); font-size: 1rem; font-weight: 600;">
Methodology Note
</h4>
<p style="color: var(--text-secondary); margin: 0; font-size: 0.95rem; line-height: 1.6;">
Our current prediction uses a <strong>conservative linear projection with a 50% diminishing returns factor</strong>.
This simple approach assumes that future AI improvements will occur at half the current rate, accounting for
increasing technical challenges as models approach higher performance levels.
</p>
<p style="color: var(--text-secondary); margin: 8px 0 0 0; font-size: 0.95rem; line-height: 1.6;">
<strong>Why this approach?</strong> With limited data points currently available, complex curve fitting
(exponential, logistic) would lead to overfitting. As we evaluate more models and gather additional data points,
we will refine our methodology to incorporate more sophisticated growth models that better capture the true
trajectory of AI agent capabilities.
</p>
<p style="color: var(--accent-primary); margin: 8px 0 0 0; font-size: 0.9rem; font-weight: 600;">
πŸ”„ This projection will automatically update as new models are added to the leaderboard.
</p>
</div>
</div>
</div>
""")
gr.HTML("</div>")
# Radar Chart Section
gr.HTML("""
<div class="dark-container" style="margin-bottom: 24px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">πŸ•ΈοΈ</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Domain Performance Analysis
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">Compare model performance across different business domains</p>
""")
# Model names in the order of the initially rendered table.
initial_model_names = initial_df['Model'].tolist()

with gr.Row():
    with gr.Column(scale=1):
        # Offer the first ten models and pre-select the first five.
        model_selector = gr.Dropdown(
            label="🎯 Select Models for Comparison",
            info="Choose up to 5 models to visualize",
            choices=initial_model_names[:10],
            value=initial_model_names[:5],
            multiselect=True,
            elem_classes=["dropdown"],
        )

# Radar chart plot - wrapped in centered container
initial_radar_figure = create_domain_radar_chart(
    load_leaderboard_data(),
    "Avg AC",
    initial_model_names[:5],
)
gr.HTML('<div class="chart-container">')
radar_chart = gr.Plot(
    value=initial_radar_figure,
    label="",
    elem_classes=["radar-chart", "plot-container"],
)
gr.HTML('</div>')
gr.HTML("</div>")
# Update functions
def get_optimal_sort_order(sort_by_value):
    """Return the sort direction that lists the best models first.

    Cost, duration and turn count are better when smaller, so they sort
    "Ascending". The two quality scores, and any unrecognised value, sort
    "Descending".
    """
    lower_is_better = ("Avg Session Cost", "Avg Session Duration", "Avg Turns")
    return "Ascending" if sort_by_value in lower_is_better else "Descending"
def update_sort_order_automatically(sort_by_value):
    """Callback for the sort-by radio: return the sort order suited to the chosen metric."""
    return get_optimal_sort_order(sort_by_value)
def update_table(*args):
    """Callback for the filter controls: return (title HTML, table HTML).

    ``args`` are the filter values in the order of the callback inputs; the
    domain filter comes first and is the only one the title needs.
    """
    heading_html = update_leaderboard_title(args[0])
    body_html = filter_and_sort_data(*args)
    return heading_html, body_html
def _strip_domain_emoji(domain_filter):
    """Map an emoji-prefixed domain radio label to its plain domain name."""
    emoji_to_domain = (
        ('🌐', "All"),
        ('🏦', "Banking"),
        ('πŸ₯', "Healthcare"),
        ('πŸ›‘οΈ', "Insurance"),
        ('πŸ’°', "Investment"),
        ('πŸ“±', "Telecom"),
    )
    for emoji, domain_name in emoji_to_domain:
        if domain_filter.startswith(emoji):
            return domain_name
    # Unknown labels pass through unchanged.
    return domain_filter


def _rank_models_for_radar(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Apply the leaderboard filters and sort; return (full_df, ranked model names).

    ``full_df`` is the unfiltered leaderboard data, loaded once and reused for
    the radar chart. The model names come from the filtered and sorted rows.
    """
    df = load_leaderboard_data()
    filtered_df = df
    domain = _strip_domain_emoji(domain_filter)

    # Only keep models that have data for the selected domain.
    if domain in ("Banking", "Healthcare", "Insurance", "Investment", "Telecom"):
        filtered_df = filtered_df[filtered_df[f"{domain} AC"] != '']

    # The radio says "Open Source"; the data column stores "Open source".
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    if model_type_filter in model_type_values:
        filtered_df = filtered_df[filtered_df['Model Type'] == model_type_values[model_type_filter]]

    if reasoning_filter in ("Reasoning", "Normal"):
        filtered_df = filtered_df[filtered_df['Output Type'] == reasoning_filter]

    # Map display name to actual column name using shared mapping.
    sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)
    # With a domain selected, rank by that domain's column, not the average.
    if domain != "All":
        domain_suffix = {
            "Avg AC": "AC",
            "Avg TSQ": "TSQ",
            "Avg Total Cost": "Cost",
            "Avg Session Duration": "Duration",
            "Avg Turns": "Turns",
        }.get(sort_column)
        if domain_suffix is not None:
            sort_column = f"{domain} {domain_suffix}"

    if sort_column and sort_column in filtered_df.columns:
        # Missing values are stored as '' by the loader, which can leave a
        # metric column with mixed str/float values that cannot be compared.
        # Sort on a numeric view of the column so blanks become NaN and go last.
        filtered_df = filtered_df.sort_values(
            by=sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
            key=lambda column: pd.to_numeric(column, errors='coerce'),
        )

    return df, filtered_df['Model'].tolist()


def update_radar_chart(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Callback for the filter controls: refresh the model dropdown and radar chart.

    The dropdown offers the top 15 models of the filtered, sorted leaderboard.
    The current selection is kept where it is still offered; if nothing
    survives, the top 5 are selected instead.

    Returns:
        (dropdown update, radar chart figure)
    """
    df, ranked_models = _rank_models_for_radar(
        domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order
    )
    available_models = ranked_models[:15]  # Top 15 from filtered results
    valid_selected = [m for m in (selected_models or []) if m in available_models]
    if not valid_selected:
        valid_selected = available_models[:5]
    chart = create_domain_radar_chart(df, sort_by, valid_selected)
    return gr.Dropdown(choices=available_models, value=valid_selected), chart


def update_radar_only(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Callback for the model dropdown: redraw the radar chart only.

    Selected models that the current filters exclude are dropped; if none
    remain, the top 5 of the filtered, sorted leaderboard are charted.
    """
    df, ranked_models = _rank_models_for_radar(
        domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order
    )
    valid_selected = [m for m in (selected_models or []) if m in ranked_models]
    if not valid_selected:
        valid_selected = ranked_models[:5]
    return create_domain_radar_chart(df, sort_by, valid_selected)
# Function to update prediction chart and insights
def update_prediction_chart_and_insights(domain_filter, model_type_filter):
    """Rebuild the AC prediction chart and its insight panel for the given filters.

    Returns a ``(chart, insights_html)`` pair.
    """
    leaderboard = load_leaderboard_data()

    # The chart builder returns the figure plus the three values the insight
    # panel is rendered from.
    prediction = create_ac_prediction_chart(
        leaderboard,
        domain_filter=domain_filter,
        model_type_filter=model_type_filter,
    )
    chart, date_99, months_to_99, current_best_ac = prediction

    return chart, generate_insight_html(
        date_99, months_to_99, domain_filter, model_type_filter, current_best_ac
    )
# Update table when filters change
filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order]
for input_component in filter_inputs:
input_component.change(
fn=update_table,
inputs=filter_inputs,
outputs=[leaderboard_title, leaderboard_table]
)
# Also update radar chart when filters change
input_component.change(
fn=update_radar_chart,
inputs=filter_inputs + [model_selector],
outputs=[model_selector, radar_chart]
)
# Update prediction chart when domain or model type filters change
# Only react to domain_filter and model_type_filter, not other filters
domain_filter.change(
fn=update_prediction_chart_and_insights,
inputs=[domain_filter, model_type_filter],
outputs=[prediction_plot, prediction_insights]
)
model_type_filter.change(
fn=update_prediction_chart_and_insights,
inputs=[domain_filter, model_type_filter],
outputs=[prediction_plot, prediction_insights]
)
# Update radar chart when model selection changes
model_selector.change(
fn=update_radar_only,
inputs=filter_inputs + [model_selector],
outputs=[radar_chart]
)
# Automatically update sort order when sort_by changes
sort_by.change(
fn=update_sort_order_automatically,
inputs=[sort_by],
outputs=[sort_order]
)
# Performance insights section
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">πŸ“Š</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Key Insights
</h3>
</div>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 24px; margin-top: 24px;">
<div class="info-box">
<h4 style="color: var(--accent-primary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">πŸ† Top Performers</h4>
<ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;">
<li>Highest AC scores indicate best action completion</li>
<li>Superior TSQ shows optimal tool selection</li>
<li>Balance cost-effectiveness with performance</li>
</ul>
</div>
<div class="info-box">
<h4 style="color: var(--accent-secondary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">πŸ” Filter Features</h4>
<ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;">
<li>Domain-specific performance analysis</li>
<li>Compare open source vs private models</li>
<li>Reasoning vs standard model comparison</li>
</ul>
</div>
<div class="info-box">
<h4 style="color: var(--accent-primary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">πŸ“ˆ Visualization</h4>
<ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;">
<li>Interactive radar charts for domain breakdown</li>
<li>Compare up to 5 models simultaneously</li>
<li>Hover for detailed performance metrics</li>
</ul>
</div>
</div>
</div>
""")
# NEW VISUALIZATIONS START HERE
# 1. Cost-Performance Efficiency Scatter Plot
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">πŸ’‘</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Cost-Performance Efficiency Analysis
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Identify models that deliver the best performance per dollar spent
</p>
""")
with gr.Row():
with gr.Column(scale=1):
efficiency_metric = gr.Dropdown(
choices=["Avg Action Completion", "Avg Tool Selection Quality"],
value="Avg Action Completion",
label="πŸ“Š Performance Metric",
info="Select which performance metric to analyze against cost",
elem_classes=["dropdown"]
)
gr.HTML('<div class="chart-container">')
cost_performance_plot = gr.Plot(
label="",
value=create_cost_performance_scatter(load_leaderboard_data(), "Avg AC"),
elem_classes=["efficiency-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# 2. Speed vs Accuracy Trade-off Chart
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">⚑</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Speed vs Accuracy Trade-off
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Find the sweet spot between response time and accuracy
</p>
""")
gr.HTML('<div class="chart-container">')
speed_accuracy_plot = gr.Plot(
label="",
value=create_speed_accuracy_plot(load_leaderboard_data(), "Avg AC"),
elem_classes=["speed-accuracy-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# 3. Performance Heatmap
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">πŸ”₯</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Comprehensive Performance Heatmap
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
All metrics at a glance - darker colors indicate better performance
</p>
""")
gr.HTML('<div class="chart-container">')
performance_heatmap = gr.Plot(
label="",
value=create_performance_heatmap(load_leaderboard_data()),
elem_classes=["heatmap-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# 4. Domain Specialization Matrix
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">🎯</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Domain Specialization Matrix
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Bubble size shows performance level, color intensity shows specialization strength
</p>
""")
with gr.Row():
with gr.Column(scale=1):
specialization_metric = gr.Dropdown(
choices=["AC (Action Completion)", "TSQ (Tool Selection Quality)"],
value="AC (Action Completion)",
label="πŸ“Š Metric Type",
info="Choose which metric to analyze for domain specialization",
elem_classes=["dropdown"]
)
gr.HTML('<div class="chart-container">')
domain_specialization_plot = gr.Plot(
label="",
value=create_domain_specialization_matrix(load_leaderboard_data(), "AC"),
elem_classes=["specialization-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# 5. Performance Gap Analysis
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-secondary);">πŸ“ˆ</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Performance Gap Analysis by Domain
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Visualize the performance range across models for each domain
</p>
""")
gr.HTML('<div class="chart-container">')
performance_gap_plot = gr.Plot(
label="",
value=create_performance_gap_analysis(load_leaderboard_data(), "AC"),
elem_classes=["gap-analysis-chart", "plot-container"]
)
gr.HTML('</div>')
gr.HTML("</div>")
# Update functions for new visualizations
def update_cost_performance(efficiency_metric):
    """Redraw the cost/performance scatter for the selected performance metric."""
    # Translate the dropdown label into a dataframe column; labels without a
    # mapping are passed through unchanged.
    metric_column = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    leaderboard = load_leaderboard_data()
    return create_cost_performance_scatter(leaderboard, metric_column)
def update_speed_accuracy(efficiency_metric):
    """Redraw the speed/accuracy plot for the selected performance metric."""
    # Translate the dropdown label into a dataframe column; labels without a
    # mapping are passed through unchanged.
    metric_column = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    leaderboard = load_leaderboard_data()
    return create_speed_accuracy_plot(leaderboard, metric_column)
def update_domain_specialization(specialization_metric):
    """Redraw the domain specialization matrix for the selected metric family."""
    # Any label mentioning "AC" selects action completion; everything else
    # is treated as tool selection quality.
    if "AC" in specialization_metric:
        metric_type = "AC"
    else:
        metric_type = "TSQ"
    leaderboard = load_leaderboard_data()
    return create_domain_specialization_matrix(leaderboard, metric_type)
def update_performance_gap(specialization_metric):
    """Redraw the per-domain performance gap chart for the selected metric family."""
    # Any label mentioning "AC" selects action completion; everything else
    # is treated as tool selection quality.
    if "AC" in specialization_metric:
        metric_type = "AC"
    else:
        metric_type = "TSQ"
    leaderboard = load_leaderboard_data()
    return create_performance_gap_analysis(leaderboard, metric_type)
def update_all_visualizations(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Rebuild the cost/performance, speed/accuracy and heatmap plots for the filters.

    ``sort_order`` is accepted but not used by this function. Returns the
    three figures as a tuple in the order listed above.
    """
    visible_rows = apply_filters(
        load_leaderboard_data(), domain_filter, model_type_filter, reasoning_filter
    )

    # Only the two score metrics are honoured as the plotted metric; any
    # other sort choice falls back to "Avg AC".
    if sort_by in ("Avg Action Completion", "Avg Tool Selection Quality"):
        metric_column = SORT_COLUMN_MAP.get(sort_by, sort_by)
    else:
        metric_column = "Avg AC"

    scatter_fig = create_cost_performance_scatter(visible_rows, metric_column)
    speed_fig = create_speed_accuracy_plot(visible_rows, metric_column)
    heatmap_fig = create_performance_heatmap(visible_rows)
    return scatter_fig, speed_fig, heatmap_fig
def apply_filters(df, domain_filter, model_type_filter, reasoning_filter):
    """Return the rows of ``df`` matching the domain, model-type and reasoning filters.

    Args:
        df: Leaderboard dataframe. A domain counts as "not evaluated" for a
            row when that row's ``<Domain> AC`` cell is the empty string.
        domain_filter: Domain dropdown label. The domain is recognised by its
            name (``Banking``, ``Healthcare``, ``Insurance``, ``Investment``,
            ``Telecom``) anywhere in the label, so an icon prefix in any
            encoding is tolerated. Labels naming no domain mean "all domains".
        model_type_filter: ``"Open Source"`` or ``"Proprietary"``; any other
            value (e.g. ``"All"``) disables the filter.
        reasoning_filter: ``"Reasoning"`` or ``"Normal"``; any other value
            (e.g. ``"All"``) disables the filter.

    Returns:
        A filtered dataframe; ``df`` itself is not modified.
    """
    domain_columns = {
        "Banking": "Banking AC",
        "Healthcare": "Healthcare AC",
        "Insurance": "Insurance AC",
        "Investment": "Investment AC",
        "Telecom": "Telecom AC",
    }
    # UI label -> value stored in the 'Model Type' column (note the casing).
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    # UI label -> value stored in the 'Output Type' column.
    output_type_values = {"Reasoning": "Reasoning", "Normal": "Normal"}

    filtered_df = df.copy()

    # Match on the domain name rather than on an emoji prefix: prefix literals
    # break as soon as the file or the label is re-encoded.
    label = domain_filter or ""
    domain_col = next(
        (column for name, column in domain_columns.items() if name in label),
        None,
    )
    # A missing column means there is nothing to filter on; skip instead of
    # raising KeyError.
    if domain_col is not None and domain_col in filtered_df.columns:
        filtered_df = filtered_df[filtered_df[domain_col] != '']

    wanted_model_type = model_type_values.get(model_type_filter)
    if wanted_model_type is not None:
        filtered_df = filtered_df[filtered_df['Model Type'] == wanted_model_type]

    wanted_output_type = output_type_values.get(reasoning_filter)
    if wanted_output_type is not None:
        filtered_df = filtered_df[filtered_df['Output Type'] == wanted_output_type]

    return filtered_df
# Connect update functions to components
efficiency_metric.change(
fn=update_cost_performance,
inputs=[efficiency_metric],
outputs=[cost_performance_plot]
)
efficiency_metric.change(
fn=update_speed_accuracy,
inputs=[efficiency_metric],
outputs=[speed_accuracy_plot]
)
specialization_metric.change(
fn=update_domain_specialization,
inputs=[specialization_metric],
outputs=[domain_specialization_plot]
)
specialization_metric.change(
fn=update_performance_gap,
inputs=[specialization_metric],
outputs=[performance_gap_plot]
)
# Update new visualizations when main filters change
for input_component in filter_inputs:
input_component.change(
fn=update_all_visualizations,
inputs=filter_inputs,
outputs=[cost_performance_plot, speed_accuracy_plot, performance_heatmap]
)
# Define generate_performance_card function before using it
def generate_performance_card(model_name):
    """Generate HTML for the model performance card.

    Args:
        model_name: Value of the ``Model`` column to render. A falsy value
            means no model is selected.

    Returns:
        An HTML string: the full card, or a short placeholder message when no
        model is selected or the name is not present in the leaderboard data.
    """
    from html import escape  # local import so the block brings its own dependency

    if not model_name:
        return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
Please select a model to generate its performance card
</div>"""

    # Get model data
    df = load_leaderboard_data()
    model_data = df[df['Model'] == model_name]
    if model_data.empty:
        return """<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
Model not found in the database
</div>"""
    row = model_data.iloc[0]

    # Overall rank: 1-based position when ordering by Avg AC, best first.
    # Rows with an empty Avg AC are left out, so such a model gets 'N/A'.
    df_with_ac = df[df['Avg AC'] != ''].copy()
    df_with_ac['Avg AC'] = pd.to_numeric(df_with_ac['Avg AC'], errors='coerce')
    ranked_models = df_with_ac.sort_values('Avg AC', ascending=False)['Model'].tolist()
    rank = ranked_models.index(model_name) + 1 if model_name in ranked_models else 'N/A'

    def format_value(val, decimals=3, prefix='', suffix=''):
        """Format a numeric cell with fixed decimals, or 'N/A' when it is missing."""
        if pd.isna(val) or val == '':
            return 'N/A'
        return f"{prefix}{float(val):.{decimals}f}{suffix}"

    def get_performance_stars(value, max_val=1.0):
        """Map a score to one to five stars; a missing score yields no stars."""
        if pd.isna(value) or value == '':
            return ''
        score = float(value) / max_val
        for threshold, stars in ((0.9, 5), (0.7, 4), (0.5, 3), (0.3, 2)):
            if score >= threshold:
                return '⭐' * stars
        return '⭐'

    # The name ends up inside markup, so escape it before interpolation.
    safe_name = escape(str(model_name))

    parts = [f"""
<div class="performance-card">
<div class="card-header">
<h1 class="card-model-name">{safe_name}</h1>
<div class="card-stars">
{get_performance_stars(row['Avg AC'])}
</div>
</div>
<div class="metrics-grid" style="margin-bottom: 24px;">
<div class="metric-item">
<div class="metric-icon" style="color: var(--accent-primary);">🏆</div>
<div class="metric-label">Overall Rank</div>
<div class="metric-value">#{rank}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: var(--accent-primary);">🎯</div>
<div class="metric-label">Action Completion</div>
<div class="metric-value">{format_value(row['Avg AC'])}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: var(--accent-secondary);">🛠️</div>
<div class="metric-label">Tool Selection</div>
<div class="metric-value">{format_value(row['Avg TSQ'])}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: #F5F6F7;">💰</div>
<div class="metric-label">Avg Cost</div>
<div class="metric-value">{format_value(row['Avg Total Cost'], 3, '$')}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: #F5F6F7;">⚡</div>
<div class="metric-label">Avg Duration</div>
<div class="metric-value">{format_value(row['Avg Session Duration'], 1, '', 's')}</div>
</div>
<div class="metric-item">
<div class="metric-icon" style="color: #F5F6F7;">💬</div>
<div class="metric-label">Avg Turns</div>
<div class="metric-value">{format_value(row['Avg Turns'], 1)}</div>
</div>
</div>
<div class="domains-section" style="margin-top: 24px;">
<h3 class="domains-title">🏛️ Domain Performance</h3>
<div class="domains-grid">
"""]

    # Add domain scores
    domains = [
        ('🏦', 'Banking'),
        ('🏥', 'Healthcare'),
        ('🛡️', 'Insurance'),
        ('💰', 'Investment'),
        ('📱', 'Telecom'),
    ]
    for domain_icon, domain_name in domains:
        ac_value = row.get(f'{domain_name} AC', '')
        if ac_value != '' and not pd.isna(ac_value):
            score_display = f"{float(ac_value):.3f}"
            score_color = "var(--accent-primary)"
        else:
            score_display = "N/A"
            score_color = "var(--text-muted)"
        parts.append(f"""
<div class="domain-item">
<div class="domain-name">{domain_icon}</div>
<div style="font-size: 0.7rem; color: var(--text-secondary); margin-bottom: 2px;">{domain_name}</div>
<div class="domain-score" style="color: {score_color};">{score_display}</div>
</div>
""")

    parts.append("""
</div>
</div>
<div class="card-footer">
<div class="card-url">
<strong>https://galileo.ai/agent-leaderboard</strong>
</div>
</div>
</div>
""")
    return "".join(parts)
# MODEL PERFORMANCE CARD SECTION
gr.HTML("""
<div class="dark-container" style="margin-top: 32px;">
<div class="section-header">
<span class="section-icon" style="color: var(--accent-primary);">🎯</span>
<h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
Model Performance Card
</h3>
</div>
<p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Comprehensive performance card for any model - perfect for presentations and reports
</p>
<div style="display: flex; gap: 24px; align-items: flex-start;">
<!-- Controls Column -->
<div style="flex: 0 0 280px;">
<div style="background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle);
border-radius: 16px; padding: 20px; position: sticky; top: 20px;">
""")
card_model_selector = gr.Dropdown(
choices=initial_df['Model'].tolist(),
value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None,
label="πŸ€– Select Model",
info="Choose a model to view its performance card",
elem_classes=["dropdown"]
)
download_card_btn = gr.Button(
"πŸ“₯ Download Card as PNG",
variant="secondary",
elem_classes=["download-button"],
elem_id="download-card-btn"
)
gr.HTML("""
</div>
</div>
<!-- Card Display Column -->
<div style="flex: 1; min-width: 0;" id="card-display-container">
""")
# Card display area - generate initial card
initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None
initial_card_html = generate_performance_card(initial_model) if initial_model else ""
card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html")
gr.HTML("""
</div>
</div>
</div>""")
# Add custom CSS for the performance card
gr.HTML("""
<style>
/* Performance Card Styles */
.performance-card {
background: linear-gradient(145deg, rgba(1, 9, 26, 0.98) 0%, rgba(227, 84, 84, 0.05) 100%);
border: 2px solid var(--accent-primary);
border-radius: 24px;
padding: 32px;
max-width: 700px;
margin: 0 auto;
position: relative;
overflow: hidden;
box-shadow:
0 20px 40px rgba(0, 0, 0, 0.5),
0 0 80px rgba(227, 84, 84, 0.2),
inset 0 0 120px rgba(227, 84, 84, 0.05);
}
.performance-card::before {
content: '';
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%);
opacity: 0.1;
animation: pulse 4s ease-in-out infinite;
}
.card-header {
text-align: center;
margin-bottom: 24px;
position: relative;
z-index: 1;
}
.card-badges {
display: flex;
justify-content: center;
gap: 12px;
margin-bottom: 16px;
}
.card-model-name {
font-size: 2rem;
font-weight: 800;
background: linear-gradient(135deg, var(--accent-primary) 0%, var(--accent-secondary) 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 8px;
text-shadow: 0 0 40px var(--glow-primary);
line-height: 1.2;
}
.card-stars {
font-size: 1.2rem;
margin: 8px 0;
display: flex;
justify-content: center;
align-items: center;
gap: 2px;
}
.card-vendor {
font-size: 1.2rem;
color: var(--text-secondary);
font-weight: 500;
margin-top: 4px;
}
.metrics-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
gap: 16px;
margin-bottom: 24px;
position: relative;
z-index: 1;
}
.metric-item {
background: rgba(245, 246, 247, 0.05);
border: 1px solid var(--border-subtle);
border-radius: 16px;
padding: 16px;
text-align: center;
transition: all 0.3s ease;
}
.metric-item:hover {
transform: translateY(-4px);
border-color: var(--accent-primary);
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.2);
}
.metric-icon {
font-size: 1.5rem;
margin-bottom: 6px;
filter: drop-shadow(0 0 20px currentColor);
}
.metric-label {
font-size: 0.75rem;
color: var(--text-secondary);
text-transform: uppercase;
letter-spacing: 0.05em;
margin-bottom: 4px;
}
.metric-value {
font-size: 1.4rem;
font-weight: 700;
color: var(--text-primary);
font-family: 'Geist Mono', monospace;
}
.domains-section {
margin-top: 32px;
position: relative;
z-index: 1;
}
.domains-title {
font-size: 1.1rem;
font-weight: 600;
color: var(--text-primary);
margin-bottom: 16px;
text-align: center;
}
.domains-grid {
display: grid;
grid-template-columns: repeat(5, 1fr);
gap: 12px;
}
.domain-item {
background: rgba(245, 246, 247, 0.05);
border: 1px solid var(--border-subtle);
border-radius: 12px;
padding: 12px;
text-align: center;
}
.domain-name {
font-size: 1.4rem;
margin-bottom: 4px;
}
.domain-score {
font-size: 1rem;
font-weight: 600;
color: var(--accent-primary);
}
.card-footer {
text-align: center;
margin-top: 24px;
padding-top: 20px;
border-top: 1px solid var(--border-subtle);
position: relative;
z-index: 1;
}
.card-badge {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 8px 16px;
background: rgba(245, 246, 247, 0.05);
border: 1px solid var(--border-subtle);
border-radius: 20px;
font-size: 0.9rem;
color: var(--text-secondary);
margin: 0 4px;
}
.card-url {
margin-top: 12px;
font-size: 0.75rem;
color: var(--text-muted);
font-family: 'Geist Mono', monospace;
}
.primary-button {
background: linear-gradient(135deg, var(--accent-primary) 0%, #B94545 100%) !important;
color: white !important;
border: none !important;
padding: 10px 20px !important;
font-weight: 600 !important;
transition: all 0.3s ease !important;
}
.primary-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 24px rgba(227, 84, 84, 0.4) !important;
}
/* Download button styling */
.download-button {
background: linear-gradient(135deg, var(--accent-secondary) 0%, #0A6BC4 100%) !important;
color: white !important;
border: none !important;
padding: 10px 20px !important;
font-weight: 600 !important;
transition: all 0.3s ease !important;
}
.download-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 24px rgba(16, 152, 247, 0.4) !important;
}
/* Responsive layout for performance card section */
@media (max-width: 1200px) {
.performance-card {
padding: 24px !important;
}
.card-model-name {
font-size: 1.7rem !important;
}
.metric-value {
font-size: 1.2rem !important;
}
}
@media (max-width: 900px) {
/* Stack the controls above the card on smaller screens */
#card-display-container {
margin-top: 20px;
}
.performance-card {
padding: 20px !important;
}
.card-model-name {
font-size: 1.5rem !important;
}
.metric-value {
font-size: 1.1rem !important;
}
.domains-grid {
grid-template-columns: repeat(3, 1fr) !important;
}
}
/* Button states */
.download-button:disabled {
opacity: 0.6 !important;
cursor: not-allowed !important;
}
</style>
<!-- Include html2canvas library -->
<script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/html2canvas.min.js"></script>
""")
# Wire up the card generator to selection change
card_model_selector.change(
fn=generate_performance_card,
inputs=[card_model_selector],
outputs=[card_display]
)
# Wire up download button with improved functionality
download_card_btn.click(
fn=None,
js="""
() => {
// Wait a bit to ensure the card is fully rendered
setTimeout(() => {
const card = document.querySelector('.performance-card');
if (!card) {
alert('Performance card not found. Please select a model first.');
return;
}
// Check if html2canvas is loaded
if (typeof html2canvas === 'undefined') {
// Try to load html2canvas dynamically
const script = document.createElement('script');
script.src = 'https://cdn.jsdelivr.net/npm/[email protected]/dist/html2canvas.min.js';
script.onload = () => {
captureCard();
};
script.onerror = () => {
alert('Failed to load html2canvas library. Please try again.');
};
document.head.appendChild(script);
} else {
captureCard();
}
function captureCard() {
// Show loading indicator
const btn = document.getElementById('download-card-btn');
const originalText = btn.textContent;
btn.textContent = 'Generating...';
btn.disabled = true;
html2canvas(card, {
backgroundColor: '#01091A',
scale: 2,
logging: false,
useCORS: true,
allowTaint: true
}).then(canvas => {
// Create download link
const link = document.createElement('a');
const modelName = card.querySelector('.card-model-name')?.textContent || 'model';
const timestamp = new Date().toISOString().slice(0,10);
const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`;
link.download = fileName;
link.href = canvas.toDataURL('image/png');
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
// Restore button
btn.textContent = originalText;
btn.disabled = false;
}).catch(error => {
console.error('Error capturing card:', error);
alert('Failed to capture performance card. Please try again.');
btn.textContent = originalText;
btn.disabled = false;
});
}
}, 100);
}
"""
)
# Also update card when filters change to keep model selector in sync
def update_dropdown_and_card(*args):
    """Recompute the card model dropdown for the current filter values.

    ``args`` arrive in ``filter_inputs`` order; only the first three (domain,
    model type, reasoning) are used to decide which models are offered.
    """
    filtered_df = apply_filters(load_leaderboard_data(), args[0], args[1], args[2])
    choices = filtered_df['Model'].tolist()
    # Select first model from filtered list
    value = choices[0] if choices else None
    return gr.Dropdown(choices=choices, value=value)


# The handler does not depend on the loop variable, so it is defined once
# and registered on every filter component.
for input_component in filter_inputs:
    input_component.change(
        fn=update_dropdown_and_card,
        inputs=filter_inputs,
        outputs=[card_model_selector]
    )
# Footer CTAs
gr.HTML("""
<div style="margin-top: 60px; padding: 40px 20px; background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%); border-radius: 20px; border: 1px solid var(--border-subtle);">
<div style="text-align: center; margin-bottom: 30px;">
<h3 style="font-size: 2rem; color: var(--text-primary); margin-bottom: 10px; font-family: 'Geist', sans-serif; font-weight: 700;">
Ready to Take Your AI to the Next Level?
</h3>
<p style="color: var(--text-secondary); font-size: 1.1rem; font-family: 'Geist', sans-serif;">
Learn more about building better agents and evaluating your models
</p>
</div>
<div style="display: flex; justify-content: center; gap: 16px; flex-wrap: wrap;">
<a href="https://galileo.ai/mastering-agents-ebook?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="header-action-button">
<span class="action-button-icon">πŸ“š</span>Mastering Agents eBook
</a>
<a href="https://app.galileo.ai/sign-up?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="header-action-button">
<span class="action-button-icon">πŸš€</span>Evaluate your GenAI for free
</a>
</div>
</div>
""")
# Add FAQ section at the end
gr.HTML(get_faq_section())
return leaderboard_table
def create_leaderboard_v2_interface():
    """Build the complete leaderboard v2 interface by delegating to the tab builder."""
    interface = create_leaderboard_v2_tab()
    return interface
def create_domain_radar_chart(df, metric_type, selected_models=None, max_models=5):
    """Create a radar chart comparing models across the five domains for one metric.

    Args:
        df: Leaderboard dataframe with a ``Model`` column and per-domain
            columns such as ``Banking AC``. Missing cells may be ``NaN`` or ``''``.
        metric_type: Display name or column name of the metric; it is resolved
            to a column name through ``SORT_COLUMN_MAP``.
        selected_models: Model names to plot. ``None`` or empty selects the top
            ``max_models`` models by the metric, or the first rows when the
            metric column is absent.
        max_models: Upper bound on the number of models drawn.

    Returns:
        A ``go.Figure``. When the metric has no per-domain breakdown, the
        figure returned by ``create_empty_radar_chart`` is passed through.
    """

    def _as_float(raw):
        """Return ``raw`` as a float, or ``None`` when it is missing or not numeric."""
        if pd.isna(raw) or raw == '':
            return None
        try:
            return float(raw)
        except (TypeError, ValueError):
            return None

    # Map the metric_type to actual column name using shared mapping
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)

    if selected_models is None or len(selected_models) == 0:
        if actual_metric_type in df.columns:
            # Blank cells stored as '' leave the column with object dtype,
            # which DataFrame.nlargest rejects. Rank on a numeric view of the
            # column, addressed by position so the frame's index is irrelevant.
            scores = pd.to_numeric(df[actual_metric_type], errors='coerce').reset_index(drop=True)
            top_positions = scores.nlargest(max_models).index
            selected_models = df['Model'].iloc[top_positions].tolist()
        else:
            selected_models = df.head(max_models)['Model'].tolist()

    # Limit to max_models for readability
    selected_models = selected_models[:max_models]

    # Per-domain columns are named "<Domain> <suffix>"; the suffix depends on
    # which aggregate metric was requested.
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    metric_suffixes = {
        'Avg AC': 'AC',
        'Avg TSQ': 'TSQ',
        'Avg Total Cost': 'Cost',
        'Avg Session Duration': 'Duration',
        'Avg Turns': 'Turns',
    }

    # Metrics outside the table above have no per-domain columns to plot.
    if actual_metric_type not in metric_suffixes:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")

    suffix = metric_suffixes[actual_metric_type]
    domain_columns = [f"{domain} {suffix}" for domain in domains]

    fig = go.Figure()

    # Galileo dark theme color scheme
    galileo_dark_colors = [
        {'fill': 'rgba(227, 84, 84, 0.25)', 'line': '#E35454', 'name': 'Vanguard'},    # Vanguard Red
        {'fill': 'rgba(16, 152, 247, 0.15)', 'line': '#1098F7', 'name': 'Airglow'},    # Airglow Blue
        {'fill': 'rgba(245, 246, 247, 0.15)', 'line': '#F5F6F7', 'name': 'Mercury'},   # Light Mercury
        {'fill': 'rgba(227, 84, 84, 0.35)', 'line': '#B94545', 'name': 'Deep Red'},    # Darker Vanguard
        {'fill': 'rgba(16, 152, 247, 0.25)', 'line': '#0A6BC4', 'name': 'Deep Blue'},  # Darker Airglow
    ]

    # Values that are actually present in the data (missing cells are drawn
    # as 0 but must not influence the axis range computed below).
    present_values = []

    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model'] == model_name]
        if model_data.empty:
            continue
        model_row = model_data.iloc[0]

        values = []
        for col in domain_columns:
            number = _as_float(model_row[col]) if col in df.columns else None
            if number is None:
                values.append(0)
            else:
                values.append(number)
                present_values.append(number)

        # Close the radar chart by repeating first value
        values_plot = values + [values[0]]
        domains_plot = domains + [domains[0]]

        colors = galileo_dark_colors[idx % len(galileo_dark_colors)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.8
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A')
                ),
                name=model_name,
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                              "<span style='color: #94A3B8'>%{theta}</span><br>" +
                              "<b style='font-size: 14px; color: #F5F6F7'>%{r:.3f}</b><br>" +
                              "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7", size=12, family="'Geist', sans-serif")
                )
            )
        )

    # AC and TSQ use a fixed 0..1 axis; the other metrics (Cost, Duration,
    # Turns) scale to the largest plotted value plus 10% headroom.
    if actual_metric_type in ['Avg AC', 'Avg TSQ']:
        max_range = 1.0
    else:
        max_range = max(present_values) * 1.1 if present_values else 1.0

    # Create custom tick values for better readability
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]

    fig.update_layout(
        polar=dict(
            bgcolor='rgba(245, 246, 247, 0.03)',
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="'Geist', sans-serif",
                    color='#F5F6F7',
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="v",
            yanchor="middle",
            y=0.5,
            xanchor="left",
            x=1.05,
            font=dict(
                size=12,
                family="'Geist', sans-serif",
                color='#F5F6F7'
            ),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text=f"<b>Domain Performance: {metric_type}</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        annotations=[
            dict(
                text="Galileo Agent Leaderboard",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color='#64748B'),
                showarrow=False
            )
        ]
    )
    return fig
def create_empty_radar_chart(message):
    """Create a placeholder radar chart that displays a message instead of data.

    Args:
        message: Text shown (prefixed with a chart emoji) in a bordered box at
            the centre of the figure.

    Returns:
        A ``go.Figure`` without traces that carries the chart title, the
        centred message and the leaderboard watermark in the bottom-right
        corner.
    """
    fig = go.Figure()

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        title=dict(
            text="<b>Domain Performance Chart</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700,
            ),
        ),
    )

    # Centred message box.
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="'Geist', sans-serif",
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20,
    )

    # Watermark. It is added with add_annotation() rather than passed as
    # `annotations=[...]` to update_layout(): update_layout merges such a list
    # element-wise into the annotations that already exist, which replaced the
    # message's text and position with the watermark's.
    fig.add_annotation(
        text="Galileo Agent Leaderboard",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False,
    )
    return fig
# NEW VISUALIZATION FUNCTIONS
def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create a scatter plot of performance (x) against average session cost (y).

    One trace is drawn per distinct ``Model Type`` so the legend can toggle
    each group. Marker area scales with ``Avg Turns`` and dashed lines mark
    the median of each axis to split the plot into quadrants.

    Args:
        df: Leaderboard DataFrame. Reads the columns ``Model``,
            ``Model Type``, ``Avg Total Cost``, ``Avg Turns`` and ``metric``;
            missing values are expected to be empty strings.
        metric: Column plotted on the x axis (``"Avg AC"`` or ``"Avg TSQ"``).

    Returns:
        A ``go.Figure``; the placeholder from ``create_empty_chart`` when no
        row has both a numeric cost and a numeric metric value.
    """
    has_values = (df['Avg Total Cost'] != '') & (df[metric] != '')
    df_filtered = df[has_values].copy()

    for col in ('Avg Total Cost', metric, 'Avg Turns'):
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')

    # Rows whose cost or metric failed to parse cannot be positioned.
    df_filtered = df_filtered.dropna(subset=['Avg Total Cost', metric])
    if df_filtered.empty:
        return create_empty_chart("No data available for cost-performance analysis")

    color_map = {
        'Proprietary': '#1098F7',  # Airglow Blue for Proprietary
        'Open source': '#58BC82',  # Green for Open source
    }
    # Marker size used when a model has no turn count; a missing size would
    # otherwise leave the point without a visible marker.
    missing_turns_size = 12

    fig = go.Figure()

    for model_type in df_filtered['Model Type'].unique():
        df_type = df_filtered[df_filtered['Model Type'] == model_type]
        turns = df_type['Avg Turns']
        fig.add_trace(go.Scatter(
            x=df_type[metric],
            y=df_type['Avg Total Cost'],
            mode='markers+text',
            name=model_type,
            text=df_type['Model'],
            # The real turn count travels in customdata so the hover text does
            # not report the scaled marker size as "Turns".
            customdata=turns,
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            marker=dict(
                size=(turns * 3).fillna(missing_turns_size),
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A'),
            ),
            hovertemplate=(
                "<b>%{text}</b><br>"
                f"{metric}: %{{x:.3f}}<br>"
                "Cost: $%{y:.3f}<br>"
                "Turns: %{customdata:.1f}<br>"
                "<extra></extra>"
            ),
        ))

    # Quadrant lines at the median of each axis.
    median_x = df_filtered[metric].median()
    median_y = df_filtered['Avg Total Cost'].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Quadrant labels: bottom-right is the desirable corner, top-left the worst.
    fig.add_annotation(
        x=0.95, y=0.05, text="💎 High Performance<br>Low Cost",
        showarrow=False, xref="paper", yref="paper",
        font=dict(size=12, color="#F5F6F7"), bgcolor="rgba(245, 246, 247, 0.1)",
    )
    fig.add_annotation(
        x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
        showarrow=False, xref="paper", yref="paper",
        font=dict(size=12, color="#E35454"), bgcolor="rgba(227, 84, 84, 0.1)",
    )

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Cost ($)</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='#F5F6F7'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
        ),
        margin=dict(t=100, b=80, l=80, r=80),
    )
    return fig
def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Create a scatter plot of accuracy (x) against session duration (y).

    Markers are coloured by ``Avg Total Cost`` and dashed lines mark the
    median of each axis to split the plot into quadrants.

    Args:
        df: Leaderboard DataFrame. Reads the columns ``Model``,
            ``Avg Session Duration``, ``Avg Total Cost`` and ``metric``;
            missing values are expected to be empty strings.
        metric: Column plotted on the x axis (``"Avg AC"`` or ``"Avg TSQ"``).

    Returns:
        A ``go.Figure``; the placeholder from ``create_empty_chart`` when no
        row has both a numeric duration and a numeric metric value.
    """
    has_values = (df['Avg Session Duration'] != '') & (df[metric] != '')
    df_filtered = df[has_values].copy()

    # Cost is only used for the colour scale, so it may stay missing.
    for col in ('Avg Session Duration', metric, 'Avg Total Cost'):
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')

    # Rows whose duration or metric failed to parse cannot be positioned.
    df_filtered = df_filtered.dropna(subset=['Avg Session Duration', metric])
    if df_filtered.empty:
        return create_empty_chart("No data available for speed-accuracy analysis")

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=df_filtered[metric],
        y=df_filtered['Avg Session Duration'],
        mode='markers+text',
        text=df_filtered['Model'],
        textposition="top center",
        textfont=dict(size=9, color='#94A3B8'),
        marker=dict(
            size=12,
            color=df_filtered['Avg Total Cost'],
            colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Cost ($)",
                    font=dict(color="#F5F6F7"),
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1,
                x=1.02,
            ),
            line=dict(width=2, color='#01091A'),
        ),
        hovertemplate=(
            "<b>%{text}</b><br>"
            f"{metric}: %{{x:.3f}}<br>"
            "Duration: %{y:.1f}s<br>"
            "Cost: $%{marker.color:.3f}<br>"
            "<extra></extra>"
        ),
    ))

    # Quadrant lines at the median of each axis.
    median_x = df_filtered[metric].median()
    median_y = df_filtered['Avg Session Duration'].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Quadrant labels: bottom-right is fast and accurate, top-left the opposite.
    fig.add_annotation(
        x=0.95, y=0.05, text="⚡ Fast & Accurate",
        showarrow=False, xref="paper", yref="paper",
        font=dict(size=12, color="#F5F6F7", weight=600),
    )
    fig.add_annotation(
        x=0.05, y=0.95, text="🐌 Slow & Inaccurate",
        showarrow=False, xref="paper", yref="paper",
        font=dict(size=12, color="#E35454", weight=600),
    )

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Duration (seconds)</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=120),
    )
    return fig
def create_performance_heatmap(df):
    """Create a heatmap of min-max normalised metrics (rows) per model (columns).

    Every metric is scaled to 0-1 across the models that have an ``Avg AC``
    value. Cost, duration and turns are inverted so that a higher score is
    better on every row.

    Args:
        df: Leaderboard DataFrame. Reads the columns ``Model``, ``Avg AC``,
            ``Avg TSQ``, ``Avg Total Cost``, ``Avg Session Duration`` and
            ``Avg Turns``; missing values are expected to be empty strings.

    Returns:
        A ``go.Figure``; the placeholder from ``create_empty_chart`` when no
        model has an ``Avg AC`` value.
    """
    metrics = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns']
    # Metrics where a smaller raw value is the better result.
    lower_is_better = {'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'}
    label_map = {
        'Avg AC': 'Action Completion',
        'Avg TSQ': 'Tool Selection',
        'Avg Total Cost': 'Cost Efficiency',
        'Avg Session Duration': 'Speed',
        'Avg Turns': 'Conversation Efficiency',
    }

    df_filtered = df[df['Avg AC'] != ''].copy()
    if df_filtered.empty:
        return create_empty_chart("No data available for performance heatmap")

    normalized_data = []
    for col in metrics:
        values = pd.to_numeric(df_filtered[col], errors='coerce')
        low = values.min()
        span = values.max() - low
        if pd.notna(span) and span > 0:
            normalized = (values - low) / span
        else:
            # A single model or identical values: dividing by the zero span
            # would turn the whole row into NaN. Use a neutral mid score
            # instead and leave genuinely missing values as NaN.
            normalized = values.where(values.isna(), 0.5)
        if col in lower_is_better:
            normalized = 1 - normalized
        normalized_data.append(normalized.to_numpy())

    metric_labels = [label_map.get(col, col) for col in metrics]
    # Missing cells get no label rather than the literal text "nan".
    cell_text = [
        ["" if pd.isna(val) else f"{val:.2f}" for val in row]
        for row in normalized_data
    ]

    fig = go.Figure(data=go.Heatmap(
        z=normalized_data,
        x=df_filtered['Model'].tolist(),
        y=metric_labels,
        colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
        hovertemplate=(
            "<b>%{x}</b><br>"
            "%{y}: %{z:.2f}<br>"
            "<extra></extra>"
        ),
        text=cell_text,
        texttemplate="%{text}",
        textfont={"size": 10, "color": "#F5F6F7"},
        showscale=True,
        colorbar=dict(
            title=dict(
                text="Performance<br>Score",
                font=dict(color="#F5F6F7"),
            ),
            tickfont=dict(color="#94A3B8"),
            bgcolor="rgba(1, 9, 26, 0.8)",
            bordercolor="rgba(245, 246, 247, 0.2)",
            borderwidth=1,
        ),
    ))

    fig.update_layout(
        title=dict(
            text="<b>Comprehensive Performance Heatmap</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            side="bottom",
            tickfont=dict(size=11, color="#94A3B8"),
            tickangle=-45,
        ),
        yaxis=dict(
            tickfont=dict(size=13, color="#F5F6F7", weight=600),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=700,
        width=1550,
        margin=dict(t=100, b=120, l=170, r=120),
    )
    return fig
def create_domain_specialization_matrix(df, metric_type="AC"):
    """Create a bubble chart of per-domain performance for every model.

    Each bubble is one (model, domain) pair. Bubble size follows the domain
    score and bubble colour follows the "specialization", i.e. the domain
    score minus the model's own ``Avg {metric_type}``.

    Args:
        df: Leaderboard DataFrame. Reads ``Model``, ``Avg {metric_type}`` and
            the ``"<Domain> {metric_type}"`` columns that are present.
        metric_type: Metric suffix used to build the column names
            (``"AC"`` or ``"TSQ"``).

    Returns:
        A ``go.Figure``; the placeholder from ``create_empty_chart`` when no
        (model, domain) pair has numeric data.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    records = []
    for _, model in df.iterrows():
        if model['Model'] == '':
            continue
        model_avg = pd.to_numeric(model[f'Avg {metric_type}'], errors='coerce')
        if pd.isna(model_avg):
            continue
        for domain in domains:
            domain_col = f'{domain} {metric_type}'
            if domain_col not in model.index:
                continue
            # Empty strings and other unparsable values coerce to NaN.
            domain_val = pd.to_numeric(model[domain_col], errors='coerce')
            if pd.isna(domain_val):
                continue
            records.append({
                'Model': model['Model'],
                'Domain': domain,
                'Performance': domain_val,
                # Deviation of this domain from the model's own average.
                'Specialization': domain_val - model_avg,
            })

    if not records:
        return create_empty_chart("No domain specialization data available")

    df_plot = pd.DataFrame(records)
    hover_text = [
        f"Performance: {p:.3f}<br>Specialization: {s:+.3f}"
        for p, s in zip(df_plot['Performance'], df_plot['Specialization'])
    ]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_plot['Domain'],
        y=df_plot['Model'],
        mode='markers',
        marker=dict(
            size=df_plot['Performance'] * 30,  # Size based on absolute performance
            color=df_plot['Specialization'],
            colorscale=[[0, '#1098F7'], [0.5, '#F5F6F7'], [1, '#E35454']],
            # Pin the neutral middle colour to zero so blue always means
            # "below the model's average" and red "above", regardless of how
            # asymmetric the data range is.
            cmid=0,
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Specialization<br>Strength",
                    font=dict(color="#F5F6F7"),
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1,
            ),
            line=dict(width=2, color='#01091A'),
            opacity=0.8,
        ),
        text=hover_text,
        hovertemplate=(
            "<b>%{y}</b><br>"
            "Domain: %{x}<br>"
            "%{text}<br>"
            "<extra></extra>"
        ),
    ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            title=dict(
                text="<b>Business Domains</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)",
        ),
        yaxis=dict(
            title=dict(
                text="<b>Models</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=11, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1100,
        width=1450,
        margin=dict(t=100, b=80, l=220, r=120),
    )
    return fig
def create_performance_gap_analysis(df, metric_type="AC"):
    """Create a range plot of the score spread across models for each domain.

    For every domain column that exists, the plot shows the full min-max
    range, the interquartile range and the median. Domains are ordered by
    their min-max gap, smallest first.

    Args:
        df: Leaderboard DataFrame. Reads the ``"<Domain> {metric_type}"``
            columns that are present; non-numeric values are ignored.
        metric_type: Metric suffix used to build the column names
            (``"AC"`` or ``"TSQ"``).

    Returns:
        A ``go.Figure``; the placeholder from ``create_empty_chart`` when no
        domain column holds numeric data.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    gap_data = []
    for domain in domains:
        domain_col = f'{domain} {metric_type}'
        if domain_col not in df.columns:
            continue
        domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
        if domain_values.empty:
            continue
        lowest, highest = domain_values.min(), domain_values.max()
        gap_data.append({
            'Domain': domain,
            'Min': lowest,
            'Max': highest,
            'Median': domain_values.median(),
            'Q1': domain_values.quantile(0.25),
            'Q3': domain_values.quantile(0.75),
            'Gap': highest - lowest,
        })

    if not gap_data:
        return create_empty_chart("No data available for gap analysis")

    df_gap = pd.DataFrame(gap_data).sort_values('Gap', ascending=True)

    fig = go.Figure()

    # Traces are added bottom-to-top per domain: range line, IQR bar, median
    # marker, then the min/max end points.
    for row in df_gap.to_dict('records'):
        domain = row['Domain']

        # Full range line.
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[domain, domain],
            mode='lines',
            line=dict(color='#64748B', width=2),
            showlegend=False,
            hoverinfo='skip',
        ))

        # IQR bar. All points share one categorical y value, so a filled
        # polygon would have zero height; a thicker line gives the visible
        # band instead.
        fig.add_trace(go.Scatter(
            x=[row['Q1'], row['Q3']],
            y=[domain, domain],
            mode='lines',
            line=dict(color='#E35454', width=6),
            showlegend=False,
            hoverinfo='skip',
        ))

        # Median marker; the only trace that carries hover information.
        fig.add_trace(go.Scatter(
            x=[row['Median']],
            y=[domain],
            mode='markers',
            marker=dict(
                size=12,
                color='#E35454',
                symbol='diamond',
                line=dict(width=2, color='#01091A'),
            ),
            showlegend=False,
            hovertemplate=(
                f"<b>{domain}</b><br>"
                f"Min: {row['Min']:.3f}<br>"
                f"Q1: {row['Q1']:.3f}<br>"
                f"Median: {row['Median']:.3f}<br>"
                f"Q3: {row['Q3']:.3f}<br>"
                f"Max: {row['Max']:.3f}<br>"
                f"Gap: {row['Gap']:.3f}<br>"
                "<extra></extra>"
            ),
        ))

        # Min/max end points.
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[domain, domain],
            mode='markers',
            marker=dict(size=8, color='#F5F6F7', line=dict(width=2, color='#01091A')),
            showlegend=False,
            hoverinfo='skip',
        ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700),
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display} Score</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            range=[0, 1] if metric_type in ['AC', 'TSQ'] else None,
        ),
        yaxis=dict(
            title=dict(
                text="<b>Business Domain</b>",
                font=dict(size=16, color="#F5F6F7"),
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)",
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1450,
        margin=dict(t=100, b=80, l=140, r=80),
        showlegend=False,
    )

    # Manual legend, since every trace is hidden from the built-in one.
    fig.add_annotation(
        text="◆ Median ━ IQR ─ Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=12, color='#94A3B8'),
        showarrow=False,
    )
    return fig
def create_empty_chart(message):
"""Create an empty chart with a message"""
fig = go.Figure()
fig.add_annotation(
text=f"πŸ“Š {message}",
xref="paper", yref="paper",
x=0.5, y=0.5,
xanchor='center', yanchor='middle',
font=dict(
size=18,
color="#94A3B8",
family="'Geist', sans-serif"
),
showarrow=False,
bgcolor="rgba(245, 246, 247, 0.05)",
bordercolor="rgba(245, 246, 247, 0.2)",
borderwidth=1,
borderpad=20
)
fig.update_layout(
paper_bgcolor="#01091A",
plot_bgcolor="rgba(245, 246, 247, 0.02)",
height=700,
width=1450,
margin=dict(t=80, b=80, l=80, r=80)
)