""" Agent Leaderboard v1 - Main leaderboard interface Updated implementation with LLM Type support and optimized radar charts """ import base64 import math import re from datetime import datetime from pathlib import Path import gradio as gr import pandas as pd import plotly.graph_objects as go # Import components and styles from modular files from components.leaderboard_components import ( get_chart_colors, get_rank_badge, get_type_badge, get_metric_tooltip, get_responsive_styles, get_faq_section ) from styles.leaderboard_styles import get_leaderboard_css ASSET_ICON_PATH = Path("krew_icon.png") KREW_ICON_BASE64 = "" if ASSET_ICON_PATH.exists(): KREW_ICON_BASE64 = base64.b64encode(ASSET_ICON_PATH.read_bytes()).decode("utf-8") CSV_PATH = Path("combined_evaluation_summary.csv") if CSV_PATH.exists(): EVALUATION_DATE = datetime.fromtimestamp(CSV_PATH.stat().st_mtime).strftime("%Y-%m-%d") else: EVALUATION_DATE = datetime.today().strftime("%Y-%m-%d") def create_leaderboard_v2_tab(): """Create the main leaderboard v1 tab with interactive table""" token_to_cost_factor = 2e-6 # Rough cost per token ($2 per 1M tokens) tokens_per_turn = 1000 # Approximate tokens exchanged per turn for scaling level_ids = [f"L{i}" for i in range(1, 8)] level_tsq_sources = { "L1": "L1_ArgAcc", "L2": "L2_SelectAcc", "L3": "L3_PSM", "L4": "L4_Coverage", "L5": "L5_AdaptiveRoutingScore", "L6": "L6_EffScore", "L7": "L7_ContextRetention", } def load_leaderboard_data(): """Load and prepare the leaderboard data""" df = pd.read_csv('combined_evaluation_summary.csv') # Clean and prepare data df = df.copy() numeric_candidate_cols = [col for col in df.columns if col not in ('Model', 'Vendor')] for col in numeric_candidate_cols: df[col] = pd.to_numeric(df[col], errors='coerce') # Derive per-level helper columns for cost and turns sr_columns = [] tsq_columns = [] duration_columns = [] cost_columns = [] turns_columns = [] for level in level_ids: sr_col = f"{level}_SR" if sr_col in df.columns: sr_columns.append(sr_col) df[sr_col] = df[sr_col].round(3) tsq_source = level_tsq_sources.get(level) if tsq_source and tsq_source in df.columns: tsq_columns.append(tsq_source) duration_col = f"{level}_Avg_Exec_Time" if duration_col in df.columns: duration_columns.append(duration_col) token_col = f"{level}_Avg_Tokens" if token_col in df.columns: cost_col = f"{level}_Avg_Cost" turns_col = f"{level}_Avg_Turns" df[cost_col] = df[token_col] * token_to_cost_factor df[turns_col] = df[token_col] / tokens_per_turn cost_columns.append(cost_col) turns_columns.append(turns_col) if sr_columns: df['Avg AC'] = df[sr_columns].mean(axis=1) if tsq_columns: df['Avg TSQ'] = df[tsq_columns].mean(axis=1) if cost_columns: df['Avg Total Cost'] = df[cost_columns].mean(axis=1) if duration_columns: df['Avg Session Duration'] = df[duration_columns].mean(axis=1) if turns_columns: df['Avg Turns'] = df[turns_columns].mean(axis=1) # Derive core capability metrics for radar visualization if sr_columns: df['Overall Success'] = df[sr_columns].mean(axis=1) execution_cols = [col for col in ['L1_CallEM', 'L1_ArgAcc', 'L2_SelectAcc'] if col in df.columns] if execution_cols: df['Execution Accuracy'] = df[execution_cols].mean(axis=1) reasoning_cols = [col for col in ['L3_ProvAcc', 'L3_PSM', 'L4_Coverage'] if col in df.columns] if reasoning_cols: df['Complex Reasoning'] = df[reasoning_cols].mean(axis=1) robustness_cols = [col for col in ['L5_AdaptiveRoutingScore', 'L5_FallbackSR'] if col in df.columns] if robustness_cols: df['Robustness'] = df[robustness_cols].mean(axis=1) context_cols = 
[col for col in ['L6_ReuseRage', 'L6_EffScore', 'L7_ContextRetention'] if col in df.columns] if context_cols: df['Context & Efficiency'] = df[context_cols].mean(axis=1) epr_cols = [f"L{i}_EPR_CVR" for i in range(1, 8) if f"L{i}_EPR_CVR" in df.columns] if epr_cols: df['Call Validity'] = df[epr_cols].mean(axis=1) # Use LLM Type from CSV directly, with mapping to display names if 'LLM Type' in df.columns: # Clean the LLM Type column to remove any whitespace df['LLM Type'] = df['LLM Type'].astype(str).str.strip() # Map LLM Type to Model Type def map_llm_type(llm_type): if llm_type.upper() == "OSS": return "Open source" else: return "Proprietary" df['Model Type'] = df['LLM Type'].apply(map_llm_type) else: # Fallback to vendor mapping if LLM Type column doesn't exist vendor_model_type_map = { "OpenAI": "Proprietary", "Anthropic": "Proprietary", "Google": "Proprietary", "Microsoft": "Proprietary", "Mistral": "Proprietary", "Databricks": "Open source", "Meta": "Open source", "Alibaba": "Open source", "알리바바": "Open source", # Korean name for Alibaba "Kakao": "Open source", "SKT": "Open source", "KT": "Open source", "xAI": "Proprietary", } df['Model Type'] = df['Vendor'].map(vendor_model_type_map).fillna('Proprietary') # Round numeric columns for better display round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy', 'Complex Reasoning', 'Robustness', 'Context & Efficiency', 'Call Validity'] round_one_cols = ['Avg Session Duration', 'Avg Turns'] for col in round_three_cols: if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce').round(3) for col in round_one_cols: if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce').round(1) if cost_columns: df[cost_columns] = df[cost_columns].apply(pd.to_numeric, errors='coerce').round(3) if turns_columns: df[turns_columns] = df[turns_columns].apply(pd.to_numeric, errors='coerce').round(2) if duration_columns: df[duration_columns] = df[duration_columns].apply(pd.to_numeric, errors='coerce').round(2) # Fill NaN values appropriately df = df.fillna('') return df def build_static_radar_chart(values, labels): """Render a small static radar chart as inline SVG""" if not values or all(v == 0 for v in values): return """
Radar Chart Execution Accuracy · Complex Reasoning · Robustness · Context & Efficiency · Overall Success · Validity
""" size = 220 center = size / 2 radius = size * 0.38 n = len(values) def point(v, idx, scale=1.0): angle = (2 * math.pi * idx / n) - math.pi / 2 r = radius * v * scale x = center + r * math.cos(angle) y = center + r * math.sin(angle) return x, y polygon_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(v, i) for i, v in enumerate(values))) ring_polygons = [] for step in (0.33, 0.66, 1.0): ring_points = " ".join(f"{x:.2f},{y:.2f}" for x, y in (point(step, i) for i in range(n))) opacity = 0.04 if step < 1.0 else 0.08 ring_polygons.append(f'') axis_lines = "\n".join( f'' for idx in range(n) ) label_spans = "\n".join( f'{label}' for idx, label in enumerate(labels) ) svg = f""" {''.join(ring_polygons)} {axis_lines} {label_spans} """ return svg # Level metadata for the 7-stage task framework level_details = { "ALL": { "title": "ALL · All Tasks", "description": "First, observe the overall average performance across all seven tasks. This average should then be utilized as a baseline to conduct a more detailed per-level comparison." }, "L1": { "title": "L1 · Single Tool Call", "description": "Evaluates single tool invocation capability and basic command execution accuracy." }, "L2": { "title": "L2 · Tool Selection", "description": "Measures the ability to choose the right tool and invoke it with appropriate parameters." }, "L3": { "title": "L3 · Sequential Tool Reasoning", "description": "Validates multi-step sequential reasoning for solving tasks." }, "L4": { "title": "L4 · Parallel Tool Reasoning", "description": "Evaluates the ability to integrate and summarize information from multiple sources in parallel." }, "L5": { "title": "L5 · Error Handling & Robustness", "description": "Checks awareness of unexpected failures and the strategies used to recover." }, "L6": { "title": "L6 · Efficient Tool Utilization", "description": "Examines operational efficiency in achieving goals with minimal calls and cost." }, "L7": { "title": "L7 · Long-Context Memory", "description": "Analyzes the ability to retain and leverage long conversational context." } } default_level = "ALL" sr_column_map = {level: f"{level}_SR" for level in level_ids} overall_sort_column = "Overall Success" def resolve_level(level_value): """Normalize the incoming level filter value""" if not level_value: return default_level return level_value if level_value in level_details else default_level def generate_html_table(filtered_df, highlight_column): """Generate styled HTML table with per-level success rates""" valid_highlights = list(sr_column_map.values()) + ["Overall Success"] highlight_column = highlight_column if highlight_column in valid_highlights else None overall_column = "Overall Success" overall_highlight = (highlight_column == overall_column) highlight_map = {level: (sr_column_map[level] == highlight_column) for level in level_ids} table_html = """
""" overall_header_classes = ["numeric-cell"] if overall_highlight: overall_header_classes.append("highlight-header") table_html += f""" """ for level in level_ids: header_classes = ["numeric-cell"] if highlight_map.get(level): header_classes.append("highlight-header") table_html += f""" """ table_html += """ """ def safe_float(value): if value is None: return '' if isinstance(value, str) and value.strip() == '': return '' if pd.isna(value): return '' try: return float(value) except (TypeError, ValueError): return '' # Generate table rows for idx, (_, row) in enumerate(filtered_df.iterrows()): rank = idx + 1 table_html += f""" """ overall_value = safe_float(row.get(overall_column, '')) if overall_value != '': overall_display = f'{overall_value:.3f}' else: overall_display = '-' overall_classes = ["numeric-cell"] if overall_highlight: overall_classes.append("highlight-cell") table_html += f'' for level in level_ids: sr_col = sr_column_map[level] value = safe_float(row.get(sr_col, '')) if value != '': value_display = f'{value:.3f}' else: value_display = '-' cell_classes = ["numeric-cell"] if highlight_map.get(level): cell_classes.append("highlight-cell") table_html += f'' table_html += "" table_html += """
Rank Model Vendor LLM Type Overall {level}
{get_rank_badge(rank)} {row['Model']} {row['Vendor']} {get_type_badge(row['Model Type'])}{overall_display}{value_display}
""" return table_html def update_leaderboard_title(level_filter): """Update the leaderboard title based on selected level""" level_key = resolve_level(level_filter) level_info = level_details.get(level_key, level_details[default_level]) level_title = level_info["title"] level_description = level_info["description"] return f"""

Agent Leaderboard · {level_title}

{level_description}

""" model_type_lookup = { "OSS": "Open source", "API": "Proprietary" } def apply_filters(df, level_filter, model_type_filter, sort_order, sort_by="Overall Success"): """Apply shared filters and sorting to the leaderboard dataframe.""" filtered_df = df.copy() level_key = resolve_level(level_filter) highlight_column = None if model_type_filter != "All": mapped_type = model_type_lookup.get(model_type_filter, model_type_filter) filtered_df = filtered_df[filtered_df['Model Type'] == mapped_type] actual_sort_column = sort_by if sort_by in filtered_df.columns else None if not actual_sort_column: if level_key == "ALL": actual_sort_column = overall_sort_column if overall_sort_column in filtered_df.columns else None else: actual_sort_column = sr_column_map.get(level_key) if level_key in sr_column_map: highlight_column = sr_column_map[level_key] elif level_key == "ALL" and overall_sort_column in filtered_df.columns: highlight_column = overall_sort_column if actual_sort_column and actual_sort_column in filtered_df.columns: ascending = (sort_order == "Ascending") filtered_df = filtered_df.sort_values(by=actual_sort_column, ascending=ascending, na_position='last') return filtered_df, level_key, highlight_column def filter_and_sort_data(level_filter, model_type_filter, sort_by, sort_order): """Filter and sort the leaderboard data""" df = load_leaderboard_data() filtered_df, level_key, highlight_column = apply_filters(df, level_filter, model_type_filter, sort_order, sort_by) # Generate HTML table return generate_html_table(filtered_df, highlight_column) # Load initial data initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending") initial_df = load_leaderboard_data() # Load raw data for model selector if not initial_df.empty: overall_success_numeric = pd.to_numeric(initial_df.get('Overall Success'), errors='coerce') if overall_success_numeric.notna().any(): initial_df = initial_df.assign(**{'Overall Success': overall_success_numeric}).sort_values( 'Overall Success', ascending=False, na_position='last' ) else: initial_df = initial_df.sort_values('Model') initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else [] initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else [] initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models) initial_level_metric_level = level_ids[0] if level_ids else None initial_level_model_choices = initial_df['Model'].tolist() if len(initial_df) > 0 else [] initial_level_model_values = initial_level_model_choices[:5] initial_level_metric_chart = create_level_metric_chart( initial_df, initial_level_metric_level, initial_level_model_values ) if initial_level_metric_level else create_empty_level_metric_chart("No level metrics available") # Load custom CSS and responsive styles custom_css = get_leaderboard_css() + get_responsive_styles() + """ """ gr.HTML(custom_css) # Header styles and navigation gr.HTML(""" """) gr.HTML("
") gr.Image( value="banner_wide.png", show_label=False, interactive=False, type="filepath", elem_id="hero-banner" ) gr.HTML("
") gr.HTML("""

Hugging Face KREW Ko-AgentBench

An agent benchmark optimized for real-world Korean usage.

""") # Links section below title gr.HTML(""" """) # Section 1: Task Design by Stage gr.HTML("""

7-Level Task Design

We evaluate agent capabilities across seven levels, from simple tool calls to error handling and long-context retention.

Single Turn

80%
  • L1: Single Tool Call
  • L2: Tool Selection
  • L3: Sequential Tool Reasoning
  • L4: Parallel Tool Reasoning
  • L5: Error Handling & Robustness

Multi Turn

20%
  • L6: Efficient Tool Utilization
  • L7: Long-Context Memory
""") # Section 2: Core Scenario Design gr.HTML("""

High-quality scenario design tailored to 18 Korea-specific APIs and real-world use cases.

We built realistic scenarios, such as appointment booking and blog review search, by integrating APIs widely used in Korea, including Naver Maps, Kakao services, and local websites.

""") # Section 3: Key Evaluation Criteria gr.HTML("""

Key Evaluation Criteria

Cache-based Iterative Evaluation

  • Improves handling of failed API responses
  • Addresses chronic benchmark issues such as mismatched response attributes
  • Ensures benchmark consistency and reliability

Robustness Testing

  • Evaluates recognition and response strategies for intentional failure scenarios (e.g., discontinued products)
  • Surfaces models that remain stable in real-world deployments

Level-specific Precision Metrics

  • Evaluates each phase of problem solving, including tool selection, parameter setup, and data flow
  • Quantitatively identifies model strengths and weaknesses
""") # Metrics overview cards removed per updated design # Domain filter section with enhanced styling gr.HTML(""" """) level_options = list(level_details.keys()) # Main leaderboard table with dynamic title and integrated controls leaderboard_title = gr.HTML(update_leaderboard_title(default_level)) # Integrated controls within leaderboard section - stacked vertically gr.HTML("

Select Task Level

") domain_filter = gr.Radio( choices=level_options, value=default_level, label="", interactive=True, container=False, elem_classes=["domain-radio", "inline-radio"] ) gr.HTML("

🔍 Filters & Sorting

") with gr.Row(): with gr.Column(scale=1): gr.HTML("Model Access") model_type_filter = gr.Radio( choices=["All", "OSS", "API"], value="All", label="", elem_classes=["domain-radio", "inline-radio"], container=False ) with gr.Column(scale=1): gr.HTML("You can select up to five models.

") model_selector = gr.Dropdown( choices=initial_df['Model'].tolist()[:10], value=initial_df['Model'].tolist()[:5], multiselect=True, label="", info=None, container=False, ) # Radar chart plot - wrapped in centered container gr.HTML('
') radar_chart = gr.Plot( label="", value=create_domain_radar_chart( load_leaderboard_data(), initial_df['Model'].tolist()[:5] ), elem_classes=["radar-chart", "plot-container"] ) gr.HTML('
') gr.HTML("
") # Define generate_performance_card function before using it def generate_performance_card(model_name): """Generate HTML for the model performance card""" if not model_name: return """
Please select a model to generate its performance card
""" # Get model data df = load_leaderboard_data() model_data = df[df['Model'] == model_name] if model_data.empty: return """
Model not found in the database
""" row = model_data.iloc[0] # Get overall rank based on overall success df_with_success = df.copy() df_with_success['Overall Success'] = pd.to_numeric(df_with_success.get('Overall Success', pd.Series()), errors='coerce') df_with_success = df_with_success[df_with_success['Overall Success'].notna()] df_sorted = df_with_success.sort_values('Overall Success', ascending=False).reset_index(drop=True) try: rank = df_sorted[df_sorted['Model'] == model_name].index[0] + 1 except: rank = 'N/A' # Format values def format_value(val, decimals=3, prefix='', suffix=''): if pd.isna(val) or val == '': return 'N/A' return f"{prefix}{float(val):.{decimals}f}{suffix}" def format_score(value): if pd.isna(value) or value == '': return 'N/A' return f"{float(value):.3f}" radar_metrics = [ ("Execution Accuracy", row.get('Execution Accuracy')), ("Complex Reasoning", row.get('Complex Reasoning')), ("Robustness", row.get('Robustness')), ("Context & Efficiency", row.get('Context & Efficiency')), ("Overall Success", row.get('Overall Success')), ("Validity", row.get('Call Validity')), ] radar_values = [] radar_labels = [] for label, value in radar_metrics: if pd.isna(value) or value == '': radar_values.append(0.0) else: try: radar_values.append(max(0.0, min(1.0, float(value)))) except (TypeError, ValueError): radar_values.append(0.0) radar_labels.append(label) mini_radar_html = build_static_radar_chart(radar_values, radar_labels) level_blocks = [] for level in level_ids: sr_col = sr_column_map.get(level) level_blocks.append((level, row.get(sr_col, ''))) evaluation_date = EVALUATION_DATE icon_html = "" if KREW_ICON_BASE64: icon_html = f'Krew icon' else: icon_html = '
🤖
' card_html = f"""
{icon_html}
{model_name}
Vendor · {row['Vendor']}
Evaluation Date {evaluation_date}
RANK
#{rank}
{mini_radar_html}
""" ordered_labels = ["Execution Accuracy", "Complex Reasoning", "Robustness", "Context & Efficiency", "Overall Success", "Validity"] ordered_metrics = sorted(radar_metrics, key=lambda x: ordered_labels.index(x[0]) if x[0] in ordered_labels else len(ordered_labels)) top_metrics = ordered_metrics[:3] bottom_metrics = ordered_metrics[3:] card_html += """
""" for label, value in top_metrics: card_html += f"""
{label}
{format_score(value)}
""" card_html += """
""" for label, value in bottom_metrics: card_html += f"""
{label}
{format_score(value)}
""" card_html += """
""" for level, value in level_blocks: card_html += f"""
{level}
{format_score(value)}
""" card_html += """
""" return card_html # MODEL PERFORMANCE CARD SECTION gr.HTML("""

Model Performance Card

Explore detailed performance cards that visualize six core metrics plus overall SR across L1–L7 levels.

※ Ranks are determined by the average SR across L1–L7.

""") with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"): gr.HTML("""

Choose a model to generate its analysis card.

""") card_model_selector = gr.Dropdown( choices=initial_df['Model'].tolist(), value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None, label="", info=None, container=False, # elem_classes=["model-dropdown"] ) download_card_btn = gr.Button( "Download as PNG", elem_id="download-card-btn-en", elem_classes=["pill-button"] ) gr.HTML("""
""") # Card display area - generate initial card initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None initial_card_html = generate_performance_card(initial_model) if initial_model else "" card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html-en") gr.HTML("""
""") # Level metric breakdown section gr.HTML("""

Level-specific Metrics

Compare model scores with each Ko-AgentBench level's dedicated metrics for deeper insights.

""") with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"): level_metric_selector = gr.Dropdown( choices=level_ids, value=level_ids[0] if level_ids else None, multiselect=False, label="", info=None, container=False, elem_classes=["level-dropdown"] ) level_model_selector = gr.Dropdown( choices=initial_level_model_choices, value=initial_level_model_values, multiselect=True, label="", info=None, container=False, elem_classes=["model-dropdown", "level-model-dropdown"] ) gr.HTML('
') level_metric_chart = gr.Plot( label="", value=initial_level_metric_chart, elem_classes=["level-metric-plot", "plot-container"] ) gr.HTML("""
""") # # Heatmap section # gr.HTML(""" #
#
#

Comprehensive Performance Heatmap

#

See each model's L1–L7 SR scores at a glance.

#
#
# """) # heatmap_chart = gr.Plot( # label="", # value=initial_heatmap, # elem_classes=["heatmap-plot", "plot-container"] # ) # gr.HTML(""" #
#
# """) # Update functions def get_optimal_sort_order(sort_by_value): """Return the optimal sort order for a given metric""" # Metrics where higher is better (descending) descending_metrics = ["Overall Success"] + [sr_column_map[level] for level in level_ids] # Metrics where lower is better (ascending) ascending_metrics = [] if sort_by_value in descending_metrics: return "Descending" elif sort_by_value in ascending_metrics: return "Ascending" else: return "Descending" # Default fallback def update_table(level_filter, model_type_filter, sort_order): title_html = update_leaderboard_title(level_filter) sort_metric = "Overall Success" if level_filter == "ALL" else sr_column_map.get(resolve_level(level_filter), "Overall Success") table_html = filter_and_sort_data(level_filter, model_type_filter, sort_metric, sort_order) return title_html, table_html def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models): # Get filtered dataframe df = load_leaderboard_data() sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success") filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric) # Update model selector choices based on filtered data available_models_all = filtered_df['Model'].tolist() available_models = available_models_all[:15] # Top 15 from filtered results # If selected models are not in available models, reset to top 5 if selected_models: valid_selected = [m for m in selected_models if m in available_models] # Check if more than 5 models are selected and show alert if len(valid_selected) > 5: gr.Warning("You can select up to 5 models.") # Remove the last selected item (6th item) instead of keeping first 5 valid_selected = valid_selected[:-1] if not valid_selected: valid_selected = available_models[:5] else: valid_selected = available_models[:5] # Create radar chart chart = create_domain_radar_chart(filtered_df, valid_selected) # Prepare heatmap order prioritizing selected models # Level metric chart effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None) available_level_models = available_models_all if level_selected_models: valid_level_models = [m for m in level_selected_models if m in available_level_models][:5] if not valid_level_models: valid_level_models = available_level_models[:5] else: valid_level_models = available_level_models[:5] level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics") return ( gr.Dropdown( choices=available_models, value=valid_selected, multiselect=True, label="", info=None, container=False, # elem_classes=["model-dropdown"] ), chart, gr.Dropdown( choices=available_level_models, value=valid_level_models, multiselect=True, label="", info=None, container=False, elem_classes=["model-dropdown", "level-model-dropdown"] ), level_metric_fig, ) def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models): # Get filtered dataframe df = load_leaderboard_data() sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success") filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric) available_models_all = filtered_df['Model'].tolist() if selected_models: 
valid_selected = [m for m in selected_models if m in available_models_all] # Check if more than 5 models are selected and show alert if len(valid_selected) > 5: # JavaScript alert for exceeding 5 models gr.Warning("You can select up to 5 models.") # Remove the last selected item (6th item) instead of keeping first 5 valid_selected = valid_selected[:-1] if not valid_selected: valid_selected = available_models_all[:5] else: valid_selected = available_models_all[:5] effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None) available_level_models = available_models_all if level_selected_models: valid_level_models = [m for m in level_selected_models if m in available_level_models][:5] if not valid_level_models: valid_level_models = available_level_models[:5] else: valid_level_models = available_level_models[:5] level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics") return ( gr.Dropdown( choices=available_models_all[:15], value=valid_selected, multiselect=True, label="", info=None, container=False, ), create_domain_radar_chart(filtered_df, valid_selected), gr.Dropdown( choices=available_level_models, value=valid_level_models, multiselect=True, label="", info=None, container=False, elem_classes=["model-dropdown", "level-model-dropdown"] ), level_metric_fig, ) def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models): df = load_leaderboard_data() sort_metric = "Overall Success" if domain_filter == "ALL" else sr_column_map.get(resolve_level(domain_filter), "Overall Success") filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric) available_models = filtered_df['Model'].tolist() if level_selected_models: valid_level_models = [m for m in level_selected_models if m in available_models] # Check if more than 5 models are selected and show alert if len(valid_level_models) > 5: gr.Warning("You can select up to 5 models.") # Remove the last selected item (6th item) instead of keeping first 5 valid_level_models = valid_level_models[:-1] if not valid_level_models: valid_level_models = available_models[:5] else: valid_level_models = available_models[:5] effective_level = selected_level if selected_level in level_ids else (level_ids[0] if level_ids else None) level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models) if effective_level else create_empty_level_metric_chart("Select a level to view its metrics") return ( gr.Dropdown( choices=available_models, value=valid_level_models, multiselect=True, label="", info=None, container=False, elem_classes=["model-dropdown", "level-model-dropdown"] ), level_chart, ) # Update table when filters change filter_inputs = [domain_filter, model_type_filter, sort_order] for input_component in filter_inputs: input_component.change( fn=update_table, inputs=filter_inputs, outputs=[leaderboard_title, leaderboard_table] ) # Also update radar chart when filters change input_component.change( fn=update_radar_chart, inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector], outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart] ) # Update radar chart when model selection changes model_selector.change( fn=update_radar_only, inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector], 
outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart] ) level_metric_selector.change( fn=update_level_metric_only, inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector], outputs=[level_model_selector, level_metric_chart] ) level_model_selector.change( fn=update_level_metric_only, inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector], outputs=[level_model_selector, level_metric_chart] ) # Add custom CSS for the performance card gr.HTML(""" """) # Wire up the card generator to selection change card_model_selector.change( fn=generate_performance_card, inputs=[card_model_selector], outputs=[card_display] ) # Wire up download button with html2canvas capture download_card_btn.click( fn=None, js=""" async () => { const ensureHtml2Canvas = () => new Promise((resolve, reject) => { if (window.html2canvas) { resolve(window.html2canvas); return; } const existing = document.querySelector('script[data-html2canvas]'); if (existing) { existing.addEventListener('load', () => resolve(window.html2canvas)); existing.addEventListener('error', reject); return; } const script = document.createElement('script'); script.src = 'https://cdn.jsdelivr.net/npm/html2canvas@1.4.1/dist/html2canvas.min.js'; script.async = true; script.dataset.html2canvas = 'true'; script.onload = () => resolve(window.html2canvas); script.onerror = () => reject(new Error('Failed to load html2canvas')); document.head.appendChild(script); }); const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms)); await pause(60); const container = document.getElementById('performance-card-html-en'); const card = container?.querySelector('.performance-card'); if (!container || !card) { alert('Performance card not found. Please select a model first.'); return; } const btn = document.getElementById('download-card-btn-en'); const originalText = btn?.textContent || ''; if (btn) { btn.textContent = 'Generating...'; btn.disabled = true; } try { const html2canvasLib = await ensureHtml2Canvas(); if (!html2canvasLib) { throw new Error('html2canvas unavailable'); } const canvas = await html2canvasLib(card, { backgroundColor: '#01091A', scale: 2, logging: false, useCORS: true }); if (!canvas || !canvas.width || !canvas.height) { throw new Error('Captured canvas is empty'); } const link = document.createElement('a'); const modelName = card.querySelector('.card-model-name')?.textContent || 'model'; const timestamp = new Date().toISOString().slice(0, 10); const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`; link.download = fileName; const dataUrl = canvas.toDataURL('image/png'); if (!dataUrl || dataUrl === 'data:,' || dataUrl.length <= 'data:image/png;base64,'.length) { throw new Error('Failed to generate PNG data'); } link.href = dataUrl; document.body.appendChild(link); link.click(); document.body.removeChild(link); } catch (error) { console.error('Error capturing card:', error); alert('Failed to capture performance card. 
Please try again.'); } finally { if (btn) { btn.textContent = originalText; btn.disabled = false; } } } """ ) # Also update card when filters change to keep model selector in sync for input_component in filter_inputs: def update_dropdown_and_card(*args): filtered_df, _, _ = apply_filters( load_leaderboard_data(), args[0], args[1], args[2], "Overall Success" if args[0] == "ALL" else sr_column_map.get(resolve_level(args[0]), "Overall Success") ) choices = filtered_df['Model'].tolist() # Select first model from filtered list value = choices[0] if choices else None return gr.Dropdown( choices=choices, value=value, label="", info=None, container=False, # elem_classes=["model-dropdown"] ) input_component.change( fn=update_dropdown_and_card, inputs=filter_inputs, outputs=[card_model_selector] ) return leaderboard_table def create_leaderboard_v2_interface(): """Create the complete leaderboard v1 interface""" return create_leaderboard_v2_tab() def create_domain_radar_chart(df, selected_models=None, max_models=5): """Visualize six core capability metrics on a radar chart.""" df = df.copy() metrics_info = [ {"column": "Overall Success", "label": "Overall Success", "description": "Average SR across L1-L7"}, {"column": "Execution Accuracy", "label": "Execution Accuracy", "description": "CallEM · ArgAcc · SelectAcc"}, {"column": "Complex Reasoning", "label": "Complex Reasoning", "description": "ProvAcc · PSM · Coverage"}, {"column": "Robustness", "label": "Robustness", "description": "AdaptiveRouting · FallbackSR"}, {"column": "Context & Efficiency", "label": "Context & Efficiency", "description": "ReuseRate · EffScore · ContextRetention"}, {"column": "Call Validity", "label": "Call Validity", "description": "Average EPR_CVR across levels"}, ] required_columns = [m["column"] for m in metrics_info] if df.empty or not any(col in df.columns for col in required_columns): return create_empty_radar_chart("Not enough data to build the capability radar") # Default model selection if not selected_models: if "Overall Success" in df.columns: top_models = df.sort_values("Overall Success", ascending=False) else: top_models = df selected_models = top_models['Model'].head(max_models).tolist() selected_models = selected_models[:max_models] # Ensure metric columns are numeric for metric in metrics_info: col = metric["column"] if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce') fig = go.Figure() angle_labels = [m["label"] for m in metrics_info] palette = [ {'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'}, {'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'}, {'fill': 'rgba(161, 98, 7, 0.22)', 'line': '#A16207'}, {'fill': 'rgba(220, 38, 38, 0.20)', 'line': '#DC2626'}, {'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'}, ] for idx, model_name in enumerate(selected_models): model_data = df[df['Model'] == model_name] if model_data.empty: continue row = model_data.iloc[0] values = [] tooltips = [] for metric in metrics_info: col = metric["column"] value = row[col] if col in row else float('nan') if pd.isna(value) or value == '': value = 0 values.append(float(value)) tooltips.append(metric["description"]) if not values: continue values_loop = values + [values[0]] angles_loop = angle_labels + [angle_labels[0]] tooltips_loop = tooltips + [tooltips[0]] colors = palette[idx % len(palette)] fig.add_trace( go.Scatterpolar( r=values_loop, theta=angles_loop, fill='toself', fillcolor=colors['fill'], line=dict(color=colors['line'], width=3), marker=dict( size=10, color=colors['line'], symbol='circle', 
line=dict(width=2, color='#01091A') ), name=model_name, customdata=tooltips_loop, mode="lines+markers", hovertemplate="%{fullData.name}
" + "%{theta}
" + "%{customdata}
" + "%{r:.3f}
" + "", hoverlabel=dict( bgcolor="rgba(1, 9, 26, 0.95)", bordercolor=colors['line'], font=dict(color="white", size=12, family="'Geist', sans-serif") ) ) ) tick_vals = [i / 5 for i in range(6)] tick_text = [f"{val:.2f}" for val in tick_vals] fig.update_layout( polar=dict( bgcolor='rgba(245, 246, 247, 0.03)', radialaxis=dict( visible=True, range=[0, 1], showline=True, linewidth=2, linecolor='rgba(245, 246, 247, 0.2)', gridcolor='rgba(245, 246, 247, 0.1)', gridwidth=1, tickvals=tick_vals, ticktext=tick_text, tickfont=dict( size=11, color='white', family="'Geist Mono', monospace" ) ), angularaxis=dict( showline=True, linewidth=2, linecolor='rgba(245, 246, 247, 0.2)', gridcolor='rgba(245, 246, 247, 0.08)', tickfont=dict( size=13, family="'Geist', sans-serif", color='white', weight=600 ), rotation=90, direction="clockwise", ), ), showlegend=True, legend=dict( orientation="h", yanchor="bottom", y=-0.15, xanchor="center", x=0.5, font=dict(size=12, family="'Geist', sans-serif", color='white'), bgcolor='rgba(1, 9, 26, 0.8)', bordercolor='rgba(245, 246, 247, 0.2)', borderwidth=1, itemsizing='constant', itemwidth=30 ), title=dict( text="Core Capability Radar", x=0.5, y=0.97, font=dict( size=22, family="'Geist', sans-serif", color="white", weight=700 ), ), paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=800, width=900, margin=dict(t=30, b=50, l=10, r=10), autosize=True, annotations=[] ) return fig def create_performance_heatmap(df, ordered_models=None, max_models=12): """Render a heatmap of SR scores across task levels for selected models.""" df = df.copy() level_sequence = [f"L{i}" for i in range(1, 8)] sr_columns = [] for level in level_sequence: col = f"{level}_SR" if col in df.columns: df[col] = pd.to_numeric(df[col], errors="coerce") sr_columns.append((level, col)) if df.empty or not sr_columns: return create_empty_heatmap("Not enough SR data to render the heatmap") df = df.drop_duplicates(subset=["Model"]) if df.empty: return create_empty_heatmap("No models available to render the heatmap") sort_column = "Overall Success" if "Overall Success" in df.columns else sr_columns[0] df = df.sort_values(sort_column, ascending=False) if ordered_models: ordered_models = [m for m in ordered_models if m in df["Model"].tolist()] else: ordered_models = df["Model"].tolist() if not ordered_models: return create_empty_heatmap("No models available to render the heatmap") ordered_models = ordered_models[:max_models] heatmap_df = df.set_index("Model").reindex(ordered_models) level_labels = [] z_matrix = [] has_values = False for level, col in sr_columns: if col not in heatmap_df.columns: continue label = f"{level} · SR" level_labels.append(label) row_values = [] for model in ordered_models: value = heatmap_df.at[model, col] if model in heatmap_df.index else None if pd.isna(value): row_values.append(None) else: val = float(value) row_values.append(val) has_values = True z_matrix.append(row_values) if not level_labels or not has_values: return create_empty_heatmap("Not enough SR data to render the heatmap") colorscale = [ [0.0, "#0A0A0A"], [0.25, "#1A1411"], [0.5, "#332818"], [0.75, "#B8660A"], [1.0, "#FFD21E"], ] fig = go.Figure() fig.add_trace( go.Heatmap( z=z_matrix, x=ordered_models, y=level_labels, colorscale=colorscale, zmin=0, zmax=1, hovertemplate="%{y}
<br>%{x}<br>
SR · %{z:.3f}", colorbar=dict( title="Success Rate", titlefont=dict(color="white", family="'Geist', sans-serif", size=12), tickfont=dict(color="white", family="'Geist', sans-serif", size=10), thickness=12, len=0.7, outlinecolor="rgba(255, 255, 255, 0.1)", bgcolor="rgba(1, 9, 26, 0.75)" ), showscale=True ) ) annotations = [] for y_idx, level in enumerate(level_labels): for x_idx, model in enumerate(ordered_models): value = z_matrix[y_idx][x_idx] if value is None: continue font_color = "#0B1120" if value >= 0.6 else "#F8FAFC" annotations.append( dict( x=model, y=level, text=f"{value:.3f}", showarrow=False, font=dict( family="'Geist Mono', monospace", size=11, color=font_color ) ) ) fig.update_layout( paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", margin=dict(t=80, b=90, l=110, r=160), height=520, width=1450, font=dict(family="'Geist', sans-serif", color="white"), xaxis=dict( tickangle=-25, showgrid=False, ticks="", tickfont=dict(size=11, family="'Geist', sans-serif", color="white") ), yaxis=dict( showgrid=False, ticks="", tickfont=dict(size=12, family="'Geist', sans-serif", color="white") ), annotations=annotations, title=dict( text="Comprehensive Performance Heatmap", x=0.5, y=0.98, font=dict( size=20, family="'Geist', sans-serif", color="white", weight=700 ), ) ) fig.update_xaxes(side="bottom") return fig def create_empty_heatmap(message): """Render an empty state for the heatmap with a centered message.""" fig = go.Figure() fig.add_annotation( text=f"🗺️ {message}", xref="paper", yref="paper", x=0.5, y=0.5, xanchor='center', yanchor='middle', font=dict( size=18, color="white", family="'Geist', sans-serif" ), showarrow=False, bgcolor="rgba(245, 246, 247, 0.05)", bordercolor="rgba(245, 246, 247, 0.2)", borderwidth=1, borderpad=20 ) fig.update_layout( paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=520, # width=1450, autosize=True, margin=dict(t=80, b=80, l=80, r=160), title=dict( text="Comprehensive Performance Heatmap", x=0.5, y=0.98, font=dict( size=20, family="'Geist', sans-serif", color="white", weight=700 ), ) ) fig.update_xaxes(visible=False) fig.update_yaxes(visible=False) return fig def create_level_metric_chart(df, level, selected_models=None, max_models=5): """Render a grouped horizontal bar chart showing per-model scores for a level's metrics.""" if not level: return create_empty_level_metric_chart("Select a level to view its metrics") df = df.copy() level_prefix = f"{level}_" level_columns = [col for col in df.columns if col.startswith(level_prefix)] metric_columns = [] for col in level_columns: metric_suffix = col[len(level_prefix):] metric_key_lower = metric_suffix.lower() if "cost" in metric_key_lower: continue numeric_series = pd.to_numeric(df[col], errors='coerce') valid_values = numeric_series.dropna() if valid_values.empty: continue if (valid_values < 0).any() or (valid_values > 1.05).any(): continue df[col] = numeric_series metric_columns.append(col) if not metric_columns: return create_empty_level_metric_chart("This level has no 0-1 metrics to visualize") df = df.drop_duplicates(subset=['Model']) if df.empty: return create_empty_level_metric_chart("No models available to render level metrics") if selected_models: model_order = [m for m in selected_models if m in df['Model'].tolist()] else: sort_col = 'Overall Success' if 'Overall Success' in df.columns else metric_columns[0] model_order = df.sort_values(sort_col, ascending=False)['Model'].tolist() if not model_order: model_order = df['Model'].tolist() model_order = 
model_order[:max_models] df_models = df[df['Model'].isin(model_order)].set_index('Model') if df_models.empty: return create_empty_level_metric_chart("No matching models for selected filters") def prettify_metric_name(metric_key): raw = metric_key[len(level_prefix):] text = raw.replace('_', ' ') text = re.sub(r'(?<=.)([A-Z])', r' \1', text) text = text.replace('Avg', 'Average') replacements = { 'Sr': 'SR', 'Ac': 'AC', 'Tsq': 'TSQ', 'Cvr': 'CVR', 'Psm': 'PSM', 'Prov': 'Prov', 'Call Em': 'CallEM', 'Reuse Rate': 'Reuse Rate', 'Eff Score': 'Eff Score' } words = text.title().split() words = [replacements.get(word, word) for word in words] return ' '.join(words) metric_labels = [] for col in metric_columns: label = prettify_metric_name(col) if label in metric_labels: suffix = 2 while f"{label} ({suffix})" in metric_labels: suffix += 1 label = f"{label} ({suffix})" metric_labels.append(label) model_palette = [ '#ffd21e', '#FF8A3C', '#A16207', '#DC2626', '#F8FAFC', '#38BDF8', ] fig = go.Figure() max_value = 0 for idx, model in enumerate(model_order): values = [] for col in metric_columns: value = df_models.at[model, col] if (model in df_models.index and col in df_models.columns) else float('nan') if pd.notna(value): values.append(float(value)) max_value = max(max_value, float(value)) else: values.append(None) color = model_palette[idx % len(model_palette)] fig.add_trace( go.Bar( name=model, y=metric_labels, x=values, orientation='h', marker=dict(color=color, line=dict(color='rgba(1,9,26,0.8)', width=1)), hovertemplate="%{y}
<br>Model · %{fullData.name}<br>
Score · %{x:.3f}", ) ) plot_height = max(360, 140 + 48 * len(metric_labels)) if max_value <= 0: x_range = [0, 1] else: x_range = [0, max_value * 1.05] fig.update_layout( barmode='group', bargap=0.25, bargroupgap=0.18, paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=plot_height, # width=1450, autosize=True, margin=dict(t=90, b=80, l=220, r=160), legend=dict( orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, bgcolor='rgba(1, 9, 26, 0.75)', bordercolor='rgba(245, 246, 247, 0.2)', borderwidth=1, font=dict(size=11, family="'Geist', sans-serif", color='white') ), xaxis=dict( title=dict(text=f"{level} Metric Score", font=dict(size=14, color="white")), tickfont=dict(size=11, color="white"), gridcolor='rgba(245, 246, 247, 0.08)', zerolinecolor='rgba(245, 246, 247, 0.18)', range=x_range ), yaxis=dict( tickfont=dict(size=13, color="white"), automargin=True ), title=dict( text=f"{level} Metric Breakdown", x=0.5, y=0.98, font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700) ) ) return fig def create_empty_level_metric_chart(message): fig = go.Figure() fig.add_annotation( text=f"🧭 {message}", xref="paper", yref="paper", x=0.5, y=0.5, xanchor='center', yanchor='middle', font=dict(size=18, color="white", family="'Geist', sans-serif"), showarrow=False, bgcolor="rgba(245, 246, 247, 0.05)", bordercolor="rgba(245, 246, 247, 0.2)", borderwidth=1, borderpad=20 ) fig.update_layout( paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=420, width=1450, margin=dict(t=80, b=60, l=80, r=120), title=dict( text="Level Metric Breakdown", x=0.5, y=0.98, font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700) ) ) fig.update_xaxes(visible=False) fig.update_yaxes(visible=False) return fig def create_empty_radar_chart(message): """Create an empty radar chart with a message""" fig = go.Figure() fig.add_annotation( text=f"📊 {message}", xref="paper", yref="paper", x=0.5, y=0.5, xanchor='center', yanchor='middle', font=dict( size=18, color="white", family="'Geist', sans-serif" ), showarrow=False, bgcolor="rgba(245, 246, 247, 0.05)", bordercolor="rgba(245, 246, 247, 0.2)", borderwidth=1, borderpad=20 ) fig.update_layout( paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=1450, width=1450, margin=dict(t=100, b=80, l=80, r=200), title=dict( text="Core Capability Radar", x=0.5, y=0.97, font=dict( size=22, family="'Geist', sans-serif", color="white", weight=700 ), ), annotations=[ dict( xref="paper", yref="paper", x=0.98, y=0.02, xanchor='right', yanchor='bottom', font=dict(size=10, color='#64748B'), showarrow=False ) ] ) return fig # NEW VISUALIZATION FUNCTIONS def create_cost_performance_scatter(df, metric="Avg AC"): """Create scatter plot showing cost vs performance efficiency""" # Filter out models without cost or performance data df_filtered = df[(df['Avg Total Cost'] != '') & (df[metric] != '')].copy() label_map = { 'Proprietary': 'API', 'Open source': 'OSS' } if df_filtered.empty: return create_empty_chart("No data available for cost-performance analysis") # Convert to numeric df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce') df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce') df_filtered['Avg Turns'] = pd.to_numeric(df_filtered['Avg Turns'], errors='coerce') # Create color mapping for model type color_map = { 'Proprietary': '#1098F7', # Airglow Blue for Proprietary 'Open source': '#58BC82' # Green for Open source } df_filtered['Color'] 
= df_filtered['Model Type'].map(color_map).fillna('#F5F6F7') fig = go.Figure() # Add scatter points for model_type in df_filtered['Model Type'].unique(): df_type = df_filtered[df_filtered['Model Type'] == model_type] legend_name = label_map.get(model_type, model_type) fig.add_trace(go.Scatter( x=df_type[metric], y=df_type['Avg Total Cost'], mode='markers+text', name=legend_name, text=df_type['Model'], textposition="top center", textfont=dict(size=10, color='white'), marker=dict( size=df_type['Avg Turns'] * 3, # Size based on number of turns color=color_map.get(model_type, '#F5F6F7'), opacity=0.8, line=dict(width=2, color='#01091A') ), hovertemplate="%{text}
" + f"{metric}: %{{x:.3f}}
" + "Cost: $%{y:.3f}
" + "Turns: %{marker.size:.1f}
" + "" )) # Add quadrant lines median_x = df_filtered[metric].median() median_y = df_filtered['Avg Total Cost'].median() fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5) fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5) # Add quadrant labels fig.add_annotation(x=0.95, y=0.05, text="💎 High Performance
Low Cost", showarrow=False, xref="paper", yref="paper", font=dict(size=12, color="white"), bgcolor="rgba(245, 246, 247, 0.1)") fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance
High Cost", showarrow=False, xref="paper", yref="paper", font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)") metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality" fig.update_layout( title=dict( text=f"Cost-Performance Efficiency: {metric_display}", x=0.5, y=0.97, font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700) ), xaxis=dict( title=dict( text=f"{metric_display}", font=dict(size=16, color="white") ), tickfont=dict(size=12, color="white"), gridcolor="rgba(245, 246, 247, 0.1)", zerolinecolor="rgba(245, 246, 247, 0.2)" ), yaxis=dict( title=dict( text="Average Session Cost ($)", font=dict(size=16, color="white") ), tickfont=dict(size=12, color="white"), gridcolor="rgba(245, 246, 247, 0.1)", zerolinecolor="rgba(245, 246, 247, 0.2)" ), paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=900, width=1450, showlegend=True, legend=dict( orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, font=dict(size=12, family="'Geist', sans-serif", color='white'), bgcolor='rgba(1, 9, 26, 0.8)', bordercolor='rgba(245, 246, 247, 0.2)', borderwidth=1 ), margin=dict(t=100, b=80, l=80, r=80) ) return fig def create_speed_accuracy_plot(df, metric="Avg AC"): """Create scatter plot showing speed vs accuracy trade-off""" # Filter out models without duration or performance data df_filtered = df[(df['Avg Session Duration'] != '') & (df[metric] != '')].copy() if df_filtered.empty: return create_empty_chart("No data available for speed-accuracy analysis") # Convert to numeric df_filtered['Avg Session Duration'] = pd.to_numeric(df_filtered['Avg Session Duration'], errors='coerce') df_filtered[metric] = pd.to_numeric(df_filtered[metric], errors='coerce') # Create color scale based on cost df_filtered['Avg Total Cost'] = pd.to_numeric(df_filtered['Avg Total Cost'], errors='coerce') fig = go.Figure() # Add scatter trace fig.add_trace(go.Scatter( x=df_filtered[metric], y=df_filtered['Avg Session Duration'], mode='markers+text', text=df_filtered['Model'], textposition="top center", textfont=dict(size=9, color='white'), marker=dict( size=12, color=df_filtered['Avg Total Cost'], colorscale=[[0, '#0A0A0A'], [0.5, '#B8660A'], [1, '#ffd21e']], showscale=True, colorbar=dict( title=dict( text="Cost ($)", font=dict(color="white") ), tickfont=dict(color="white"), bgcolor="rgba(1, 9, 26, 0.8)", bordercolor="rgba(245, 246, 247, 0.2)", borderwidth=1, x=1.02 ), line=dict(width=2, color='#01091A') ), hovertemplate="%{text}
" + f"{metric}: %{{x:.3f}}
" + "Duration: %{y:.1f}s
" + "Cost: $%{marker.color:.3f}
" + "" )) # Add quadrant lines median_x = df_filtered[metric].median() median_y = df_filtered['Avg Session Duration'].median() fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5) fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5) # Add quadrant labels fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate", showarrow=False, xref="paper", yref="paper", font=dict(size=12, color="white", weight=600)) fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate", showarrow=False, xref="paper", yref="paper", font=dict(size=12, color="#ffd21e", weight=600)) metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality" fig.update_layout( title=dict( text=f"Speed vs Accuracy Trade-off: {metric_display}", x=0.5, y=0.97, font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700) ), xaxis=dict( title=dict( text=f"{metric_display}", font=dict(size=16, color="white") ), tickfont=dict(size=12, color="white"), gridcolor="rgba(245, 246, 247, 0.1)", zerolinecolor="rgba(245, 246, 247, 0.2)" ), yaxis=dict( title=dict( text="Average Session Duration (seconds)", font=dict(size=16, color="white") ), tickfont=dict(size=12, color="white"), gridcolor="rgba(245, 246, 247, 0.1)", zerolinecolor="rgba(245, 246, 247, 0.2)" ), paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=900, width=1450, margin=dict(t=100, b=80, l=80, r=120) ) return fig def create_domain_specialization_matrix(df, metric_type="AC"): """Create bubble chart showing domain specialization""" domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom'] # Prepare data data = [] for _, model in df.iterrows(): if model['Model'] == '': continue model_avg = pd.to_numeric(model[f'Avg {metric_type}'], errors='coerce') if pd.isna(model_avg): continue for domain in domains: domain_col = f'{domain} {metric_type}' if domain_col in model and model[domain_col] != '': domain_val = pd.to_numeric(model[domain_col], errors='coerce') if not pd.isna(domain_val): # Calculate specialization strength (deviation from model average) specialization = domain_val - model_avg data.append({ 'Model': model['Model'], 'Domain': domain, 'Performance': domain_val, 'Specialization': specialization, 'Model Type': model['Model Type'] }) if not data: return create_empty_chart("No domain specialization data available") df_plot = pd.DataFrame(data) # Create bubble chart fig = go.Figure() # Color based on specialization strength fig.add_trace(go.Scatter( x=df_plot['Domain'], y=df_plot['Model'], mode='markers', marker=dict( size=df_plot['Performance'] * 30, # Size based on absolute performance color=df_plot['Specialization'], colorscale=[[0, '#B8660A'], [0.5, '#E6B800'], [1, '#ffd21e']], showscale=True, colorbar=dict( title=dict( text="Specialization
Strength", font=dict(color="white") ), tickfont=dict(color="white"), bgcolor="rgba(1, 9, 26, 0.8)", bordercolor="rgba(245, 246, 247, 0.2)", borderwidth=1 ), line=dict(width=2, color='#01091A'), opacity=0.8 ), text=[f"Performance: {p:.3f}
Specialization: {s:+.3f}" for p, s in zip(df_plot['Performance'], df_plot['Specialization'])], hovertemplate="%{y}
" + "Domain: %{x}
" + "%{text}
" + "" )) metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality" fig.update_layout( title=dict( text=f"Domain Specialization Matrix: {metric_display}", x=0.5, y=0.97, font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700) ), xaxis=dict( title=dict( text="Business Domains", font=dict(size=16, color="white") ), tickfont=dict(size=13, color="white"), gridcolor="rgba(245, 246, 247, 0.1)" ), yaxis=dict( title=dict( text="Models", font=dict(size=16, color="white") ), tickfont=dict(size=11, color="white"), gridcolor="rgba(245, 246, 247, 0.1)" ), paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=1100, width=1450, margin=dict(t=100, b=80, l=220, r=120) ) return fig def create_performance_gap_analysis(df, metric_type="AC"): """Create range plot showing performance gaps by domain""" domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom'] # Calculate min, max, median for each domain gap_data = [] for domain in domains: domain_col = f'{domain} {metric_type}' if domain_col in df.columns: domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna() if len(domain_values) > 0: gap_data.append({ 'Domain': domain, 'Min': domain_values.min(), 'Max': domain_values.max(), 'Median': domain_values.median(), 'Q1': domain_values.quantile(0.25), 'Q3': domain_values.quantile(0.75), 'Gap': domain_values.max() - domain_values.min() }) if not gap_data: return create_empty_chart("No data available for gap analysis") df_gap = pd.DataFrame(gap_data) df_gap = df_gap.sort_values('Gap', ascending=True) fig = go.Figure() # Add range bars for idx, row in df_gap.iterrows(): # Add full range line fig.add_trace(go.Scatter( x=[row['Min'], row['Max']], y=[row['Domain'], row['Domain']], mode='lines', line=dict(color='#64748B', width=2), showlegend=False, hoverinfo='skip' )) # Add IQR box fig.add_trace(go.Scatter( x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']], y=[row['Domain'], row['Domain'], row['Domain'], row['Domain'], row['Domain']], fill='toself', fillcolor='rgba(255, 210, 30, 0.3)', line=dict(color='#ffd21e', width=2), showlegend=False, hoverinfo='skip', mode='lines' )) # Add median marker fig.add_trace(go.Scatter( x=[row['Median']], y=[row['Domain']], mode='markers', marker=dict( size=12, color='#ffd21e', symbol='diamond', line=dict(width=2, color='#01091A') ), showlegend=False, hovertemplate=f"{row['Domain']}
" + f"Min: {row['Min']:.3f}
" + f"Q1: {row['Q1']:.3f}
" + f"Median: {row['Median']:.3f}
" + f"Q3: {row['Q3']:.3f}
" + f"Max: {row['Max']:.3f}
" + f"Gap: {row['Gap']:.3f}
" + "" )) # Add min/max points for idx, row in df_gap.iterrows(): fig.add_trace(go.Scatter( x=[row['Min'], row['Max']], y=[row['Domain'], row['Domain']], mode='markers', marker=dict(size=8, color='white', line=dict(width=2, color='#01091A')), showlegend=False, hoverinfo='skip' )) metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality" fig.update_layout( title=dict( text=f"Performance Gap Analysis by Domain: {metric_display}", x=0.5, y=0.97, font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700) ), xaxis=dict( title=dict( text=f"{metric_display} Score", font=dict(size=16, color="white") ), tickfont=dict(size=12, color="white"), gridcolor="rgba(245, 246, 247, 0.1)", range=[0, 1] if metric_type in ['AC', 'TSQ'] else None ), yaxis=dict( title=dict( text="Business Domain", font=dict(size=16, color="white") ), tickfont=dict(size=13, color="white"), gridcolor="rgba(245, 246, 247, 0.1)" ), paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=800, width=1450, margin=dict(t=100, b=80, l=140, r=80), showlegend=False ) # Add legend manually fig.add_annotation( text="◆ Median ━ IQR ─ Full Range", xref="paper", yref="paper", x=0.98, y=0.02, xanchor='right', yanchor='bottom', font=dict(size=12, color='white'), showarrow=False ) return fig def create_empty_chart(message): """Create an empty chart with a message""" fig = go.Figure() fig.add_annotation( text=f"📊 {message}", xref="paper", yref="paper", x=0.5, y=0.5, xanchor='center', yanchor='middle', font=dict( size=18, color="white", family="'Geist', sans-serif" ), showarrow=False, bgcolor="rgba(245, 246, 247, 0.05)", bordercolor="rgba(245, 246, 247, 0.2)", borderwidth=1, borderpad=20 ) fig.update_layout( paper_bgcolor="#01091A", plot_bgcolor="rgba(245, 246, 247, 0.02)", height=700, width=1450, margin=dict(t=80, b=80, l=80, r=80) )