Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| """ | |
| Agent Leaderboard v1 - Main leaderboard interface | |
| Updated implementation with LLM Type support and optimized radar charts | |
| """ | |
| import base64 | |
| import math | |
| import re | |
| from datetime import datetime | |
| from pathlib import Path | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| # Import components and styles from modular files | |
| from components.leaderboard_components import ( | |
| get_chart_colors, get_rank_badge, get_type_badge, | |
| get_metric_tooltip, get_responsive_styles, get_faq_section | |
| ) | |
| from styles.leaderboard_styles import get_leaderboard_css | |
# Base64 text of the icon file; stays an empty string when the file is absent.
ASSET_ICON_PATH = Path("krew_icon.png")
KREW_ICON_BASE64 = (
    base64.b64encode(ASSET_ICON_PATH.read_bytes()).decode("utf-8")
    if ASSET_ICON_PATH.exists()
    else ""
)

# Evaluation date (YYYY-MM-DD): the CSV's modification time when the file
# exists, otherwise today's date.
CSV_PATH = Path("combined_evaluation_summary.csv")
EVALUATION_DATE = (
    datetime.fromtimestamp(CSV_PATH.stat().st_mtime)
    if CSV_PATH.exists()
    else datetime.today()
).strftime("%Y-%m-%d")
| def create_leaderboard_v2_tab(): | |
| """Create the main leaderboard v1 tab with interactive table""" | |
| token_to_cost_factor = 2e-6 # Rough cost per token ($2 per 1M tokens) | |
| tokens_per_turn = 1000 # Approximate tokens exchanged per turn for scaling | |
| level_ids = [f"L{i}" for i in range(1, 8)] | |
| level_tsq_sources = { | |
| "L1": "L1_ArgAcc", | |
| "L2": "L2_SelectAcc", | |
| "L3": "L3_PSM", | |
| "L4": "L4_Coverage", | |
| "L5": "L5_AdaptiveRoutingScore", | |
| "L6": "L6_EffScore", | |
| "L7": "L7_ContextRetention", | |
| } | |
def load_leaderboard_data():
    """Load the evaluation CSV and derive the leaderboard's display columns.

    Reads ``CSV_PATH`` and coerces every column except the text columns
    ('Model', 'Vendor', 'LLM Type') to numbers (unparseable cells become NaN).
    Derived columns are only added when their source columns exist:

    * per level: ``<level>_Avg_Cost`` (tokens * ``token_to_cost_factor``) and
      ``<level>_Avg_Turns`` (tokens / ``tokens_per_turn``);
    * row-wise averages: 'Avg AC', 'Avg TSQ', 'Avg Total Cost',
      'Avg Session Duration', 'Avg Turns';
    * radar metrics: 'Overall Success', 'Execution Accuracy',
      'Complex Reasoning', 'Robustness', 'Context & Efficiency',
      'Call Validity';
    * 'Model Type' ("Open source" / "Proprietary"), from 'LLM Type' when that
      column exists, otherwise from a vendor lookup.

    Returns:
        pandas.DataFrame: the prepared table, with missing values replaced
        by the empty string ``''``.

    Raises:
        KeyError: if neither an 'LLM Type' nor a 'Vendor' column is present.
    """
    df = pd.read_csv(CSV_PATH)

    # 'LLM Type' holds labels such as "OSS"; pushing it through
    # to_numeric(errors='coerce') would turn every label into NaN and make
    # all models map to "Proprietary" below, so it is excluded here.
    text_columns = ('Model', 'Vendor', 'LLM Type')
    for col in df.columns:
        if col not in text_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Collect the per-level source columns and derive cost / turns helpers.
    sr_columns = []
    tsq_columns = []
    duration_columns = []
    cost_columns = []
    turns_columns = []
    for level in level_ids:
        sr_col = f"{level}_SR"
        if sr_col in df.columns:
            sr_columns.append(sr_col)
            df[sr_col] = df[sr_col].round(3)

        tsq_source = level_tsq_sources.get(level)
        if tsq_source and tsq_source in df.columns:
            tsq_columns.append(tsq_source)

        duration_col = f"{level}_Avg_Exec_Time"
        if duration_col in df.columns:
            duration_columns.append(duration_col)

        token_col = f"{level}_Avg_Tokens"
        if token_col in df.columns:
            cost_col = f"{level}_Avg_Cost"
            turns_col = f"{level}_Avg_Turns"
            df[cost_col] = df[token_col] * token_to_cost_factor
            df[turns_col] = df[token_col] / tokens_per_turn
            cost_columns.append(cost_col)
            turns_columns.append(turns_col)

    # Row-wise means; each is (target column, source columns) and is skipped
    # when none of its source columns exist.
    # NOTE(review): 'L6_ReuseRage' looks like a typo for 'L6_ReuseRate' --
    # confirm against the CSV header before renaming.
    averaged_columns = [
        ('Avg AC', sr_columns),
        ('Avg TSQ', tsq_columns),
        ('Avg Total Cost', cost_columns),
        ('Avg Session Duration', duration_columns),
        ('Avg Turns', turns_columns),
        ('Overall Success', sr_columns),
        ('Execution Accuracy', ['L1_CallEM', 'L1_ArgAcc', 'L2_SelectAcc']),
        ('Complex Reasoning', ['L3_ProvAcc', 'L3_PSM', 'L4_Coverage']),
        ('Robustness', ['L5_AdaptiveRoutingScore', 'L5_FallbackSR']),
        ('Context & Efficiency', ['L6_ReuseRage', 'L6_EffScore', 'L7_ContextRetention']),
        ('Call Validity', [f"L{i}_EPR_CVR" for i in range(1, 8)]),
    ]
    for target, candidates in averaged_columns:
        present = [col for col in candidates if col in df.columns]
        if present:
            df[target] = df[present].mean(axis=1)

    if 'LLM Type' in df.columns:
        # Normalise whitespace, then map "OSS" (any case) to open source and
        # everything else (including missing values) to proprietary.
        df['LLM Type'] = df['LLM Type'].astype(str).str.strip()
        df['Model Type'] = df['LLM Type'].apply(
            lambda llm_type: "Open source" if llm_type.upper() == "OSS" else "Proprietary"
        )
    else:
        # Fallback to a vendor lookup when the CSV has no 'LLM Type' column;
        # unknown vendors are treated as proprietary.
        vendor_model_type_map = {
            "OpenAI": "Proprietary",
            "Anthropic": "Proprietary",
            "Google": "Proprietary",
            "Microsoft": "Proprietary",
            "Mistral": "Proprietary",
            "Databricks": "Open source",
            "Meta": "Open source",
            "Alibaba": "Open source",
            "알리바바": "Open source",  # Korean name for Alibaba
            "Kakao": "Open source",
            "SKT": "Open source",
            "KT": "Open source",
            "xAI": "Proprietary",
        }
        df['Model Type'] = df['Vendor'].map(vendor_model_type_map).fillna('Proprietary')

    # Round for display. The aggregate columns are rounded first, then the
    # per-level helper columns.
    round_three_cols = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Overall Success', 'Execution Accuracy',
                        'Complex Reasoning', 'Robustness', 'Context & Efficiency', 'Call Validity']
    round_one_cols = ['Avg Session Duration', 'Avg Turns']
    for col in round_three_cols:
        if col in df.columns:
            df[col] = df[col].round(3)
    for col in round_one_cols:
        if col in df.columns:
            df[col] = df[col].round(1)
    if cost_columns:
        df[cost_columns] = df[cost_columns].round(3)
    if turns_columns:
        df[turns_columns] = df[turns_columns].round(2)
    if duration_columns:
        df[duration_columns] = df[duration_columns].round(2)

    # Missing values are returned as '' (callers must expect mixed
    # str/float columns wherever data was missing).
    return df.fillna('')
def build_static_radar_chart(values, labels):
    """Render a small static radar chart as inline SVG.

    Args:
        values: One score per axis. Each score is a fraction of the outer
            ring (1.0 reaches the outermost grid polygon). Scores that are
            missing, blank, non-numeric or NaN/inf are drawn as 0, and
            scores outside [0, 1] are clamped so the polygon cannot leave
            the grid.
        labels: Axis captions, placed just outside the outer ring in the
            same order as ``values``. They are inserted as-is (not escaped).

    Returns:
        str: an ``<svg>`` element, or a placeholder ``<div>`` when there are
        no values or every value is 0 after sanitising.
    """
    def sanitize(raw):
        # Blank strings / None / NaN can reach this function because missing
        # metrics are not guaranteed to be numeric; treat them as "no score".
        try:
            number = float(raw)
        except (TypeError, ValueError):
            return 0.0
        if not math.isfinite(number):
            return 0.0
        return min(max(number, 0.0), 1.0)

    scores = [sanitize(v) for v in values] if values is not None else []

    if not scores or all(score == 0 for score in scores):
        return """
        <div class="radar-placeholder">
        <span>Radar Chart</span>
        <small>Execution Accuracy · Complex Reasoning · Robustness · Context & Efficiency · Overall Success · Validity</small>
        </div>
        """

    size = 220
    center = size / 2
    radius = size * 0.38
    n = len(scores)

    def point(v, idx, scale=1.0):
        # Axis 0 points straight up (-90 degrees); axes proceed clockwise.
        angle = (2 * math.pi * idx / n) - math.pi / 2
        r = radius * v * scale
        x = center + r * math.cos(angle)
        y = center + r * math.sin(angle)
        return x, y

    def as_points(coords):
        return " ".join(f"{x:.2f},{y:.2f}" for x, y in coords)

    polygon_points = as_points(point(score, i) for i, score in enumerate(scores))

    # Concentric grid rings; the outermost one is drawn slightly more opaque.
    ring_polygons = []
    for step in (0.33, 0.66, 1.0):
        ring_points = as_points(point(step, i) for i in range(n))
        opacity = 0.04 if step < 1.0 else 0.08
        ring_polygons.append(f'<polygon points="{ring_points}" fill="rgba(245,246,247,{opacity})" stroke="rgba(148,163,184,0.35)" stroke-width="1" />')

    axis_lines = "\n".join(
        f'<line x1="{center:.2f}" y1="{center:.2f}" x2="{point(1, idx)[0]:.2f}" y2="{point(1, idx)[1]:.2f}" stroke="rgba(148,163,184,0.35)" stroke-width="1" />'
        for idx in range(n)
    )
    label_spans = "\n".join(
        f'<text x="{point(1.1, idx)[0]:.2f}" y="{point(1.1, idx)[1]:.2f}" text-anchor="middle" dominant-baseline="middle" font-size="9" fill="white">{label}</text>'
        for idx, label in enumerate(labels)
    )

    svg = f"""
    <svg width="{size}" height="{size}" viewBox="0 0 {size} {size}" xmlns="http://www.w3.org/2000/svg" role="img">
    <defs>
    <radialGradient id="radarGlow" cx="50%" cy="50%" r="50%">
    <stop offset="0%" stop-color="rgba(255,210,30,0.25)" />
    <stop offset="100%" stop-color="rgba(255,210,30,0.0)" />
    </radialGradient>
    </defs>
    <rect width="{size}" height="{size}" fill="url(#radarGlow)" opacity="0.2" />
    {''.join(ring_polygons)}
    {axis_lines}
    <polygon points="{polygon_points}" fill="rgba(255,210,30,0.35)" stroke="#ffd21e" stroke-width="2" />
    {label_spans}
    </svg>
    """
    return svg
| # Level metadata for the 7-stage task framework | |
def _level_span(text):
    """Wrap display text in the span styling shared by every level title/description."""
    return (
        "<span style='color: var(--text-primary); "
        "font-family: \"Nanum Gothic\", sans-serif !important;'>"
        f"{text}</span>"
    )


# Title and description (as styled HTML) for the "ALL" view and each level.
# The title separator is a middle dot; it had been stored mis-encoded.
level_details = {
    key: {"title": _level_span(title), "description": _level_span(description)}
    for key, title, description in (
        (
            "ALL",
            "ALL · All Tasks",
            "First, observe the overall average performance across all seven tasks. This average should then be utilized as a baseline to conduct a more detailed per-level comparison.",
        ),
        (
            "L1",
            "L1 · Single Tool Call",
            "Evaluates single tool invocation capability and basic command execution accuracy.",
        ),
        (
            "L2",
            "L2 · Tool Selection",
            "Measures the ability to choose the right tool and invoke it with appropriate parameters.",
        ),
        (
            "L3",
            "L3 · Sequential Tool Reasoning",
            "Validates multi-step sequential reasoning for solving tasks.",
        ),
        (
            "L4",
            "L4 · Parallel Tool Reasoning",
            "Evaluates the ability to integrate and summarize information from multiple sources in parallel.",
        ),
        (
            "L5",
            "L5 · Error Handling & Robustness",
            "Checks awareness of unexpected failures and the strategies used to recover.",
        ),
        (
            "L6",
            "L6 · Efficient Tool Utilization",
            "Examines operational efficiency in achieving goals with minimal calls and cost.",
        ),
        (
            "L7",
            "L7 · Long-Context Memory",
            "Analyzes the ability to retain and leverage long conversational context.",
        ),
    )
}
| default_level = "ALL" | |
| sr_column_map = {level: f"{level}_SR" for level in level_ids} | |
| overall_sort_column = "Overall Success" | |
def resolve_level(level_value):
    """Return ``level_value`` when it is a known level key, else the default level.

    Empty / falsy input and keys missing from ``level_details`` both fall
    back to ``default_level``.
    """
    if level_value and level_value in level_details:
        return level_value
    return default_level
def generate_html_table(filtered_df, highlight_column):
    """Generate the styled HTML leaderboard table with per-level success rates.

    Args:
        filtered_df: DataFrame whose rows are rendered in their current
            order; the row position (starting at 1) is used as the rank.
            Reads the 'Model', 'Vendor' and 'Model Type' columns, plus
            'Overall Success' and each level's ``<level>_SR`` column when
            present.
        highlight_column: Column to emphasise ('Overall Success' or one of
            the per-level SR columns). Any other value disables highlighting.

    Returns:
        str: a ``<style>`` block followed by the table markup. Scores are
        shown with three decimals; missing or non-numeric scores show '-'.
    """
    # Local import so the block does not rely on a file-level import.
    from html import escape

    overall_column = "Overall Success"
    if highlight_column not in list(sr_column_map.values()) + [overall_column]:
        highlight_column = None
    overall_highlight = (highlight_column == overall_column)
    highlight_map = {level: (sr_column_map[level] == highlight_column) for level in level_ids}

    def class_attr(highlighted, highlight_class):
        """Return the class list for a numeric header/cell."""
        return f"numeric-cell {highlight_class}" if highlighted else "numeric-cell"

    def format_score(value):
        """Format a score with three decimals, or '-' when it is missing/unparseable."""
        if value is None or (isinstance(value, str) and value.strip() == ''):
            return '-'
        try:
            if pd.isna(value):
                return '-'
            return f'{float(value):.3f}'
        except (TypeError, ValueError):
            return '-'

    # Fragments are collected in a list and joined once at the end instead of
    # growing one string with repeated +=.
    parts = ["""
    <style>
    /* Dark theme table styling */
    .v2-table-container {
    background: var(--bg-card);
    border-radius: 16px;
    overflow: hidden;
    border: 1px solid var(--border-subtle);
    margin-top: 20px;
    }
    .v2-styled-table {
    width: 100%;
    border-collapse: collapse;
    font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
    background: var(--bg-card);
    color: #FFFFFF;
    }
    .v2-styled-table thead {
    position: sticky;
    top: 0;
    background: rgba(255, 210, 30, 0.1);
    z-index: 1;
    }
    .v2-styled-table th {
    padding: 14px 12px;
    text-align: left;
    font-weight: 600;
    color: #FFFFFF;
    border-bottom: 2px solid var(--accent-primary);
    font-size: 13px;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    }
    .v2-styled-table th.numeric-cell {
    text-align: center;
    }
    .v2-styled-table td {
    padding: 12px;
    border-bottom: 1px solid var(--border-subtle);
    color: #FFFFFF;
    transition: all 0.2s ease;
    }
    .v2-styled-table tbody tr {
    transition: all 0.3s ease;
    }
    .v2-styled-table tbody tr:hover {
    background: rgba(255, 210, 30, 0.15) !important;
    box-shadow: 0 0 20px rgba(255, 210, 30, 0.3), inset 0 0 20px rgba(255, 210, 30, 0.1);
    transform: scale(1.01);
    }
    .v2-styled-table tbody tr:nth-child(even) {
    background: var(--bg-secondary);
    }
    .model-name {
    font-weight: 500;
    color: #FFFFFF;
    transition: color 0.2s ease;
    }
    /* Keep model name color consistent on hover to emphasize row highlight */
    .v2-styled-table tr:hover .model-name {
    color: #FFFFFF;
    }
    .numeric-cell {
    font-family: 'Geist Mono', monospace;
    font-size: 13px;
    text-align: center;
    color: #FFFFFF;
    }
    .highlight-header {
    background: rgba(255, 210, 30, 0.14);
    color: #FFFFFF;
    }
    .highlight-cell {
    background: rgba(255, 210, 30, 0.08);
    color: #FFFFFF;
    font-weight: 600;
    }
    </style>
    <div class="v2-table-container">
    <table class="v2-styled-table">
    <thead>
    <tr>
    <th style="width: 80px;">Rank</th>
    <th>Model</th>
    <th>Vendor</th>
    <th style="width: 120px;">LLM Type</th>
    """]

    # Header cells: overall first, then one per level.
    overall_header_class = class_attr(overall_highlight, "highlight-header")
    parts.append(f"""
    <th class="{overall_header_class}" title="Average success rate across all levels">
    <span class="metric-header">Overall <span class="info-icon">ⓘ</span></span>
    </th>
    """)
    for level in level_ids:
        header_class = class_attr(highlight_map.get(level), "highlight-header")
        parts.append(f"""
        <th class="{header_class}" title="Average success rate for {level}">
        <span class="metric-header">{level} <span class="info-icon">ⓘ</span></span>
        </th>
        """)
    parts.append("""
    </tr>
    </thead>
    <tbody>
    """)

    # Body rows. Model / Vendor come straight from the CSV, so they are
    # HTML-escaped; the badge helpers are presumed to return ready-made
    # markup and are inserted unchanged.
    for rank, (_, row) in enumerate(filtered_df.iterrows(), start=1):
        model_text = escape(str(row['Model']))
        vendor_text = escape(str(row['Vendor']))
        parts.append(f"""
        <tr>
        <td>{get_rank_badge(rank)}</td>
        <td class="model-name">{model_text}</td>
        <td>{vendor_text}</td>
        <td>{get_type_badge(row['Model Type'])}</td>
        """)

        overall_class = class_attr(overall_highlight, "highlight-cell")
        overall_display = format_score(row.get(overall_column, ''))
        parts.append(f'<td class="{overall_class}">{overall_display}</td>')

        for level in level_ids:
            cell_class = class_attr(highlight_map.get(level), "highlight-cell")
            value_display = format_score(row.get(sr_column_map[level], ''))
            parts.append(f'<td class="{cell_class}">{value_display}</td>')
        parts.append("</tr>")

    parts.append("""
    </tbody>
    </table>
    </div>
    """)
    return "".join(parts)
def update_leaderboard_title(level_filter):
    """Build the leaderboard heading HTML for the selected level.

    Args:
        level_filter: Requested level key; unknown or empty values resolve
            to the default level via ``resolve_level``.

    Returns:
        str: heading markup containing the level's title and description.
        The fragment ends with an opened ``<div class="dataframe-container">``
        and leaves the outer container open as well -- presumably the caller
        closes them; verify where this fragment is used.
    """
    # resolve_level only returns keys present in level_details (or the
    # default key), so a direct lookup is sufficient.
    level_info = level_details[resolve_level(level_filter)]
    level_title = level_info["title"]
    level_description = level_info["description"]
    # The separator is a middle dot; it had been stored mis-encoded.
    return f"""
    <div class="domain-selector-container leaderboard-intro">
    <div class="domain-header">
    <h2 class="domain-title" >Agent Leaderboard · {level_title}</h2>
    <p class="domain-subtitle" >{level_description}</p>
    </div>
    <div class="dataframe-container">
    """
| model_type_lookup = { | |
| "OSS": "Open source", | |
| "API": "Proprietary" | |
| } | |
def apply_filters(df, level_filter, model_type_filter, sort_order, sort_by="Overall Success"):
    """Apply shared filters and sorting to the leaderboard dataframe.

    Args:
        df: Leaderboard DataFrame; it is copied, never modified.
        level_filter: Requested level key (normalised with ``resolve_level``).
        model_type_filter: "All" to keep every row, otherwise a key of
            ``model_type_lookup`` or a literal 'Model Type' value.
        sort_order: "Ascending" sorts ascending; anything else descending.
        sort_by: Column to sort by. When it is not a column of ``df`` the
            sort falls back to 'Overall Success' for the "ALL" level or the
            level's SR column otherwise.

    Returns:
        tuple: ``(filtered_df, level_key, highlight_column)`` where
        ``highlight_column`` is the level's SR column, 'Overall Success'
        for the "ALL" level when that column exists, or ``None``.
    """
    filtered_df = df.copy()
    level_key = resolve_level(level_filter)
    highlight_column = None

    if model_type_filter != "All":
        mapped_type = model_type_lookup.get(model_type_filter, model_type_filter)
        filtered_df = filtered_df[filtered_df['Model Type'] == mapped_type]

    actual_sort_column = sort_by if sort_by in filtered_df.columns else None
    if not actual_sort_column:
        if level_key == "ALL":
            actual_sort_column = overall_sort_column if overall_sort_column in filtered_df.columns else None
        else:
            actual_sort_column = sr_column_map.get(level_key)

    if level_key in sr_column_map:
        highlight_column = sr_column_map[level_key]
    elif level_key == "ALL" and overall_sort_column in filtered_df.columns:
        highlight_column = overall_sort_column

    if actual_sort_column and actual_sort_column in filtered_df.columns:
        ascending = (sort_order == "Ascending")
        sort_kwargs = {}

        # Metric columns may hold '' where data was missing, which makes a
        # plain sort compare str with float and raise TypeError. When every
        # non-blank value is numeric, sort on a numeric view of the column
        # so blanks become NaN and land last; the stored values are untouched.
        column = filtered_df[actual_sort_column]
        numeric_view = pd.to_numeric(column, errors='coerce')
        is_blank = column.isna() | column.astype(str).str.strip().eq('')
        if (numeric_view.notna() | is_blank).all():
            sort_kwargs['key'] = lambda values: pd.to_numeric(values, errors='coerce')

        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=ascending,
            na_position='last',
            **sort_kwargs,
        )

    return filtered_df, level_key, highlight_column
def filter_and_sort_data(level_filter, model_type_filter, sort_by, sort_order):
    """Reload the leaderboard data, filter and sort it, and return the table HTML."""
    rows, _resolved_level, column_to_highlight = apply_filters(
        load_leaderboard_data(),
        level_filter,
        model_type_filter,
        sort_order,
        sort_by,
    )
    # Render the filtered rows as the styled HTML table.
    return generate_html_table(rows, column_to_highlight)
| # Load initial data | |
| initial_table = filter_and_sort_data(default_level, "All", "Overall Success", "Descending") | |
| initial_df = load_leaderboard_data() # Load raw data for model selector | |
| if not initial_df.empty: | |
| overall_success_numeric = pd.to_numeric(initial_df.get('Overall Success'), errors='coerce') | |
| if overall_success_numeric.notna().any(): | |
| initial_df = initial_df.assign(**{'Overall Success': overall_success_numeric}).sort_values( | |
| 'Overall Success', ascending=False, na_position='last' | |
| ) | |
| else: | |
| initial_df = initial_df.sort_values('Model') | |
| initial_selected_models = initial_df['Model'].tolist()[:5] if len(initial_df) > 0 else [] | |
| initial_heatmap_models = initial_df['Model'].tolist()[:12] if len(initial_df) > 0 else [] | |
| initial_heatmap = create_performance_heatmap(initial_df, initial_heatmap_models) | |
| initial_level_metric_level = level_ids[0] if level_ids else None | |
| initial_level_model_choices = initial_df['Model'].tolist() if len(initial_df) > 0 else [] | |
| initial_level_model_values = initial_level_model_choices[:5] | |
| initial_level_metric_chart = create_level_metric_chart( | |
| initial_df, | |
| initial_level_metric_level, | |
| initial_level_model_values | |
| ) if initial_level_metric_level else create_empty_level_metric_chart("No level metrics available") | |
| # Load custom CSS and responsive styles | |
| custom_css = get_leaderboard_css() + get_responsive_styles() + """ | |
| <style> | |
| /* Page-specific styles for leaderboard v2 */ | |
| /* Metric header styles with info icons */ | |
| .metric-header { | |
| cursor: help; | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 6px; | |
| } | |
| .info-icon { | |
| color: var(--accent-secondary); | |
| font-size: 1em; | |
| opacity: 0.8; | |
| transition: opacity 0.2s ease; | |
| font-weight: normal; | |
| } | |
| .metric-header:hover .info-icon { | |
| opacity: 1; | |
| } | |
| /* Native tooltip styling */ | |
| .v2-styled-table th[title] { | |
| cursor: help; | |
| } | |
| /* Custom tooltip using CSS only */ | |
| [data-tooltip] { | |
| position: relative; | |
| cursor: help; | |
| } | |
| [data-tooltip]::before { | |
| content: attr(data-tooltip); | |
| position: absolute; | |
| bottom: 100%; | |
| left: 50%; | |
| transform: translateX(-50%); | |
| background: rgba(26, 26, 46, 0.95); | |
| color: #f5f6f7; | |
| padding: 8px 12px; | |
| border-radius: 6px; | |
| font-size: 12px; | |
| white-space: nowrap; | |
| max-width: 300px; | |
| z-index: 10000; | |
| opacity: 0; | |
| pointer-events: none; | |
| transition: opacity 0.3s; | |
| margin-bottom: 5px; | |
| border: 1px solid rgba(16, 152, 247, 0.3); | |
| box-shadow: 0 4px 12px rgba(0, 0, 0, 0.8); | |
| } | |
| [data-tooltip]:hover::before { | |
| opacity: 0.8; | |
| } | |
| /* Dark theme table styling */ | |
| .v2-table-container { | |
| background: var(--bg-card); | |
| border-radius: 16px; | |
| overflow: visible; /* Changed from hidden to visible for tooltips */ | |
| border: 1px solid var(--border-subtle); | |
| margin-top: 20px; | |
| position: relative; | |
| } | |
| .v2-styled-table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif; | |
| background: var(--bg-card); | |
| color: #FFFFFF; | |
| } | |
| .v2-styled-table thead { | |
| position: sticky; | |
| top: 0; | |
| background: rgba(255, 210, 30, 0.1); | |
| z-index: 1; | |
| } | |
| .v2-styled-table th { | |
| padding: 14px 12px; | |
| text-align: left; | |
| font-weight: 600; | |
| color: #FFFFFF; | |
| border-bottom: 2px solid var(--accent-primary); | |
| font-size: 14px; | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| position: relative; /* Added for tooltip positioning */ | |
| } | |
| .v2-styled-table td { | |
| padding: 12px; | |
| border-bottom: 1px solid var(--border-subtle); | |
| color: #FFFFFF; | |
| font-size: 14px; | |
| transition: all 0.2s ease; | |
| } | |
| .v2-styled-table tbody tr { | |
| transition: all 0.3s ease; | |
| } | |
| .v2-styled-table tbody tr:hover { | |
| background: rgba(255, 210, 30, 0.15) !important; | |
| box-shadow: 0 0 20px rgba(255, 210, 30, 0.3), inset 0 0 20px rgba(255, 210, 30, 0.1); | |
| transform: scale(1.01); | |
| } | |
| .v2-styled-table tbody tr:nth-child(even) { | |
| background: var(--bg-secondary); | |
| } | |
| .model-name { | |
| font-weight: 500; | |
| color: var(--accent-primary); | |
| font-size: 14px; | |
| transition: color 0.2s ease; | |
| } | |
| .v2-styled-table tr:hover .model-name { | |
| color: var(--accent-secondary); | |
| } | |
| .numeric-cell { | |
| font-family: 'Geist Mono', monospace; | |
| font-size: 14px; | |
| text-align: center; | |
| } | |
| </style> | |
| <script> | |
| // Function to update radio button styling | |
| function updateRadioStyling() { | |
| // Remove selected class from all labels first | |
| document.querySelectorAll('.selected').forEach(function(label) { | |
| label.classList.remove('selected'); | |
| }); | |
| document.querySelectorAll('.domain-radio label').forEach(function(label) { | |
| label.style.background = ''; | |
| label.style.borderColor = ''; | |
| label.style.transform = ''; | |
| label.style.fontWeight = ''; | |
| }); | |
| // Apply selected class to checked radio buttons | |
| document.querySelectorAll('input[type="radio"]:checked').forEach(function(input) { | |
| var label = input.closest('label'); | |
| if (label) { | |
| label.classList.add('selected'); | |
| // For domain radio buttons, apply special styling | |
| if (label.closest('.domain-radio')) { | |
| label.style.background = '#ffd21e33'; | |
| label.style.borderColor = 'var(--accent-primary)'; | |
| label.style.transform = 'scale(1.05)'; | |
| label.style.fontWeight = '600'; | |
| } | |
| } | |
| }); | |
| } | |
| // Wait for Gradio to initialize | |
| function initializeRadioStyles() { | |
| updateRadioStyling(); | |
| // Create observer to watch for changes | |
| var observer = new MutationObserver(function(mutations) { | |
| mutations.forEach(function(mutation) { | |
| if (mutation.type === 'attributes' && mutation.attributeName === 'checked') { | |
| updateRadioStyling(); | |
| } | |
| }); | |
| }); | |
| // Observe all radio inputs | |
| document.querySelectorAll('input[type="radio"]').forEach(function(radio) { | |
| observer.observe(radio, { attributes: true }); | |
| }); | |
| } | |
| // Try multiple initialization strategies | |
| document.addEventListener('DOMContentLoaded', function() { | |
| setTimeout(initializeRadioStyles, 100); | |
| setTimeout(initializeRadioStyles, 500); | |
| setTimeout(initializeRadioStyles, 1000); | |
| }); | |
| // Also check when window loads | |
| window.addEventListener('load', function() { | |
| setTimeout(initializeRadioStyles, 100); | |
| }); | |
| // Listen for Gradio's custom events | |
| document.addEventListener('gradio:loaded', initializeRadioStyles); | |
| </script> | |
| """ | |
| gr.HTML(custom_css) | |
| # Header styles and navigation | |
| gr.HTML(""" | |
| <style> | |
| @import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap'); | |
| /* Enhanced button styling with better gradio compatibility */ | |
| .header-action-button { | |
| display: inline-block !important; | |
| padding: 14px 28px !important; | |
| background: #ffd21e !important; | |
| color: #FFFFFF !important; | |
| text-decoration: none !important; | |
| border-radius: 16px !important; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
| font-weight: 700 !important; | |
| font-size: 1.1rem !important; | |
| transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; | |
| border: none !important; | |
| cursor: pointer !important; | |
| box-shadow: 0 8px 24px rgba(255, 210, 30, 0.4), 0 4px 12px rgba(0, 0, 0, 0.3) !important; | |
| position: relative !important; | |
| overflow: hidden !important; | |
| text-shadow: 0 1px 2px rgba(0, 0, 0, 0.35) !important; | |
| } | |
| .header-action-button::before { | |
| content: ''; | |
| position: absolute; | |
| top: 0; | |
| left: -100%; | |
| width: 100%; | |
| height: 100%; | |
| background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent); | |
| transition: left 0.6s; | |
| } | |
| .header-action-button:hover::before { | |
| left: 100%; | |
| } | |
| .header-action-button:hover { | |
| transform: translateY(-3px) !important; | |
| box-shadow: 0 12px 32px rgba(255, 210, 30, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important; | |
| background: #ffd21e !important; | |
| color: #FFFFFF !important; | |
| text-decoration: none !important; | |
| text-shadow: 0 2px 6px rgba(0, 0, 0, 0.45) !important; | |
| } | |
| .header-action-button:active { | |
| transform: translateY(-1px) !important; | |
| } | |
| .action-button-icon { | |
| font-size: 1.2rem !important; | |
| margin-right: 8px !important; | |
| filter: drop-shadow(0 0 8px rgba(255, 255, 255, 0.3)); | |
| } | |
| .hero-banner-wrapper { | |
| position: relative; | |
| width: 100vw; | |
| margin: 0 calc(-50vw + 50%) 20px calc(-50vw + 50%); | |
| border-radius: 0 !important; | |
| overflow: hidden !important; | |
| box-shadow: 0 12px 32px rgba(0, 0, 0, 0.25) !important; | |
| } | |
| .hero-banner-wrapper::before { | |
| content: ""; | |
| position: absolute; | |
| inset: 0; | |
| background: #01091A; | |
| z-index: 0; | |
| } | |
| #hero-banner { | |
| position: relative; | |
| width: 100% !important; | |
| height: auto !important; | |
| z-index: 1; | |
| } | |
| #hero-banner img { | |
| width: 100% !important; | |
| height: auto !important; | |
| display: block !important; | |
| object-fit: cover !important; | |
| } | |
| .hero-title { | |
| font-size: 10rem; | |
| font-weight: 800; | |
| line-height: 1.1; | |
| background: linear-gradient(135deg, #FFE082 0%, #FFC107 50%, #FFB300 100%); | |
| -webkit-background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| margin-bottom: 1rem; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| .hero-subtitle { | |
| color: var(--text-secondary); | |
| font-size: 3rem; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| margin-top: 0; | |
| } | |
| .hero-actions { | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| gap: 16px; | |
| flex-wrap: wrap; | |
| margin: 32px 0; | |
| padding: 0 20px; | |
| } | |
| .hero-action-button { | |
| display: inline-flex !important; | |
| align-items: center !important; | |
| gap: 12px !important; | |
| padding: 10px 16px !important; | |
| background: rgba(245, 246, 247, 0.06) !important; | |
| border: 1px solid var(--border-subtle) !important; | |
| border-radius: 999px !important; | |
| color: #FFFFFF !important; | |
| text-decoration: none !important; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
| font-weight: 600 !important; | |
| font-size: 0.95rem !important; | |
| transition: all 0.3s ease !important; | |
| backdrop-filter: blur(10px) !important; | |
| -webkit-backdrop-filter: blur(10px) !important; | |
| } | |
| .hero-action-button:hover { | |
| transform: translateY(-2px) !important; | |
| border-color: var(--accent-primary) !important; | |
| background: rgba(255, 210, 30, 0.12) !important; | |
| text-decoration: none !important; | |
| } | |
| .hero-action-button svg { | |
| width: 20px; | |
| height: 20px; | |
| } | |
| .hero-action-button span { | |
| font-weight: 600; | |
| letter-spacing: 0.01em; | |
| } | |
| .dashboard-section { | |
| margin: 48px auto 0 auto; | |
| max-width: 1100px; | |
| padding: 40px; | |
| background: rgba(245, 246, 247, 0.06); | |
| border: 1px solid var(--border-subtle); | |
| border-radius: 24px; | |
| box-shadow: 0 12px 30px rgba(0, 0, 0, 0.25); | |
| backdrop-filter: blur(12px); | |
| -webkit-backdrop-filter: blur(12px); | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| .dashboard-section.emphasized { | |
| background: #ffd21e26; | |
| border-color: rgba(255, 210, 30, 0.6); | |
| box-shadow: 0 24px 50px rgba(255, 210, 30, 0.25); | |
| } | |
| .dashboard-section .section-header { | |
| display: flex; | |
| flex-direction: column; | |
| align-items: center; | |
| justify-content: center; | |
| text-align: center; | |
| gap: 12px; | |
| margin-bottom: 32px; | |
| } | |
| .section-title { | |
| font-size: 3.75rem; | |
| font-weight: 700; | |
| color: #FFFFFF; | |
| margin-bottom: 12px; | |
| text-align: center !important; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| .section-lead, .section-subtitle { | |
| font-size: 1.32rem !important; | |
| color: var(--text-secondary); | |
| max-width: 720px; | |
| margin: 0 auto 24px auto; | |
| line-height: 1.7; | |
| text-align: center !important; | |
| word-break: keep-all; | |
| white-space: normal; | |
| display: block; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| .phase-grid { | |
| display: grid; | |
| grid-template-columns: repeat(2, minmax(0, 1fr)); | |
| gap: 24px; | |
| } | |
| .phase-card { | |
| padding: 28px; | |
| border-radius: 20px; | |
| border: 1px solid var(--border-subtle); | |
| background: rgba(1, 9, 26, 0.65); | |
| box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.03); | |
| } | |
| .phase-card h3 { | |
| font-size: 1.44rem !important; | |
| color: #FFFFFF; | |
| margin-bottom: 20px; | |
| font-weight: 700; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| .phase-chart { | |
| width: 120px; | |
| height: 120px; | |
| border-radius: 50%; | |
| background: conic-gradient(var(--accent-primary) var(--progress), rgba(255, 210, 30, 0.15) var(--progress)); | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| margin-bottom: 24px; | |
| margin-left: auto; | |
| margin-right: auto; | |
| position: relative; | |
| } | |
| .phase-chart::after { | |
| content: ''; | |
| position: absolute; | |
| width: 80px; | |
| height: 80px; | |
| border-radius: 50%; | |
| background: rgba(1, 9, 26, 0.95); | |
| } | |
| .phase-chart span { | |
| position: relative; | |
| font-size: 1.2rem !important; | |
| font-weight: 700; | |
| color: #FFFFFF !important; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| /* Additional specific selectors */ | |
| .phase-card .phase-chart span { | |
| color: #FFFFFF !important; | |
| text-shadow: 0 1px 2px rgba(0, 0, 0, 0.8) !important; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| .phase-grid .phase-chart span { | |
| color: #FFFFFF !important; | |
| z-index: 10 !important; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| .phase-list { | |
| list-style: none; | |
| padding: 0; | |
| margin: 0; | |
| display: grid; | |
| gap: 12px; | |
| } | |
| .phase-list li { | |
| padding: 12px 16px; | |
| border-radius: 12px; | |
| background: rgba(245, 246, 247, 0.05); | |
| border: 1px solid rgba(245, 246, 247, 0.08); | |
| color: var(--text-secondary); | |
| font-size: 1.08rem !important; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| .scenario-body { | |
| max-width: 760px; | |
| margin: 0 auto; | |
| text-align: center; | |
| } | |
| .scenario-body p { | |
| font-size: 1.05rem; | |
| line-height: 1.7; | |
| color: var(--text-secondary); | |
| margin-bottom: 32px; | |
| } | |
| .section-flow { | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| margin-top: 24px; | |
| color: var(--accent-primary); | |
| font-size: 2rem; | |
| } | |
| .criteria-grid { | |
| display: grid; | |
| grid-template-columns: repeat(3, minmax(0, 1fr)); | |
| gap: 24px; | |
| } | |
| .criteria-card { | |
| padding: 24px; | |
| border-radius: 20px; | |
| border: 1px solid var(--border-subtle); | |
| background: rgba(1, 9, 26, 0.7); | |
| display: flex; | |
| flex-direction: column; | |
| gap: 16px; | |
| box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.03); | |
| } | |
| .criteria-card h3 { | |
| font-size: 1.25rem; | |
| font-weight: 700; | |
| color: #FFFFFF; | |
| margin: 0; | |
| } | |
| .criteria-card ul { | |
| list-style: disc; | |
| margin: 0; | |
| padding-left: 20px; | |
| color: var(--text-secondary); | |
| display: grid; | |
| gap: 10px; | |
| font-size: 0.95rem; | |
| line-height: 1.5; | |
| } | |
| /* Responsive design */ | |
| @media (max-width: 768px) { | |
| .hero-title { | |
| font-size: 10rem; | |
| } | |
| .hero-action-button { | |
| width: 100% !important; | |
| justify-content: center !important; | |
| } | |
| .dashboard-section { | |
| padding: 28px; | |
| margin: 32px 16px 0 16px; | |
| } | |
| .phase-grid { | |
| grid-template-columns: 1fr; | |
| } | |
| .criteria-grid { | |
| grid-template-columns: 1fr; | |
| } | |
| } | |
| @media (max-width: 480px) { | |
| .hero-actions { | |
| flex-direction: column; | |
| gap: 8px; | |
| } | |
| .section-title { | |
| font-size: 2.7rem; | |
| } | |
| .phase-chart { | |
| width: 100px; | |
| height: 100px; | |
| } | |
| .phase-chart::after { | |
| width: 68px; | |
| height: 68px; | |
| } | |
| } | |
| </style> | |
| """) | |
# Hero banner image.
# Gradio renders every component into its own DOM subtree, so a <div> opened in one
# gr.HTML call and closed in another cannot enclose the gr.Image created between
# them: the browser closes the opening tag inside the first component and drops the
# stray closing tag in the last one. A layout container carrying the wrapper class
# makes the image an actual child of .hero-banner-wrapper.
with gr.Column(elem_classes=["hero-banner-wrapper"]):
    gr.Image(
        value="banner_wide.png",
        show_label=False,
        interactive=False,
        type="filepath",
        elem_id="hero-banner",
    )
# Hero heading: the benchmark name (.hero-title) with a one-line tagline
# (.hero-subtitle) underneath, centred by the inline style on the wrapper.
hero_heading_markup = """
<div style="text-align: center; padding: 20px 0;">
<h1 class="hero-title">Hugging Face KREW Ko-AgentBench</h1>
<p class="hero-subtitle">Agent benchmark optimized for real Korean usage.</p>
</div>
"""
gr.HTML(value=hero_heading_markup)
# Link row shown under the title: Blog, GitHub, Dataset and Metrics.
# Every anchor uses the .hero-action-button pill style, carries an inline stroke
# icon plus a text label, and opens in a new tab with rel="noopener noreferrer".
hero_links_markup = """
<div class="hero-actions">
<a href="https://hugging-face-krew.github.io/" target="_blank" rel="noopener noreferrer" class="hero-action-button">
<svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
<line x1="8" y1="12" x2="16" y2="12"/>
</svg>
<span>Blog</span>
</a>
<a href="https://github.com/Hugging-Face-KREW/Ko-AgentBench" target="_blank" rel="noopener noreferrer" class="hero-action-button">
<svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M9 19c-5 1.5-5-2.5-7-3"/>
<path d="M20 21v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
</svg>
<span>GitHub</span>
</a>
<a href="https://huggingface.co/datasets/huggingface-KREW/Ko-AgentBench" target="_blank" rel="noopener noreferrer" class="hero-action-button">
<svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="7 10 12 15 17 10"/>
<line x1="12" y1="15" x2="12" y2="3"/>
</svg>
<span>Dataset</span>
</a>
<a href="https://github.com/Hugging-Face-KREW/Ko-AgentBench/blob/main/README_en.md#-metrics" target="_blank" rel="noopener noreferrer" class="hero-action-button">
<svg viewBox="0 0 24 24" fill="none" stroke="white" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M3 3v18h18"/>
<path d="M7 17v-6"/>
<path d="M12 17V7"/>
<path d="M17 17v-3"/>
</svg>
<span>Metrics</span>
</a>
</div>
"""
gr.HTML(value=hero_links_markup)
# Section 1: Task Design by Stage
# Two cards summarise the seven task levels: single-turn (L1-L5) and multi-turn
# (L6-L7). The ring in each card is drawn by the .phase-chart CSS from the inline
# --progress custom property; the percentage inside it is plain <span> text.
# NOTE(review): the 80% / 20% figures are hard-coded here and are not derived from
# the evaluation CSV; presumably they are the task share per group, confirm.
# The lead sentence uses a real em dash (it had been mis-encoded), and the ampersand
# in the L5 label is escaped as &amp; so the markup is valid HTML.
gr.HTML("""
<div class="dashboard-section">
<div class="section-header">
<h2 class="section-title" style="font-family: 'Nanum Gothic', sans-serif; font-size: 2.5rem;">7-Level Task Design</h2>
</div>
<p class="section-lead" style="text-align: center; margin: 0 auto 24px auto; max-width: 720px; line-height: 1.7; word-break: keep-all;">We analyzed agent capabilities across seven stages—from simple tool calls to long-context retention and robustness.</p>
<div class="phase-grid">
<div class="phase-card">
<h3>Single Turn</h3>
<div class="phase-chart" style="--progress:80%;">
<span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">80%</span>
</div>
<ul class="phase-list">
<li style="color: #FFFFFF;">L1: Single Tool Call</li>
<li style="color: #FFFFFF;">L2: Tool Selection</li>
<li style="color: #FFFFFF;">L3: Sequential Tool Reasoning</li>
<li style="color: #FFFFFF;">L4: Parallel Tool Reasoning</li>
<li style="color: #FFFFFF;">L5: Error Handling &amp; Robustness</li>
</ul>
</div>
<div class="phase-card">
<h3>Multi Turn</h3>
<div class="phase-chart" style="--progress:20%;">
<span style="color: #FFFFFF !important; text-shadow: 0 1px 2px rgba(0,0,0,0.8) !important; font-weight: 700 !important;">20%</span>
</div>
<ul class="phase-list">
<li style="color: #FFFFFF;">L6: Efficient Tool Utilization</li>
<li style="color: #FFFFFF;">L7: Long-Context Memory</li>
</ul>
</div>
</div>
</div>
""")
# Section 2: Core Scenario Design
# Highlighted ("emphasized") panel describing the Korea-specific API scenarios,
# followed by a .section-flow connector glyph leading into the next section.
# The two parenthetical dashes in the paragraph are real em dashes (they had been
# mis-encoded).
# NOTE(review): the connector glyph was unreadable in the reviewed copy; a down
# arrow is assumed because the sections are stacked vertically. Confirm against
# the intended design.
gr.HTML("""
<div class="dashboard-section emphasized">
<div class="section-header">
<h2 class="section-title" style="font-size: 2.0rem;">High-quality scenario design tailored to 18 Korea-specific APIs and real-world use cases.</h2>
</div>
<div class="scenario-body">
<p style="color: var(--text-primary);">We built realistic scenarios—such as appointment booking and blog review search—by integrating APIs widely used in Korea including Naver Maps, Kakao services, and local websites.</p>
</div>
</div>
<div class="section-flow">↓</div>
""")
# Section 3: Key Evaluation Criteria
# Three .criteria-card panels laid out by .criteria-grid: cache-based iterative
# evaluation, robustness testing, and level-specific precision metrics. Each card
# is a heading followed by a short bullet list.
criteria_section_markup = """
<div class="dashboard-section">
<div class="section-header">
<h2 class="section-title" style="font-size: 2.0rem;">Key Evaluation Criteria</h2>
</div>
<div class="criteria-grid">
<div class="criteria-card">
<h3>Cache-based Iterative Evaluation</h3>
<ul>
<li>Improved handling of failed API responses</li>
<li>Addresses chronic benchmark issues such as mismatched response attributes</li>
<li>Ensures benchmark consistency and reliability</li>
</ul>
</div>
<div class="criteria-card">
<h3>Robustness Testing</h3>
<ul>
<li>Evaluates recognition and response strategies for intentional failure scenarios (e.g., discontinued products)</li>
<li>Surfaces models that remain stable in real-world deployments</li>
</ul>
</div>
<div class="criteria-card">
<h3>Level-specific Precision Metrics</h3>
<ul>
<li>Evaluates each phase of problem solving, including tool selection, parameter setup, and data flow</li>
<li>Quantitatively identifies model strengths and weaknesses</li>
</ul>
</div>
</div>
</div>
"""
gr.HTML(value=criteria_section_markup)
| # Metrics overview cards removed per updated design | |
| # Domain filter section with enhanced styling | |
| gr.HTML(""" | |
| <style> | |
| @import url('https://fonts.googleapis.com/css2?family=Nanum+Gothic:wght@400;700;800&family=Gowun+Dodum&family=Do+Hyeon&display=swap'); | |
| /* Enhanced domain selector styling */ | |
| .domain-selector-container { | |
| background: #ffd21e0d; | |
| border-radius: 20px; | |
| padding: 32px; | |
| margin-bottom: 32px; | |
| border: 1px solid var(--border-subtle); | |
| position: relative; | |
| overflow: visible; | |
| box-shadow: | |
| 0 8px 32px rgba(0, 0, 0, 0.3), | |
| inset 0 1px 0 rgba(255, 255, 255, 0.05); | |
| } | |
| .domain-selector-container.leaderboard-intro { | |
| padding-bottom: 0; | |
| margin-bottom: 32px; | |
| } | |
| .leaderboard-intro .domain-header { | |
| margin-bottom: 20px; | |
| } | |
| .leaderboard-intro .domain-subtitle { | |
| font-size: 1.1rem; | |
| max-width: 720px; | |
| margin: 0 auto; | |
| } | |
| .leaderboard-intro .dataframe-container { | |
| margin: 16px -32px -32px; | |
| padding: 0 32px 32px; | |
| background: transparent; | |
| border-radius: 0 0 20px 20px; | |
| } | |
| .domain-performance-container { | |
| margin-bottom: 32px; | |
| } | |
| .domain-performance-container .domain-header { | |
| margin-bottom: 24px; | |
| } | |
| .domain-performance-container .domain-subtitle { | |
| font-size: 1.05rem; | |
| max-width: 720px; | |
| margin: 0 auto; | |
| } | |
| .leaderboard-intro .domain-title, | |
| .domain-performance-container > .domain-header .domain-title, | |
| .performance-card-container > .domain-header .domain-title { | |
| font-size: 2.6rem !important; | |
| } | |
| .performance-card-content { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 24px; | |
| align-items: stretch; | |
| } | |
| .performance-card-display { | |
| flex: 1; | |
| min-width: 0; | |
| } | |
| .performance-card-container { | |
| margin-top: 32px; | |
| } | |
| .performance-card-container .domain-header { | |
| margin-bottom: 24px; | |
| } | |
| .performance-card-container .domain-subtitle { | |
| font-size: 1.05rem; | |
| max-width: 720px; | |
| margin: 0 auto; | |
| } | |
| .domain-header { | |
| text-align: center; | |
| margin-bottom: 28px; | |
| position: relative; | |
| z-index: 1; | |
| } | |
| .domain-title { | |
| font-size: 2rem; | |
| font-weight: 800; | |
| position: relative; | |
| display: inline-block; | |
| margin-bottom: 8px; | |
| padding: 4px 0; | |
| color: transparent !important; | |
| background: linear-gradient(120deg, rgba(255, 255, 255, 0.75) 0%, rgba(255, 210, 30, 0.95) 25%, rgba(255, 139, 0, 0.85) 50%, rgba(255, 210, 30, 0.95) 75%, rgba(255, 255, 255, 0.8) 100%); | |
| background-size: 220% 100%; | |
| -webkit-background-clip: text; | |
| background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| text-shadow: 0 0 3px rgba(255, 210, 30, 0.08), 0 0 8px rgba(255, 210, 30, 0.05); | |
| filter: drop-shadow(0 0 2px rgba(255, 210, 30, 0.06)); | |
| letter-spacing: 0.02em; | |
| animation: title-shimmer 1.25s ease-in-out infinite; | |
| font-family: 'Nanum Gothic', sans-serif !important; | |
| } | |
| @keyframes title-shimmer { | |
| 0% { | |
| background-position: 0% 50%; | |
| } | |
| 50% { | |
| background-position: 100% 50%; | |
| } | |
| 100% { | |
| background-position: 0% 50%; | |
| } | |
| } | |
| .domain-subtitle { | |
| color: var(--text-secondary); | |
| font-size: 1.2rem; | |
| font-family: 'Geist', sans-serif; | |
| } | |
| /* Custom radio button styling */ | |
| .domain-radio { | |
| display: flex !important; | |
| gap: 12px !important; | |
| flex-wrap: wrap !important; | |
| justify-content: center !important; | |
| position: relative; | |
| z-index: 1; | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| padding: 6px 0 !important; | |
| } | |
| /* Gradio radio button wrapper */ | |
| .domain-radio .wrap { | |
| display: flex !important; | |
| gap: 12px !important; | |
| flex-wrap: wrap !important; | |
| justify-content: center !important; | |
| width: 100% !important; | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| padding: 6px 0 !important; | |
| } | |
| .domain-selector-container [role="radiogroup"], | |
| .domain-selector-container fieldset, | |
| .domain-selector-container .gradio-radio, | |
| .domain-selector-container .gradio-radio-group, | |
| .domain-selector-container .gr-form, | |
| .domain-selector-container .gradio-radio-group > div { | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| padding: 0 !important; | |
| } | |
| #filters-sorting-container { | |
| padding: 28px !important; | |
| } | |
| #filters-sorting-container .gr-box, | |
| #filters-sorting-container .gradio-column, | |
| #filters-sorting-container .gradio-row, | |
| #filters-sorting-container .gradio-group, | |
| #filters-sorting-container .gradio-radio, | |
| #filters-sorting-container [role="radiogroup"], | |
| #filters-sorting-container fieldset { | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| } | |
| .filters-sorting-row { | |
| gap: 18px !important; | |
| justify-content: center !important; | |
| flex-wrap: nowrap !important; | |
| } | |
| .filter-group { | |
| flex: 1 1 260px !important; | |
| display: flex !important; | |
| flex-direction: column !important; | |
| gap: 12px !important; | |
| align-items: flex-start !important; | |
| width: 100% !important; | |
| } | |
| .filter-group-row { | |
| display: flex !important; | |
| align-items: center !important; | |
| gap: 4px !important; | |
| justify-content: flex-start !important; | |
| flex-wrap: nowrap !important; | |
| width: 100% !important; | |
| } | |
| .filter-group-row > * { | |
| margin: 0 !important; | |
| padding: 0 !important; | |
| flex: 0 0 auto !important; | |
| width: auto !important; | |
| min-width: auto !important; | |
| } | |
| .filter-group-row > .gr-column, | |
| .filter-group-row > .gr-box { | |
| flex: 0 0 auto !important; | |
| width: auto !important; | |
| min-width: auto !important; | |
| } | |
| .filter-group-row .gradio-html, | |
| .filter-group-row .gradio-html > * { | |
| display: inline-flex !important; | |
| align-items: center !important; | |
| margin: 0 !important; | |
| padding: 0 !important; | |
| flex: 0 0 auto !important; | |
| width: auto !important; | |
| min-width: auto !important; | |
| } | |
| .filter-group-row .domain-radio { | |
| flex: 1 1 auto !important; | |
| width: 100% !important; | |
| margin: 0 !important; | |
| display: inline-flex !important; | |
| align-items: center !important; | |
| min-width: 220px !important; | |
| justify-content: flex-start !important; | |
| padding-right: 8px !important; | |
| } | |
| .filter-group .gr-input-label { | |
| font-size: 1rem !important; | |
| font-weight: 600 !important; | |
| color: #FFFFFF !important; | |
| text-align: center !important; | |
| margin-bottom: 12px !important; | |
| } | |
| .filter-group-label { | |
| font-size: 1rem !important; | |
| font-weight: 600 !important; | |
| color: #FFFFFF !important; | |
| text-align: left !important; | |
| margin: 0 !important; | |
| font-family: 'Geist', sans-serif !important; | |
| white-space: nowrap !important; | |
| } | |
| #filters-sorting-container .domain-radio, | |
| #filters-sorting-container .domain-radio .wrap { | |
| flex-wrap: nowrap !important; | |
| justify-content: flex-start !important; | |
| align-items: center !important; | |
| gap: 12px !important; | |
| width: 100% !important; | |
| } | |
| .domain-radio label, | |
| .domain-radio .wrap > label { | |
| display: inline-flex !important; | |
| align-items: center !important; | |
| justify-content: center !important; | |
| gap: 0 !important; | |
| padding: 12px 28px !important; | |
| background: rgba(245, 246, 247, 0.06) !important; | |
| border: 1px solid var(--border-subtle) !important; | |
| border-radius: 999px !important; | |
| cursor: pointer !important; | |
| transition: all 0.3s ease !important; | |
| text-align: center !important; | |
| position: relative !important; | |
| overflow: hidden !important; | |
| color: #FFFFFF !important; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
| font-weight: 600 !important; | |
| font-size: 0.95rem !important; | |
| letter-spacing: 0.01em !important; | |
| text-align: center !important; | |
| line-height: 1 !important; | |
| backdrop-filter: blur(10px) !important; | |
| -webkit-backdrop-filter: blur(10px) !important; | |
| min-width: 0 !important; | |
| flex: 0 0 auto !important; | |
| box-shadow: 0 0 18px -6px rgba(0, 0, 0, 0.45) !important; | |
| z-index: 0 !important; | |
| } | |
| .domain-radio label::before { | |
| content: ''; | |
| position: absolute; | |
| inset: 0; | |
| background: #ffd21e14; | |
| opacity: 0; | |
| transition: opacity 0.3s ease; | |
| pointer-events: none; | |
| z-index: -1; | |
| } | |
| .domain-radio label:hover { | |
| border-color: var(--accent-primary) !important; | |
| background: rgba(255, 210, 30, 0.12) !important; | |
| box-shadow: 0 0 18px 0 rgba(255, 210, 30, 0.45) !important; | |
| border-color: rgba(255, 210, 30, 0.45) !important; | |
| } | |
| .domain-radio label:hover::before { | |
| opacity: 1; | |
| } | |
| .domain-radio input[type="radio"] { | |
| display: none !important; | |
| } | |
| .domain-radio input[type="radio"]:checked + label, | |
| .domain-radio .wrap > label:has(input[type="radio"]:checked), | |
| .domain-radio label.selected, | |
| .domain-radio label[aria-checked="true"] { | |
| background: #ffd21e33 !important; | |
| border-color: var(--accent-primary) !important; | |
| color: var(--accent-tertiary) !important; | |
| box-shadow: 0 0 24px 0 rgba(255, 210, 30, 0.55) !important; | |
| } | |
| .domain-radio input[type="radio"]:checked + label::before, | |
| .domain-radio label[aria-checked="true"]::before { | |
| opacity: 1; | |
| } | |
| /* Model selector styling */ | |
| .model-selector-container { | |
| padding: 28px; | |
| } | |
| .model-selector-container .domain-header { | |
| margin-bottom: 18px; | |
| } | |
| .model-selector-container .domain-title { | |
| font-size: 1.8rem; | |
| } | |
| .model-selector-container .domain-subtitle { | |
| font-size: 1rem; | |
| } | |
| .model-selector-container .model-dropdown, | |
| .model-selector-container .pill-button { | |
| display: flex !important; | |
| justify-content: center !important; | |
| align-items: center !important; | |
| width: 100% !important; | |
| } | |
| .model-selector-container .model-dropdown { | |
| margin-bottom: 12px !important; | |
| } | |
| .model-dropdown { | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| padding: 0 !important; | |
| margin: 0 !important; | |
| } | |
| .model-dropdown .gradio-dropdown, | |
| .model-dropdown .gradio-dropdown > div, | |
| .model-dropdown .gr-form, | |
| .model-dropdown .gradio-input, | |
| .model-dropdown .gradio-button-group, | |
| .model-dropdown .wrap { | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| padding: 0 !important; | |
| margin: 0 !important; | |
| width: 100% !important; | |
| } | |
| .model-dropdown select, | |
| .model-dropdown [role="combobox"] { | |
| background: #000000 !important; | |
| border: 1px solid #333333 !important; | |
| border-radius: 999px !important; | |
| padding: 12px 24px !important; | |
| color: #FFFFFF !important; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
| font-weight: 600 !important; | |
| font-size: 1rem !important; | |
| letter-spacing: 0.01em !important; | |
| min-height: 46px !important; | |
| min-width: 240px !important; | |
| width: 100% !important; | |
| cursor: pointer !important; | |
| transition: all 0.3s ease !important; | |
| text-align: center !important; | |
| text-align-last: center !important; | |
| } | |
| .model-dropdown select:focus, | |
| .model-dropdown [role="combobox"]:focus-visible { | |
| outline: none !important; | |
| border-color: var(--accent-primary) !important; | |
| box-shadow: 0 0 0 3px rgba(255, 210, 30, 0.25) !important; | |
| } | |
| .model-dropdown button { | |
| display: inline-flex !important; | |
| align-items: center !important; | |
| justify-content: center !important; | |
| gap: 8px !important; | |
| width: 100% !important; | |
| padding: 12px 24px !important; | |
| background: #000000 !important; | |
| border: 1px solid #333333 !important; | |
| border-radius: 999px !important; | |
| color: #FFFFFF !important; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
| font-weight: 600 !important; | |
| font-size: 0.95rem !important; | |
| letter-spacing: 0.01em !important; | |
| cursor: pointer !important; | |
| transition: all 0.3s ease !important; | |
| } | |
| .model-dropdown button:hover:not(:disabled), | |
| .model-dropdown [role="combobox"]:hover { | |
| transform: translateY(-1px) !important; | |
| border-color: var(--accent-primary) !important; | |
| box-shadow: 0 12px 28px rgba(255, 210, 30, 0.25) !important; | |
| background: rgba(255, 210, 30, 0.12) !important; | |
| } | |
| .model-dropdown .tags { | |
| display: flex !important; | |
| flex-wrap: wrap !important; | |
| gap: 6px !important; | |
| justify-content: center !important; | |
| } | |
| .model-dropdown .tag { | |
| background: rgba(255, 210, 30, 0.18) !important; | |
| border: 1px solid rgba(255, 210, 30, 0.35) !important; | |
| color: #FFFFFF !important; | |
| border-radius: 999px !important; | |
| padding: 4px 10px !important; | |
| font-size: 0.85rem !important; | |
| font-weight: 500 !important; | |
| } | |
| .model-dropdown label { | |
| display: flex !important; | |
| flex-direction: column !important; | |
| gap: 8px !important; | |
| width: 100% !important; | |
| align-items: center !important; | |
| } | |
| .model-dropdown label > span { | |
| display: none !important; | |
| } | |
| .pill-button button { | |
| display: inline-flex !important; | |
| align-items: center !important; | |
| justify-content: center !important; | |
| gap: 8px !important; | |
| padding: 12px 28px !important; | |
| background: #ffd21e !important; | |
| border: 1px solid rgba(255, 210, 30, 0.6) !important; | |
| border-radius: 999px !important; | |
| color: #FFFFFF !important; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
| font-weight: 600 !important; | |
| font-size: 0.95rem !important; | |
| letter-spacing: 0.01em !important; | |
| text-decoration: none !important; | |
| cursor: pointer !important; | |
| transition: all 0.3s ease !important; | |
| box-shadow: 0 12px 28px rgba(255, 210, 30, 0.25) !important; | |
| border-bottom: none !important; | |
| text-shadow: 0 1px 2px rgba(0, 0, 0, 0.35) !important; | |
| } | |
| .pill-button button:hover:not(:disabled) { | |
| transform: translateY(-2px) !important; | |
| background: #ffd21e !important; | |
| box-shadow: 0 16px 36px rgba(255, 210, 30, 0.35) !important; | |
| text-shadow: 0 2px 6px rgba(0, 0, 0, 0.45) !important; | |
| } | |
| .pill-button button:disabled { | |
| opacity: 0.6 !important; | |
| cursor: not-allowed !important; | |
| box-shadow: none !important; | |
| } | |
| .level-selector-container { | |
| padding: 28px; | |
| } | |
| .level-selector-container .domain-header { | |
| margin-bottom: 18px; | |
| } | |
| .level-selector-container .domain-title { | |
| font-size: 1.8rem; | |
| } | |
| .level-selector-container .domain-subtitle { | |
| font-size: 1rem; | |
| } | |
| /* Domain icons */ | |
| .domain-icon { | |
| font-size: 1.5rem; | |
| margin-bottom: 4px; | |
| display: block; | |
| filter: drop-shadow(0 0 10px white); | |
| } | |
| .domain-name { | |
| font-size: 0.95rem; | |
| font-weight: 500; | |
| margin-top: 4px; | |
| } | |
| /* Badge for domain counts */ | |
| .domain-count { | |
| position: absolute; | |
| top: 8px; | |
| right: 8px; | |
| background: var(--accent-primary); | |
| color: #FFFFFF; | |
| font-size: 0.75rem; | |
| padding: 2px 8px; | |
| border-radius: 12px; | |
| font-weight: 600; | |
| opacity: 0.8; | |
| } | |
| /* Filter radio buttons styling - smaller for better fit */ | |
| .filter-radio { | |
| max-width: 100% !important; | |
| } | |
| .filter-radio .gr-row { | |
| gap: 8px !important; | |
| } | |
| .filter-radio .gr-column { | |
| min-width: 0 !important; | |
| flex: 1 !important; | |
| } | |
| .filter-radio .gr-form { | |
| min-width: 0 !important; | |
| } | |
| .filter-radio .gr-radio-group { | |
| gap: 4px !important; | |
| } | |
| .filter-radio .domain-radio { | |
| display: flex !important; | |
| gap: 4px !important; | |
| flex-wrap: nowrap !important; | |
| justify-content: center !important; | |
| } | |
| .filter-radio .domain-radio label { | |
| min-width: auto !important; | |
| max-width: 120px !important; | |
| padding: 8px 12px !important; | |
| font-size: 0.8rem !important; | |
| white-space: nowrap !important; | |
| overflow: hidden !important; | |
| text-overflow: ellipsis !important; | |
| } | |
| /* Additional targeting for the specific filter components */ | |
| .filter-radio .gr-box { | |
| padding: 8px !important; | |
| } | |
| .filter-radio .gr-radio { | |
| gap: 4px !important; | |
| } | |
| .filter-radio .gr-input-label { | |
| font-size: 0.85rem !important; | |
| margin-bottom: 4px !important; | |
| } | |
| /* Force compact layout for the filters */ | |
| @media (max-width: 1400px) { | |
| .filter-radio .domain-radio label { | |
| padding: 6px 10px !important; | |
| font-size: 0.75rem !important; | |
| } | |
| } | |
| /* Compact filter row styling */ | |
| .compact-filter-row { | |
| margin-bottom: 20px !important; | |
| } | |
| .compact-filter-row .gr-column { | |
| padding: 0 8px !important; | |
| } | |
| .compact-filter-row .gr-box { | |
| padding: 0 !important; | |
| } | |
| /* Compact radio button styling */ | |
| .compact-radio { | |
| width: 100% !important; | |
| } | |
| .compact-radio > label { | |
| font-size: 0.85rem !important; | |
| margin-bottom: 8px !important; | |
| font-weight: 600 !important; | |
| color: #FFFFFF !important; | |
| display: block !important; | |
| } | |
| .compact-radio .wrap { | |
| display: flex !important; | |
| flex-wrap: nowrap !important; | |
| gap: 4px !important; | |
| justify-content: center !important; | |
| } | |
| .compact-radio .wrap > label { | |
| display: inline-flex !important; | |
| align-items: center !important; | |
| justify-content: center !important; | |
| padding: 6px 10px !important; | |
| margin: 0 !important; | |
| background: var(--bg-card) !important; | |
| border: 1px solid var(--border-default) !important; | |
| border-radius: 8px !important; | |
| cursor: pointer !important; | |
| transition: all 0.2s ease !important; | |
| font-size: 0.75rem !important; | |
| white-space: nowrap !important; | |
| flex: 1 !important; | |
| min-width: 0 !important; | |
| overflow: hidden !important; | |
| text-overflow: ellipsis !important; | |
| } | |
| .compact-radio .wrap > label:has(input[type="radio"]:checked) { | |
| background: transparent !important; | |
| border-color: var(--accent-primary) !important; | |
| color: #FFFFFF !important; | |
| font-weight: 600 !important; | |
| } | |
| .compact-radio .wrap > label:hover { | |
| background: rgba(255, 210, 30, 0.1) !important; | |
| border-color: var(--accent-primary) !important; | |
| transform: scale(1.02) !important; | |
| } | |
| .compact-radio input[type="radio"] { | |
| display: none !important; | |
| } | |
| /* Target Gradio's data attributes for selected state */ | |
| .compact-radio label[data-selected="true"], | |
| .compact-radio label[aria-checked="true"], | |
| .domain-radio label[data-selected="true"], | |
| .domain-radio label[aria-checked="true"] { | |
| background: transparent !important; | |
| border-color: var(--accent-primary) !important; | |
| color: #FFFFFF !important; | |
| font-weight: 600 !important; | |
| } | |
| /* Sort by radio buttons */ | |
| .sort-by-radio .domain-radio { | |
| display: flex !important; | |
| gap: 10px !important; | |
| flex-wrap: wrap !important; | |
| justify-content: flex-start !important; | |
| } | |
| .sort-by-radio .domain-radio .wrap { | |
| display: flex !important; | |
| gap: 10px !important; | |
| flex-wrap: wrap !important; | |
| justify-content: flex-start !important; | |
| width: 100% !important; | |
| } | |
| .sort-by-radio .domain-radio label, | |
| .sort-by-radio .domain-radio .wrap > label { | |
| min-width: 180px !important; | |
| max-width: 220px !important; | |
| padding: 12px 20px !important; | |
| font-size: 0.95rem !important; | |
| } | |
| /* Leaderboard controls row styling */ | |
| .leaderboard-controls-row { | |
| margin: 20px 0 !important; | |
| padding: 20px !important; | |
| background: transparent !important; | |
| border: none !important; | |
| gap: 40px !important; | |
| } | |
| .leaderboard-controls-row .gr-column, | |
| .leaderboard-controls-row .gr-row, | |
| .leaderboard-controls-row .gr-box, | |
| .leaderboard-controls-row .gradio-column, | |
| .leaderboard-controls-row .gradio-row, | |
| .leaderboard-controls-row .gradio-group { | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| padding: 0 !important; | |
| } | |
| /* Remove all container backgrounds for leaderboard controls */ | |
| .leaderboard-controls-row * { | |
| background-color: transparent !important; | |
| background-image: none !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| } | |
| .leaderboard-controls-row .inline-radio, | |
| .leaderboard-controls-row .domain-radio { | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| } | |
| /* Inline radio styling for integrated controls */ | |
| .inline-radio { | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| padding: 0 !important; | |
| } | |
| .inline-radio .wrap { | |
| display: flex !important; | |
| gap: 8px !important; | |
| flex-wrap: wrap !important; | |
| justify-content: flex-start !important; | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| padding: 0 !important; | |
| } | |
| .inline-radio label { | |
| padding: 8px 16px !important; | |
| background: rgba(245, 246, 247, 0.06) !important; | |
| border: 1px solid var(--border-subtle) !important; | |
| border-radius: 20px !important; | |
| font-size: 0.85rem !important; | |
| color: #FFFFFF !important; | |
| transition: all 0.2s ease !important; | |
| cursor: pointer !important; | |
| } | |
| .inline-radio label:hover { | |
| background: rgba(255, 210, 30, 0.12) !important; | |
| border-color: var(--accent-primary) !important; | |
| } | |
| .inline-radio input[type="radio"]:checked + label, | |
| .inline-radio label[aria-checked="true"] { | |
| background: rgba(255, 210, 30, 0.2) !important; | |
| border-color: var(--accent-primary) !important; | |
| color: #FFFFFF !important; | |
| font-weight: 600 !important; | |
| } | |
| </style> | |
| """) | |
# --- Leaderboard table: title, filter controls and the table itself ---------
# NOTE(review): the `with` nesting below was reconstructed from a listing that
# had lost its indentation -- confirm it against the running layout.

# Choices offered by the task-level selector (the keys of level_details).
level_options = list(level_details.keys())

# Main leaderboard table with dynamic title and integrated controls
leaderboard_title = gr.HTML(update_leaderboard_title(default_level))

# Integrated controls within leaderboard section - stacked vertically
gr.HTML("<p style='color: var(--text-primary); margin: 5px 0 5px 0; font-size: 1.2rem;'>Select Task Level</p>")
domain_filter = gr.Radio(
    choices=level_options,
    value=default_level,
    label="",
    interactive=True,
    container=False,
    elem_classes=["domain-radio", "inline-radio"],
)

# NOTE(review): the glyph before "Filters" looks like a mis-encoded emoji; the
# original character cannot be recovered from here, so it is left untouched.
gr.HTML("<p style='color: var(--text-primary); margin: 5px 0 0px 0; font-size: 1.2rem;'>π Filters & Sorting</p>")
with gr.Row():
    with gr.Column(scale=1):
        gr.HTML("<span style='color: var(--text-primary); font-size: 1.2rem; margin-bottom: 5px; display: block;'>Model Access</span>")
        model_type_filter = gr.Radio(
            choices=["All", "OSS", "API"],
            value="All",
            label="",
            elem_classes=["domain-radio", "inline-radio"],
            container=False,
        )
    with gr.Column(scale=1):
        # The style attribute used to be left unclosed here, which made the
        # browser swallow the label text; it now mirrors the sibling
        # "Model Access" label so both headings render alike.
        gr.HTML("<span style='color: var(--text-primary); font-size: 1.2rem; margin-bottom: 5px; display: block;'>Sort Order</span>")
        sort_order = gr.Radio(
            choices=["Descending", "Ascending"],
            value="Descending",
            label="",
            elem_classes=["domain-radio", "inline-radio"],
            container=False,
        )

leaderboard_table = gr.HTML(initial_table)

# --- Radar Chart Section ----------------------------------------------------
# The wrapper <div> opened here is closed by the final gr.HTML("</div>") below.
gr.HTML("""
    <div class="domain-selector-container domain-performance-container">
        <div class="domain-header">
            <h2 class="domain-title" >Core Capability Radar</h2>
            <p class="domain-subtitle" style="color: var(--text-primary);">Track six essential axes: <br>success, execution, reasoning, robustness, efficiency, and call validity.</p>
        </div>
""")
gr.HTML("<p >Select models to compare (up to 5).</p>")
model_selector = gr.Dropdown(
    # Offer the first ten models of the initial table, pre-select the first five.
    choices=initial_df['Model'].tolist()[:10],
    value=initial_df['Model'].tolist()[:5],
    multiselect=True,
    label="",
    info=None,
    container=False,
)

# Radar chart plot - wrapped in centered container
gr.HTML('<div class="chart-container radar-chart-container">')
radar_chart = gr.Plot(
    label="",
    value=create_domain_radar_chart(
        load_leaderboard_data(),
        initial_df['Model'].tolist()[:5]
    ),
    elem_classes=["radar-chart", "plot-container"],
)
gr.HTML('</div>')
gr.HTML("</div>")
| # Define generate_performance_card function before using it | |
def generate_performance_card(model_name):
    """Generate the HTML for one model's performance card.

    Args:
        model_name: Value of the ``Model`` column identifying the model.
            A falsy value yields a "please select a model" placeholder.

    Returns:
        str: An HTML fragment. Either a centred placeholder message (no model
        selected / model not present in the loaded data) or the full card with
        identity header, rank, mini radar chart, six core metrics and one tile
        per level in ``level_ids``.
    """
    # Function-scope import keeps this block independent of file-level imports.
    import html

    def centered_notice(message):
        # Shared markup for the two placeholder states.
        return (
            '<div style="text-align: center; color: var(--text-secondary); padding: 40px;">\n'
            f'{message}\n'
            '</div>'
        )

    def is_blank(value):
        return pd.isna(value) or value == ''

    def format_score(value):
        # Three decimals, or 'N/A' for missing / non-numeric cells.
        if is_blank(value):
            return 'N/A'
        try:
            return f"{float(value):.3f}"
        except (TypeError, ValueError):
            return 'N/A'

    def clamp_unit(value):
        # Radar axes are drawn on [0, 1]; anything unusable collapses to 0.
        if is_blank(value):
            return 0.0
        try:
            return max(0.0, min(1.0, float(value)))
        except (TypeError, ValueError):
            return 0.0

    if not model_name:
        return centered_notice("Please select a model to generate its performance card")

    # Get model data
    df = load_leaderboard_data()
    model_data = df[df['Model'] == model_name]
    if model_data.empty:
        return centered_notice("Model not found in the database")
    row = model_data.iloc[0]

    # Overall rank: position among models with a numeric 'Overall Success',
    # best first. Models without a usable score are shown as 'N/A'.
    rank = 'N/A'
    if 'Overall Success' in df.columns:
        ranked = df.copy()
        ranked['Overall Success'] = pd.to_numeric(ranked['Overall Success'], errors='coerce')
        ranked = ranked[ranked['Overall Success'].notna()]
        ranked = ranked.sort_values('Overall Success', ascending=False).reset_index(drop=True)
        positions = ranked.index[ranked['Model'] == model_name]
        if len(positions) > 0:
            rank = int(positions[0]) + 1

    # (label, raw value) pairs, in display order. Note the 'Validity' axis is
    # read from the 'Call Validity' column.
    radar_metrics = [
        ("Execution Accuracy", row.get('Execution Accuracy')),
        ("Complex Reasoning", row.get('Complex Reasoning')),
        ("Robustness", row.get('Robustness')),
        ("Context & Efficiency", row.get('Context & Efficiency')),
        ("Overall Success", row.get('Overall Success')),
        ("Validity", row.get('Call Validity')),
    ]
    radar_labels = [label for label, _ in radar_metrics]
    radar_values = [clamp_unit(value) for _, value in radar_metrics]
    mini_radar_html = build_static_radar_chart(radar_values, radar_labels)

    # One (level, raw SR value) pair per level; a level without a mapped
    # column falls back to '' and is rendered as 'N/A'.
    level_blocks = [(level, row.get(sr_column_map.get(level), '')) for level in level_ids]

    evaluation_date = EVALUATION_DATE
    if KREW_ICON_BASE64:
        icon_html = f'<img src="data:image/png;base64,{KREW_ICON_BASE64}" alt="Krew icon" />'
    else:
        # NOTE(review): fallback glyph restored from mis-encoded text; a robot
        # emoji is the presumed original -- confirm.
        icon_html = '<div class="icon-fallback">🤖</div>'

    # Names come from the data file, so escape them before embedding in HTML.
    vendor = row.get('Vendor')
    vendor_text = 'N/A' if is_blank(vendor) else str(vendor)
    safe_model_name = html.escape(str(model_name))
    safe_vendor = html.escape(vendor_text)

    def metric_row(metrics):
        cells = "".join(
            f"""
                <div class="core-metric-card">
                    <div class="metric-label">{html.escape(label)}</div>
                    <div class="metric-value">{format_score(value)}</div>
                </div>
            """
            for label, value in metrics
        )
        return f"""
            <div class="core-metric-row">
            {cells}
            </div>
        """

    level_tiles = "".join(
        f"""
            <div class="level-tile">
                <div class="level-tile-label">{level}</div>
                <div class="level-tile-score">{format_score(value)}</div>
            </div>
        """
        for level, value in level_blocks
    )

    # Assemble the card from parts and join once instead of repeated +=.
    parts = [
        f"""
        <div class="performance-card">
            <div class="card-top-row">
                <div class="model-identity">
                    <div class="model-icon">{icon_html}</div>
                    <div class="model-meta">
                        <div class="card-model-name">{safe_model_name}</div>
                        <div class="meta-line">Vendor · <span>{safe_vendor}</span></div>
                        <div class="meta-line evaluation">
                            <span class="date-pill">Evaluation Date</span>
                            <span>{evaluation_date}</span>
                        </div>
                    </div>
                </div>
                <div class="rank-panel">
                    <div class="rank-label">RANK</div>
                    <div class="rank-value">#{rank}</div>
                </div>
            </div>
            <div class="card-main">
                <div class="card-body">
                    <div class="radar-slot">
                        {mini_radar_html}
                    </div>
                    <div class="core-section">
                        <div class="core-metric-grid">
        """,
        # Six metrics laid out as two rows of three.
        metric_row(radar_metrics[:3]),
        metric_row(radar_metrics[3:]),
        """
                        </div>
                    </div>
                </div>
                <div class="level-strip">
        """,
        level_tiles,
        """
                </div>
            </div>
        </div>
        """,
    ]
    return "".join(parts)
# --- MODEL PERFORMANCE CARD SECTION -----------------------------------------
# NOTE(review): the `with` nesting below was reconstructed from a listing that
# had lost its indentation -- confirm which widgets belong inside each column.
# The two wrapper <div>s opened here are closed by the gr.HTML call that
# follows the card display.
gr.HTML("""
    <div class="domain-selector-container performance-card-container">
        <div class="domain-header">
            <h2 class="domain-title" >Model Performance Card</h2>
            <p class="domain-subtitle" style="color: var(--text-primary);">
                Explore detailed performance cards that visualize six core metrics plus overall SR across L1–L7 levels.
            </p>
            <p class="domain-note" style="color: #bdbdbd; font-size: 0.85em; margin-top: 4px;">
                ※ Ranks are determined by the average SR across L1–L7.
            </p>
        </div>
        <div class="performance-card-content">
""")

# First model of the initial table (None when the table is empty); used both
# as the dropdown default and for the initially rendered card.
initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None

with gr.Column(elem_classes=["domain-selector-container", "model-selector-container"], elem_id="model-selector-box"):
    gr.HTML("""
        <p class="domain-subtitle" style="color: var(--text-primary);">Choose a model to generate its analysis card.</p>
    """)
    card_model_selector = gr.Dropdown(
        choices=initial_df['Model'].tolist(),
        value=initial_model,
        label="",
        info=None,
        container=False,
    )
    download_card_btn = gr.Button(
        "Download as PNG",
        elem_id="download-card-btn-en",
        elem_classes=["pill-button"],
    )

gr.HTML("""
    <div class="performance-card-display" id="card-display-container-en">
""")
# Card display area - generate initial card
initial_card_html = generate_performance_card(initial_model) if initial_model else ""
card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html-en")
gr.HTML("""
            </div>
        </div>
    </div>
""")

# --- Level metric breakdown section -----------------------------------------
gr.HTML("""
    <div class="domain-selector-container domain-performance-container level-metrics-wrapper">
        <div class="domain-header">
            <h2 class="domain-title" >Level-specific Metrics</h2>
            <p class="domain-subtitle" style="color: var(--text-primary);">Compare model scores with each Ko-AgentBench level's dedicated metrics for deeper insights.</p>
        </div>
""")
with gr.Column(elem_classes=["domain-selector-container", "level-selector-container"], elem_id="level-selector-box"):
    # Single-choice level picker followed by a multi-select model picker.
    level_metric_selector = gr.Dropdown(
        choices=level_ids,
        value=level_ids[0] if level_ids else None,
        multiselect=False,
        label="",
        info=None,
        container=False,
        elem_classes=["level-dropdown"],
    )
    level_model_selector = gr.Dropdown(
        choices=initial_level_model_choices,
        value=initial_level_model_values,
        multiselect=True,
        label="",
        info=None,
        container=False,
        elem_classes=["model-dropdown", "level-model-dropdown"],
    )
gr.HTML('<div class="chart-container level-metric-chart-container">')
level_metric_chart = gr.Plot(
    label="",
    value=initial_level_metric_chart,
    elem_classes=["level-metric-plot", "plot-container"],
)
gr.HTML("""
        </div>
    </div>
""")

# Heatmap section -- currently disabled; kept for reference.
# gr.HTML("""
#     <div class="domain-selector-container domain-performance-container heatmap-wrapper">
#         <div class="domain-header">
#             <h2 class="domain-title" >Comprehensive Performance Heatmap</h2>
#             <p class="domain-subtitle" >See each model's L1–L7 SR scores at a glance.</p>
#         </div>
#         <div class="chart-container heatmap-chart-container">
# """)
# heatmap_chart = gr.Plot(
#     label="",
#     value=initial_heatmap,
#     elem_classes=["heatmap-plot", "plot-container"]
# )
# gr.HTML("""
#         </div>
#     </div>
# """)
| # Update functions | |
def get_optimal_sort_order(sort_by_value):
    """Return the preferred sort direction for the given metric name.

    Args:
        sort_by_value: Name of the metric the table is sorted by.

    Returns:
        str: "Ascending" only for metrics listed as lower-is-better (none at
        present); "Descending" for everything else, including unknown names.
    """
    # Higher is better: overall success plus every per-level SR column.
    higher_is_better = ["Overall Success", *(sr_column_map[lvl] for lvl in level_ids)]
    # Lower is better: intentionally empty for now.
    lower_is_better = []

    if sort_by_value not in higher_is_better and sort_by_value in lower_is_better:
        return "Ascending"
    # Known higher-is-better metrics and the default fallback share a direction.
    return "Descending"
def update_table(level_filter, model_type_filter, sort_order):
    """Rebuild the leaderboard title and table for the current filters.

    Args:
        level_filter: Selected task level; the literal "ALL" sorts by overall
            success, any other value by that level's SR column.
        model_type_filter: Selected model-access filter value.
        sort_order: Selected sort direction.

    Returns:
        tuple: ``(title_html, table_html)``.
    """
    heading = update_leaderboard_title(level_filter)

    if level_filter == "ALL":
        metric = "Overall Success"
    else:
        # Unmapped levels fall back to the overall metric.
        metric = sr_column_map.get(resolve_level(level_filter), "Overall Success")

    rendered_table = filter_and_sort_data(level_filter, model_type_filter, metric, sort_order)
    return heading, rendered_table
def update_radar_chart(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
    """Refresh both model pickers and both charts after a filter change.

    Args:
        domain_filter: Selected task level ("ALL" sorts by overall success).
        model_type_filter: Selected model-access filter value.
        sort_order: Selected sort direction.
        selected_models: Models currently chosen for the radar chart.
        selected_level: Level currently chosen for the level-metric chart.
        level_selected_models: Models currently chosen for the level-metric chart.

    Returns:
        tuple: ``(radar model dropdown, radar figure, level model dropdown,
        level-metric figure)``.
    """
    max_models = 5       # selection cap advertised in the UI
    radar_choice_cap = 15  # radar picker only offers the top of the filtered list

    # Get filtered dataframe
    df = load_leaderboard_data()
    if domain_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(domain_filter), "Overall Success")
    filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)

    # Update model selector choices based on filtered data
    available_models_all = filtered_df['Model'].tolist()
    available_models = available_models_all[:radar_choice_cap]

    # Keep only selections that are still offered; fall back to the top models.
    valid_selected = [m for m in (selected_models or []) if m in available_models]
    if len(valid_selected) > max_models:
        gr.Warning("You can select up to 5 models.")
        # Truncate to the cap. Dropping just the last item would leave the
        # selection over the limit whenever more than six models arrive.
        valid_selected = valid_selected[:max_models]
    if not valid_selected:
        valid_selected = available_models[:max_models]

    # Create radar chart
    chart = create_domain_radar_chart(filtered_df, valid_selected)

    # Level metric chart: the level picker may hold a stale/unknown value.
    if selected_level in level_ids:
        effective_level = selected_level
    else:
        effective_level = level_ids[0] if level_ids else None

    available_level_models = available_models_all
    # The level picker is trimmed silently (no warning) on filter changes.
    valid_level_models = [m for m in (level_selected_models or []) if m in available_level_models][:max_models]
    if not valid_level_models:
        valid_level_models = available_level_models[:max_models]

    if effective_level:
        level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models)
    else:
        level_metric_fig = create_empty_level_metric_chart("Select a level to view its metrics")

    return (
        gr.Dropdown(
            choices=available_models,
            value=valid_selected,
            multiselect=True,
            label="",
            info=None,
            container=False,
        ),
        chart,
        gr.Dropdown(
            choices=available_level_models,
            value=valid_level_models,
            multiselect=True,
            label="",
            info=None,
            container=False,
            elem_classes=["model-dropdown", "level-model-dropdown"],
        ),
        level_metric_fig,
    )
def update_radar_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
    """Refresh pickers and charts after the radar model selection changes.

    Args:
        domain_filter: Selected task level ("ALL" sorts by overall success).
        model_type_filter: Selected model-access filter value.
        sort_order: Selected sort direction.
        selected_models: Models currently chosen for the radar chart.
        selected_level: Level currently chosen for the level-metric chart.
        level_selected_models: Models currently chosen for the level-metric chart.

    Returns:
        tuple: ``(radar model dropdown, radar figure, level model dropdown,
        level-metric figure)``.
    """
    max_models = 5       # selection cap advertised in the UI
    radar_choice_cap = 15  # radar picker only offers the top of the filtered list

    # Get filtered dataframe
    df = load_leaderboard_data()
    if domain_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(domain_filter), "Overall Success")
    filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
    available_models_all = filtered_df['Model'].tolist()

    # Validate the radar selection against every filtered model.
    valid_selected = [m for m in (selected_models or []) if m in available_models_all]
    if len(valid_selected) > max_models:
        # Surface a Gradio warning to the user.
        gr.Warning("You can select up to 5 models.")
        # Truncate to the cap. Dropping just the last item would leave the
        # selection over the limit whenever more than six models arrive.
        valid_selected = valid_selected[:max_models]
    if not valid_selected:
        valid_selected = available_models_all[:max_models]

    # The level picker may hold a stale/unknown value.
    if selected_level in level_ids:
        effective_level = selected_level
    else:
        effective_level = level_ids[0] if level_ids else None

    available_level_models = available_models_all
    # The level picker is trimmed silently (no warning) here.
    valid_level_models = [m for m in (level_selected_models or []) if m in available_level_models][:max_models]
    if not valid_level_models:
        valid_level_models = available_level_models[:max_models]

    if effective_level:
        level_metric_fig = create_level_metric_chart(filtered_df, effective_level, valid_level_models)
    else:
        level_metric_fig = create_empty_level_metric_chart("Select a level to view its metrics")

    return (
        gr.Dropdown(
            choices=available_models_all[:radar_choice_cap],
            value=valid_selected,
            multiselect=True,
            label="",
            info=None,
            container=False,
        ),
        create_domain_radar_chart(filtered_df, valid_selected),
        gr.Dropdown(
            choices=available_level_models,
            value=valid_level_models,
            multiselect=True,
            label="",
            info=None,
            container=False,
            elem_classes=["model-dropdown", "level-model-dropdown"],
        ),
        level_metric_fig,
    )
def update_level_metric_only(domain_filter, model_type_filter, sort_order, selected_models, selected_level, level_selected_models):
    """Refresh the level-metric model picker and chart.

    Args:
        domain_filter: Selected task level ("ALL" sorts by overall success).
        model_type_filter: Selected model-access filter value.
        sort_order: Selected sort direction.
        selected_models: Radar-chart selection; accepted so every chart
            callback shares one input list, but not used here.
        selected_level: Level currently chosen for the level-metric chart.
        level_selected_models: Models currently chosen for the level-metric chart.

    Returns:
        tuple: ``(level model dropdown, level-metric figure)``.
    """
    max_models = 5  # selection cap advertised in the UI

    df = load_leaderboard_data()
    if domain_filter == "ALL":
        sort_metric = "Overall Success"
    else:
        sort_metric = sr_column_map.get(resolve_level(domain_filter), "Overall Success")
    filtered_df, _, _ = apply_filters(df, domain_filter, model_type_filter, sort_order, sort_metric)
    available_models = filtered_df['Model'].tolist()

    # Keep only selections that survive the current filters.
    valid_level_models = [m for m in (level_selected_models or []) if m in available_models]
    if len(valid_level_models) > max_models:
        gr.Warning("You can select up to 5 models.")
        # Truncate to the cap. Dropping just the last item would leave the
        # selection over the limit whenever more than six models arrive.
        valid_level_models = valid_level_models[:max_models]
    if not valid_level_models:
        valid_level_models = available_models[:max_models]

    # The level picker may hold a stale/unknown value.
    if selected_level in level_ids:
        effective_level = selected_level
    else:
        effective_level = level_ids[0] if level_ids else None

    if effective_level:
        level_chart = create_level_metric_chart(filtered_df, effective_level, valid_level_models)
    else:
        level_chart = create_empty_level_metric_chart("Select a level to view its metrics")

    return (
        gr.Dropdown(
            choices=available_models,
            value=valid_level_models,
            multiselect=True,
            label="",
            info=None,
            container=False,
            elem_classes=["model-dropdown", "level-model-dropdown"],
        ),
        level_chart,
    )
# --- Event wiring -----------------------------------------------------------
# The three leaderboard filters drive the table and, in addition, both charts.
filter_inputs = [domain_filter, model_type_filter, sort_order]

for filter_control in filter_inputs:
    # Table and title follow every filter change ...
    filter_control.change(
        fn=update_table,
        inputs=filter_inputs,
        outputs=[leaderboard_title, leaderboard_table],
    )
    # ... and so do the radar chart, the level chart and their model pickers.
    filter_control.change(
        fn=update_radar_chart,
        inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
        outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart],
    )

# Changing the radar model selection refreshes both pickers and both charts.
model_selector.change(
    fn=update_radar_only,
    inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
    outputs=[model_selector, radar_chart, level_model_selector, level_metric_chart],
)

# The level picker and the level model picker share one handler that only
# touches the level-metric widgets.
for level_control in (level_metric_selector, level_model_selector):
    level_control.change(
        fn=update_level_metric_only,
        inputs=filter_inputs + [model_selector, level_metric_selector, level_model_selector],
        outputs=[level_model_selector, level_metric_chart],
    )
| # Add custom CSS for the performance card | |
| gr.HTML(""" | |
| <style> | |
| /* Performance Card Styles */ | |
| .performance-card { | |
| background: linear-gradient(140deg, rgba(1, 9, 26, 0.95) 0%, rgba(20, 34, 58, 0.65) 100%); | |
| border: 1px solid rgba(255, 210, 30, 0.35); | |
| border-radius: 24px; | |
| padding: 28px; | |
| max-width: 820px; | |
| margin: 0 auto; | |
| box-shadow: 0 18px 36px rgba(0, 0, 0, 0.35); | |
| display: flex; | |
| flex-direction: column; | |
| gap: 28px; | |
| } | |
| .card-top-row { | |
| display: flex; | |
| justify-content: space-between; | |
| gap: 24px; | |
| align-items: stretch; | |
| } | |
| .card-main { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 20px; | |
| } | |
| .model-identity { | |
| display: flex; | |
| gap: 20px; | |
| align-items: center; | |
| } | |
| .model-icon { | |
| width: 88px; | |
| height: 88px; | |
| background: rgba(245, 246, 247, 0.08); | |
| border-radius: 20px; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| padding: 12px; | |
| border: 1px solid rgba(245, 246, 247, 0.12); | |
| } | |
| .model-icon img { | |
| width: 100%; | |
| height: 100%; | |
| object-fit: contain; | |
| } | |
| .icon-fallback { | |
| font-size: 2.8rem; | |
| } | |
| .model-meta { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 6px; | |
| } | |
| .card-model-name { | |
| font-size: 1.9rem; | |
| font-weight: 800; | |
| letter-spacing: 0.01em; | |
| color: #FFFFFF; | |
| } | |
| .meta-line { | |
| font-size: 0.95rem; | |
| color: var(--text-secondary); | |
| display: flex; | |
| gap: 6px; | |
| align-items: center; | |
| } | |
| .meta-line span { | |
| color: #FFFFFF; | |
| font-weight: 600; | |
| } | |
| .date-pill { | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 6px; | |
| background: rgba(255, 210, 30, 0.18); | |
| color: var(--accent-primary); | |
| padding: 2px 10px; | |
| border-radius: 999px; | |
| font-size: 0.75rem; | |
| font-weight: 600; | |
| letter-spacing: 0.05em; | |
| text-transform: uppercase; | |
| } | |
| .rank-panel { | |
| min-width: 140px; | |
| background: rgba(255, 210, 30, 0.12); | |
| border: 1px solid rgba(255, 210, 30, 0.35); | |
| border-radius: 20px; | |
| display: flex; | |
| flex-direction: column; | |
| align-items: center; | |
| justify-content: center; | |
| padding: 18px; | |
| gap: 6px; | |
| } | |
| .rank-label { | |
| font-size: 0.9rem; | |
| letter-spacing: 0.1em; | |
| color: var(--text-secondary); | |
| } | |
| .rank-value { | |
| font-size: 2.4rem; | |
| font-weight: 800; | |
| color: #FFFFFF; | |
| letter-spacing: 0.04em; | |
| } | |
| .radar-legend { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(130px, 1fr)); | |
| gap: 10px; | |
| font-size: 0.8rem; | |
| color: var(--text-tertiary); | |
| letter-spacing: 0.04em; | |
| text-transform: uppercase; | |
| } | |
| .card-body { | |
| display: grid; | |
| grid-template-columns: 240px 1fr; | |
| gap: 28px; | |
| align-items: start; | |
| } | |
| .radar-slot { | |
| width: 240px; | |
| height: 240px; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| border-radius: 20px; | |
| background: rgba(245, 246, 247, 0.04); | |
| border: 1px dashed rgba(245, 246, 247, 0.12); | |
| padding: 10px; | |
| box-sizing: border-box; | |
| justify-self: center; | |
| } | |
| .radar-slot svg { | |
| width: 100%; | |
| height: 100%; | |
| } | |
| .heatmap-wrapper { | |
| margin-top: 40px; | |
| } | |
| .heatmap-chart-container { | |
| padding: 0 20px 32px; | |
| } | |
| .radar-chart-container { | |
| padding: 0 20px 32px; | |
| } | |
| .heatmap-plot { | |
| width: 100%; | |
| max-width: 1400px; | |
| margin: 0 auto; | |
| border-radius: 22px; | |
| overflow: hidden; | |
| background: rgba(245, 246, 247, 0.02); | |
| border: 1px solid var(--border-subtle); | |
| box-shadow: 0 14px 40px rgba(0, 0, 0, 0.35); | |
| } | |
| .heatmap-plot .modebar { | |
| display: none; | |
| } | |
| .level-metrics-wrapper { | |
| margin-top: 40px; | |
| } | |
| .level-metric-controls { | |
| justify-content: center; | |
| align-items: center; | |
| margin-bottom: 20px; | |
| gap: 12px; | |
| } | |
| .level-metric-chart-container { | |
| padding: 0 20px 32px; | |
| } | |
| .level-metric-plot { | |
| width: 100%; | |
| max-width: 1400px; | |
| margin: 0 auto; | |
| border-radius: 22px; | |
| overflow: hidden; | |
| background: rgba(245, 246, 247, 0.02); | |
| border: 1px solid var(--border-subtle); | |
| box-shadow: 0 14px 40px rgba(0, 0, 0, 0.35); | |
| } | |
| .level-dropdown { | |
| max-width: 260px !important; | |
| margin: 0 auto !important; | |
| } | |
| .level-dropdown select, | |
| .level-dropdown [role="combobox"], | |
| .level-dropdown button { | |
| background: #000000 !important; | |
| border: 1px solid #333333 !important; | |
| border-radius: 999px !important; | |
| padding: 12px 20px !important; | |
| color: #FFFFFF !important; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
| font-weight: 600 !important; | |
| font-size: 0.95rem !important; | |
| text-align: center !important; | |
| min-height: 46px !important; | |
| transition: all 0.3s ease !important; | |
| box-shadow: 0 10px 24px rgba(0, 0, 0, 0.3) !important; | |
| } | |
| .level-dropdown select:hover, | |
| .level-dropdown [role="combobox"]:hover, | |
| .level-dropdown button:hover:not(:disabled) { | |
| transform: translateY(-1px) !important; | |
| border-color: var(--accent-primary) !important; | |
| box-shadow: 0 12px 28px rgba(255, 210, 30, 0.25) !important; | |
| background: rgba(255, 210, 30, 0.12) !important; | |
| } | |
| .level-model-dropdown { | |
| width: 100% !important; | |
| margin: 12px auto 0 !important; | |
| } | |
| .level-model-dropdown select, | |
| .level-model-dropdown [role="combobox"], | |
| .level-model-dropdown button { | |
| background: #000000 !important; | |
| border: 1px solid #333333 !important; | |
| color: #FFFFFF !important; | |
| } | |
| .radar-placeholder { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 10px; | |
| color: var(--text-secondary); | |
| font-size: 1rem; | |
| letter-spacing: 0.03em; | |
| } | |
| .radar-placeholder small { | |
| font-size: 0.68rem; | |
| line-height: 1.6; | |
| color: var(--text-tertiary); | |
| } | |
| .core-section { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 20px; | |
| } | |
| .core-metric-grid { | |
| display: flex; | |
| flex-direction: column; | |
| gap: 12px; | |
| } | |
| .core-metric-row { | |
| display: grid; | |
| grid-template-columns: repeat(3, minmax(150px, 1fr)); | |
| gap: 12px; | |
| } | |
| .core-metric-card { | |
| background: rgba(245, 246, 247, 0.06); | |
| border: 1px solid rgba(245, 246, 247, 0.12); | |
| border-radius: 16px; | |
| padding: 18px; | |
| display: flex; | |
| flex-direction: column; | |
| gap: 8px; | |
| transition: transform 0.25s ease, border-color 0.25s ease; | |
| } | |
| .core-metric-card:hover { | |
| transform: translateY(-4px); | |
| border-color: rgba(255, 210, 30, 0.45); | |
| } | |
| .core-metric-card .metric-label { | |
| font-size: 0.95rem; | |
| color: var(--text-secondary); | |
| font-weight: 600; | |
| } | |
| .core-metric-card .metric-value { | |
| font-size: 1.8rem; | |
| font-weight: 700; | |
| color: #FFFFFF; | |
| font-family: 'Geist Mono', monospace; | |
| } | |
| .level-strip { | |
| display: grid; | |
| grid-template-columns: repeat(7, 1fr); | |
| gap: 12px; | |
| margin-top: 0; | |
| width: 100%; | |
| } | |
| .level-tile { | |
| background: rgba(245, 246, 247, 0.05); | |
| border: 1px solid rgba(245, 246, 247, 0.1); | |
| border-radius: 12px; | |
| padding: 12px 10px; | |
| display: flex; | |
| flex-direction: column; | |
| gap: 4px; | |
| text-align: center; | |
| min-width: 0; | |
| } | |
| .level-tile-label { | |
| font-size: 0.82rem; | |
| font-weight: 600; | |
| color: var(--text-secondary); | |
| } | |
| .level-tile-score { | |
| font-size: 1.25rem; | |
| font-weight: 700; | |
| color: #FFFFFF; | |
| font-family: 'Geist Mono', monospace; | |
| } | |
| @media (max-width: 980px) { | |
| .performance-card { | |
| padding: 20px; | |
| } | |
| .card-top-row { | |
| flex-direction: column; | |
| } | |
| .rank-panel { | |
| align-self: flex-start; | |
| } | |
| .card-body { | |
| grid-template-columns: 1fr; | |
| } | |
| .radar-slot { | |
| width: 100%; | |
| max-width: 280px; | |
| margin: 0 auto; | |
| } | |
| .radar-chart-container { | |
| overflow-x: auto; | |
| padding: 0 12px 28px; | |
| -webkit-overflow-scrolling: touch; | |
| } | |
| .radar-metric-plot { | |
| min-width: 720px; | |
| } | |
| .level-strip { | |
| grid-template-columns: repeat(4, 1fr); | |
| } | |
| .heatmap-chart-container { | |
| overflow-x: auto; | |
| padding: 0 12px 28px; | |
| -webkit-overflow-scrolling: touch; | |
| } | |
| .heatmap-plot { | |
| min-width: 720px; | |
| } | |
| .level-metric-chart-container { | |
| overflow-x: auto; | |
| padding: 0 12px 28px; | |
| -webkit-overflow-scrolling: touch; | |
| } | |
| .level-metric-plot { | |
| min-width: 720px; | |
| } | |
| } | |
| @media (max-width: 640px) { | |
| .level-strip { | |
| grid-template-columns: repeat(3, 1fr); | |
| } | |
| .heatmap-plot { | |
| min-width: 640px; | |
| } | |
| .level-metric-plot { | |
| min-width: 640px; | |
| } | |
| } | |
| /* Force fonts - highest priority */ | |
| .dashboard-section, | |
| .dashboard-section *, | |
| .dashboard-section h2, | |
| .dashboard-section h3, | |
| .dashboard-section p, | |
| .dashboard-section li, | |
| .section-lead, | |
| .section-subtitle, | |
| .phase-card h3, | |
| .phase-list li, | |
| .scenario-body p, | |
| .criteria-card h3, | |
| .criteria-card ul, | |
| .criteria-card li { | |
| font-family: "Nanum Gothic", sans-serif !important; | |
| } | |
| /* Force section-title styling */ | |
| .section-title, | |
| h2.section-title, | |
| .dashboard-section .section-title, | |
| .section-header .section-title { | |
| font-family: "Nanum Gothic", sans-serif !important; | |
| } | |
| .domain-title, | |
| h2.domain-title, | |
| .domain-header .domain-title { | |
| font-family: "Nanum Gothic", sans-serif !important; | |
| } | |
| .hero-title, | |
| .hero-subtitle, | |
| h1.hero-title, | |
| p.hero-subtitle { | |
| font-family: "Nanum Gothic", sans-serif !important; | |
| font-size: 2rem; !important; | |
| } | |
| /* Force hero-title sizing */ | |
| .hero-title, | |
| h1.hero-title { | |
| font-size: 4rem !important; | |
| } | |
| .phase-chart span, | |
| .phase-card .phase-chart span, | |
| .phase-grid .phase-chart span { | |
| font-family: "Nanum Gothic", sans-serif !important; | |
| font-size: 1.2rem !important; | |
| } | |
| .section-lead, .section-subtitle { | |
| font-size: 1.32rem !important; | |
| font-family: "Nanum Gothic", sans-serif !important; | |
| } | |
| .phase-card h3 { | |
| font-size: 1.44rem !important; | |
| font-family: "Nanum Gothic", sans-serif !important; | |
| } | |
| .phase-list li { | |
| font-size: 1.08rem !important; | |
| font-family: "Nanum Gothic", sans-serif !important; | |
| } | |
| </style> | |
| """) | |
| # Wire up the card generator to selection change | |
| card_model_selector.change( | |
| fn=generate_performance_card, | |
| inputs=[card_model_selector], | |
| outputs=[card_display] | |
| ) | |
| # Wire up download button with html2canvas capture | |
| download_card_btn.click( | |
| fn=None, | |
| js=""" | |
| async () => { | |
| const ensureHtml2Canvas = () => new Promise((resolve, reject) => { | |
| if (window.html2canvas) { | |
| resolve(window.html2canvas); | |
| return; | |
| } | |
| const existing = document.querySelector('script[data-html2canvas]'); | |
| if (existing) { | |
| existing.addEventListener('load', () => resolve(window.html2canvas)); | |
| existing.addEventListener('error', reject); | |
| return; | |
| } | |
| const script = document.createElement('script'); | |
| script.src = 'https://cdn.jsdelivr.net/npm/[email protected]/dist/html2canvas.min.js'; | |
| script.async = true; | |
| script.dataset.html2canvas = 'true'; | |
| script.onload = () => resolve(window.html2canvas); | |
| script.onerror = () => reject(new Error('Failed to load html2canvas')); | |
| document.head.appendChild(script); | |
| }); | |
| const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms)); | |
| await pause(60); | |
| const container = document.getElementById('performance-card-html-en'); | |
| const card = container?.querySelector('.performance-card'); | |
| if (!container || !card) { | |
| alert('Performance card not found. Please select a model first.'); | |
| return; | |
| } | |
| const btn = document.getElementById('download-card-btn-en'); | |
| const originalText = btn?.textContent || ''; | |
| if (btn) { | |
| btn.textContent = 'Generating...'; | |
| btn.disabled = true; | |
| } | |
| try { | |
| const html2canvasLib = await ensureHtml2Canvas(); | |
| if (!html2canvasLib) { | |
| throw new Error('html2canvas unavailable'); | |
| } | |
| const canvas = await html2canvasLib(card, { | |
| backgroundColor: '#01091A', | |
| scale: 2, | |
| logging: false, | |
| useCORS: true | |
| }); | |
| if (!canvas || !canvas.width || !canvas.height) { | |
| throw new Error('Captured canvas is empty'); | |
| } | |
| const link = document.createElement('a'); | |
| const modelName = card.querySelector('.card-model-name')?.textContent || 'model'; | |
| const timestamp = new Date().toISOString().slice(0, 10); | |
| const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`; | |
| link.download = fileName; | |
| const dataUrl = canvas.toDataURL('image/png'); | |
| if (!dataUrl || dataUrl === 'data:,' || dataUrl.length <= 'data:image/png;base64,'.length) { | |
| throw new Error('Failed to generate PNG data'); | |
| } | |
| link.href = dataUrl; | |
| document.body.appendChild(link); | |
| link.click(); | |
| document.body.removeChild(link); | |
| } catch (error) { | |
| console.error('Error capturing card:', error); | |
| alert('Failed to capture performance card. Please try again.'); | |
| } finally { | |
| if (btn) { | |
| btn.textContent = originalText; | |
| btn.disabled = false; | |
| } | |
| } | |
| } | |
| """ | |
| ) | |
| # Also update card when filters change to keep model selector in sync | |
| for input_component in filter_inputs: | |
| def update_dropdown_and_card(*args): | |
| filtered_df, _, _ = apply_filters( | |
| load_leaderboard_data(), | |
| args[0], | |
| args[1], | |
| args[2], | |
| "Overall Success" if args[0] == "ALL" else sr_column_map.get(resolve_level(args[0]), "Overall Success") | |
| ) | |
| choices = filtered_df['Model'].tolist() | |
| # Select first model from filtered list | |
| value = choices[0] if choices else None | |
| return gr.Dropdown( | |
| choices=choices, | |
| value=value, | |
| label="", | |
| info=None, | |
| container=False, | |
| # elem_classes=["model-dropdown"] | |
| ) | |
| input_component.change( | |
| fn=update_dropdown_and_card, | |
| inputs=filter_inputs, | |
| outputs=[card_model_selector] | |
| ) | |
| return leaderboard_table | |
def create_leaderboard_v2_interface():
    """Build the complete leaderboard interface.

    Thin entry point: delegates to ``create_leaderboard_v2_tab`` and hands
    back whatever that builder returns.
    """
    interface = create_leaderboard_v2_tab()
    return interface
def create_domain_radar_chart(df, selected_models=None, max_models=5):
    """Plot the six core capability scores of selected models as a radar chart.

    Args:
        df: Leaderboard frame with a ``Model`` column and the capability
            columns named in ``axis_specs`` below.
        selected_models: Model names to draw. When empty or ``None`` the
            first ``max_models`` rows are used, ordered by "Overall Success"
            (descending) if that column is present.
        max_models: Upper bound on the number of models drawn.

    Returns:
        A Plotly figure. The empty-state radar is returned when ``df`` has no
        rows or contains none of the capability columns.
    """
    frame = df.copy()

    # (column, axis label, hover description) for each radar axis.
    axis_specs = (
        ("Overall Success", "Overall Success", "Average SR across L1-L7"),
        ("Execution Accuracy", "Execution Accuracy", "CallEM Β· ArgAcc Β· SelectAcc"),
        ("Complex Reasoning", "Complex Reasoning", "ProvAcc Β· PSM Β· Coverage"),
        ("Robustness", "Robustness", "AdaptiveRouting Β· FallbackSR"),
        ("Context & Efficiency", "Context & Efficiency", "ReuseRate Β· EffScore Β· ContextRetention"),
        ("Call Validity", "Call Validity", "Average EPR_CVR across levels"),
    )
    metric_columns = [spec[0] for spec in axis_specs]
    axis_labels = [spec[1] for spec in axis_specs]
    axis_descriptions = [spec[2] for spec in axis_specs]

    has_any_metric = any(name in frame.columns for name in metric_columns)
    if frame.empty or not has_any_metric:
        return create_empty_radar_chart("Not enough data to build the capability radar")

    # Fall back to the leading rows when the caller did not pick models.
    if not selected_models:
        ranked = frame
        if "Overall Success" in frame.columns:
            ranked = frame.sort_values("Overall Success", ascending=False)
        selected_models = ranked['Model'].head(max_models).tolist()
    selected_models = selected_models[:max_models]

    # Coerce every available capability column to numbers (bad cells -> NaN).
    for name in metric_columns:
        if name in frame.columns:
            frame[name] = pd.to_numeric(frame[name], errors='coerce')

    palette = [
        {'fill': 'rgba(255, 210, 30, 0.25)', 'line': '#ffd21e'},
        {'fill': 'rgba(255, 138, 60, 0.22)', 'line': '#FF8A3C'},
        {'fill': 'rgba(161, 98, 7, 0.22)', 'line': '#A16207'},
        {'fill': 'rgba(220, 38, 38, 0.20)', 'line': '#DC2626'},
        {'fill': 'rgba(248, 250, 252, 0.20)', 'line': '#F8FAFC'},
    ]

    # Repeat the first point at the end so each polygon closes on itself.
    closed_labels = axis_labels + axis_labels[:1]
    closed_descriptions = axis_descriptions + axis_descriptions[:1]

    hover_template = "".join([
        "<b>%{fullData.name}</b><br>",
        "<span style='color: #94A3B8'>%{theta}</span><br>",
        "<span style='color: #F5E7CB; font-size: 12px;'>%{customdata}</span><br>",
        "<b style='font-size: 14px; color: #F5F6F7'>%{r:.3f}</b><br>",
        "<extra></extra>",
    ])

    figure = go.Figure()
    for position, model_name in enumerate(selected_models):
        matches = frame[frame['Model'] == model_name]
        if matches.empty:
            continue
        record = matches.iloc[0]

        scores = []
        for name in metric_columns:
            score = record[name] if name in record else float('nan')
            # Missing or blank scores are drawn at the centre of the radar.
            if pd.isna(score) or score == '':
                score = 0
            scores.append(float(score))

        swatch = palette[position % len(palette)]
        trace = go.Scatterpolar(
            r=scores + scores[:1],
            theta=closed_labels,
            fill='toself',
            fillcolor=swatch['fill'],
            line=dict(color=swatch['line'], width=3),
            marker=dict(
                size=10,
                color=swatch['line'],
                symbol='circle',
                line=dict(width=2, color='#01091A'),
            ),
            name=model_name,
            customdata=closed_descriptions,
            mode="lines+markers",
            hovertemplate=hover_template,
            hoverlabel=dict(
                bgcolor="rgba(1, 9, 26, 0.95)",
                bordercolor=swatch['line'],
                font=dict(color="white", size=12, family="'Geist', sans-serif"),
            ),
        )
        figure.add_trace(trace)

    radial_ticks = [step / 5 for step in range(6)]
    radial_tick_labels = [f"{tick:.2f}" for tick in radial_ticks]

    radial_axis = dict(
        visible=True,
        range=[0, 1],
        showline=True,
        linewidth=2,
        linecolor='rgba(245, 246, 247, 0.2)',
        gridcolor='rgba(245, 246, 247, 0.1)',
        gridwidth=1,
        tickvals=radial_ticks,
        ticktext=radial_tick_labels,
        tickfont=dict(size=11, color='white', family="'Geist Mono', monospace"),
    )
    angular_axis = dict(
        showline=True,
        linewidth=2,
        linecolor='rgba(245, 246, 247, 0.2)',
        gridcolor='rgba(245, 246, 247, 0.08)',
        tickfont=dict(size=13, family="'Geist', sans-serif", color='white', weight=600),
        rotation=90,
        direction="clockwise",
    )
    legend_style = dict(
        orientation="h",
        yanchor="bottom",
        y=-0.15,
        xanchor="center",
        x=0.5,
        font=dict(size=12, family="'Geist', sans-serif", color='white'),
        bgcolor='rgba(1, 9, 26, 0.8)',
        bordercolor='rgba(245, 246, 247, 0.2)',
        borderwidth=1,
        itemsizing='constant',
        itemwidth=30,
    )
    heading = dict(
        text="<b>Core Capability Radar</b>",
        x=0.5,
        y=0.97,
        font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700),
    )

    figure.update_layout(
        polar=dict(
            bgcolor='rgba(245, 246, 247, 0.03)',
            radialaxis=radial_axis,
            angularaxis=angular_axis,
        ),
        showlegend=True,
        legend=legend_style,
        title=heading,
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=900,
        margin=dict(t=30, b=50, l=10, r=10),
        autosize=True,
        annotations=[],
    )
    return figure
def create_performance_heatmap(df, ordered_models=None, max_models=12):
    """Render a heatmap of SR scores across task levels for selected models.

    Args:
        df: Leaderboard frame with a ``Model`` column and any of the
            ``L1_SR`` ... ``L7_SR`` columns.
        ordered_models: Optional model names fixing the left-to-right column
            order. Names missing from ``df`` and repeated names are dropped.
            When empty or ``None`` every model is used, sorted by
            "Overall Success" (or the first available SR column) descending.
        max_models: Upper bound on the number of models drawn.

    Returns:
        A Plotly figure; the empty-state heatmap when there is no usable SR
        data or no model left to draw.
    """
    df = df.copy()

    # Collect the (level, column) pairs that exist, coercing them to numbers.
    sr_columns = []
    for level in (f"L{i}" for i in range(1, 8)):
        col = f"{level}_SR"
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
            sr_columns.append((level, col))

    if df.empty or not sr_columns:
        return create_empty_heatmap("Not enough SR data to render the heatmap")

    df = df.drop_duplicates(subset=["Model"])
    if df.empty:
        return create_empty_heatmap("No models available to render the heatmap")

    # sr_columns holds (level, column) pairs, so the fallback sort key must be
    # the column name, not the whole tuple.
    sort_column = "Overall Success" if "Overall Success" in df.columns else sr_columns[0][1]
    df = df.sort_values(sort_column, ascending=False)

    available_models = df["Model"].tolist()
    if ordered_models:
        known_models = set(available_models)
        # dict.fromkeys drops repeats while keeping the caller's order; a
        # repeated name would otherwise yield duplicate index labels below.
        ordered_models = [m for m in dict.fromkeys(ordered_models) if m in known_models]
    else:
        ordered_models = available_models

    if not ordered_models:
        return create_empty_heatmap("No models available to render the heatmap")

    ordered_models = ordered_models[:max_models]
    heatmap_df = df.set_index("Model").reindex(ordered_models)

    # One heatmap row per level, one column per model; None marks a gap.
    level_labels = []
    z_matrix = []
    for level, col in sr_columns:
        level_labels.append(f"{level} Β· SR")
        z_matrix.append([
            None if pd.isna(value) else float(value)
            for value in heatmap_df[col].tolist()
        ])

    has_values = any(value is not None for row in z_matrix for value in row)
    if not level_labels or not has_values:
        return create_empty_heatmap("Not enough SR data to render the heatmap")

    colorscale = [
        [0.0, "#0A0A0A"],
        [0.25, "#1A1411"],
        [0.5, "#332818"],
        [0.75, "#B8660A"],
        [1.0, "#FFD21E"],
    ]

    fig = go.Figure()
    fig.add_trace(
        go.Heatmap(
            z=z_matrix,
            x=ordered_models,
            y=level_labels,
            colorscale=colorscale,
            zmin=0,
            zmax=1,
            hovertemplate="<b>%{y}</b><br><span style='color:#FFD21E'>%{x}</span><br>SR Β· %{z:.3f}<extra></extra>",
            colorbar=dict(
                # Nested title/font form instead of the deprecated
                # ``titlefont`` shortcut.
                title=dict(
                    text="Success Rate",
                    font=dict(color="white", family="'Geist', sans-serif", size=12),
                ),
                tickfont=dict(color="white", family="'Geist', sans-serif", size=10),
                thickness=12,
                len=0.7,
                outlinecolor="rgba(255, 255, 255, 0.1)",
                bgcolor="rgba(1, 9, 26, 0.75)",
            ),
            showscale=True,
        )
    )

    # Print the score inside each cell; dark text on the bright end of the
    # colour scale, light text elsewhere.
    annotations = []
    for level_label, row_values in zip(level_labels, z_matrix):
        for model, value in zip(ordered_models, row_values):
            if value is None:
                continue
            font_color = "#0B1120" if value >= 0.6 else "#F8FAFC"
            annotations.append(
                dict(
                    x=model,
                    y=level_label,
                    text=f"{value:.3f}",
                    showarrow=False,
                    font=dict(
                        family="'Geist Mono', monospace",
                        size=11,
                        color=font_color,
                    ),
                )
            )

    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        margin=dict(t=80, b=90, l=110, r=160),
        height=520,
        width=1450,
        font=dict(family="'Geist', sans-serif", color="white"),
        xaxis=dict(
            tickangle=-25,
            showgrid=False,
            ticks="",
            tickfont=dict(size=11, family="'Geist', sans-serif", color="white"),
        ),
        yaxis=dict(
            showgrid=False,
            ticks="",
            tickfont=dict(size=12, family="'Geist', sans-serif", color="white"),
        ),
        annotations=annotations,
        title=dict(
            text="<b>Comprehensive Performance Heatmap</b>",
            x=0.5,
            y=0.98,
            font=dict(
                size=20,
                family="'Geist', sans-serif",
                color="white",
                weight=700,
            ),
        ),
    )
    fig.update_xaxes(side="bottom")
    return fig
def create_empty_heatmap(message):
    """Build the heatmap's empty state: a titled, axis-less figure with ``message`` centred."""
    placeholder = go.Figure()

    message_font = dict(size=18, color="white", family="'Geist', sans-serif")
    placeholder.add_annotation(
        text=f"πΊοΈ {message}",
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        xanchor='center',
        yanchor='middle',
        font=message_font,
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20,
    )

    heading = dict(
        text="<b>Comprehensive Performance Heatmap</b>",
        x=0.5,
        y=0.98,
        font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700),
    )
    # No fixed width: the figure is left to size itself horizontally.
    placeholder.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=520,
        autosize=True,
        margin=dict(t=80, b=80, l=80, r=160),
        title=heading,
    )

    # Nothing is plotted, so both axes are hidden.
    placeholder.update_xaxes(visible=False)
    placeholder.update_yaxes(visible=False)
    return placeholder
def create_level_metric_chart(df, level, selected_models=None, max_models=5):
    """Render a grouped horizontal bar chart of per-model scores for one level.

    Only columns named ``"<level>_<metric>"`` are considered. A column is
    charted when its name does not mention cost and all of its numeric values
    lie within ``[0, 1.05]``.

    Args:
        df: Leaderboard frame with a ``Model`` column and level-prefixed
            metric columns.
        level: Level identifier used as the column prefix (for example
            ``"L3"``). A falsy value yields the empty-state chart.
        selected_models: Optional model names to draw, in order. Names missing
            from ``df`` and repeated names are dropped. When nothing usable
            remains, models are ranked by "Overall Success" (or the first
            charted metric) descending.
        max_models: Upper bound on the number of models drawn.

    Returns:
        A Plotly figure; an empty-state chart when there is nothing to draw.
    """
    if not level:
        return create_empty_level_metric_chart("Select a level to view its metrics")

    df = df.copy()
    level_prefix = f"{level}_"

    # Keep the level's columns that look like 0-1 scores.
    metric_columns = []
    for col in list(df.columns):
        if not col.startswith(level_prefix):
            continue
        if "cost" in col[len(level_prefix):].lower():
            continue
        numeric_series = pd.to_numeric(df[col], errors='coerce')
        valid_values = numeric_series.dropna()
        if valid_values.empty:
            continue
        if (valid_values < 0).any() or (valid_values > 1.05).any():
            continue
        df[col] = numeric_series
        metric_columns.append(col)

    if not metric_columns:
        return create_empty_level_metric_chart("This level has no 0-1 metrics to visualize")

    df = df.drop_duplicates(subset=['Model'])
    if df.empty:
        return create_empty_level_metric_chart("No models available to render level metrics")

    available_models = df['Model'].tolist()
    if selected_models:
        known_models = set(available_models)
        # dict.fromkeys drops repeats while keeping the caller's order, so a
        # model is never drawn twice.
        model_order = [m for m in dict.fromkeys(selected_models) if m in known_models]
    else:
        sort_col = 'Overall Success' if 'Overall Success' in df.columns else metric_columns[0]
        model_order = df.sort_values(sort_col, ascending=False)['Model'].tolist()
    if not model_order:
        model_order = available_models
    model_order = model_order[:max_models]

    df_models = df[df['Model'].isin(model_order)].set_index('Model')
    if df_models.empty:
        return create_empty_level_metric_chart("No matching models for selected filters")

    # Human-readable axis labels; colliding labels get a "(2)", "(3)" suffix.
    metric_labels = []
    for col in metric_columns:
        label = _prettify_level_metric_name(col, level_prefix)
        if label in metric_labels:
            suffix = 2
            while f"{label} ({suffix})" in metric_labels:
                suffix += 1
            label = f"{label} ({suffix})"
        metric_labels.append(label)

    model_palette = [
        '#ffd21e',
        '#FF8A3C',
        '#A16207',
        '#DC2626',
        '#F8FAFC',
        '#38BDF8',
    ]

    fig = go.Figure()
    max_value = 0
    for idx, model in enumerate(model_order):
        values = []
        for col in metric_columns:
            value = df_models.at[model, col]
            if pd.notna(value):
                values.append(float(value))
                max_value = max(max_value, float(value))
            else:
                values.append(None)
        fig.add_trace(
            go.Bar(
                name=model,
                y=metric_labels,
                x=values,
                orientation='h',
                marker=dict(
                    color=model_palette[idx % len(model_palette)],
                    line=dict(color='rgba(1,9,26,0.8)', width=1),
                ),
                hovertemplate="<b>%{y}</b><br>Model Β· <span style='color:#FFD21E'>%{fullData.name}</span><br>Score Β· %{x:.3f}<extra></extra>",
            )
        )

    # Grow the figure with the number of metrics so bars stay readable.
    plot_height = max(360, 140 + 48 * len(metric_labels))
    # Leave a little headroom past the longest bar; fall back to the full
    # 0-1 range when every score is zero or missing.
    x_range = [0, 1] if max_value <= 0 else [0, max_value * 1.05]

    fig.update_layout(
        barmode='group',
        bargap=0.25,
        bargroupgap=0.18,
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=plot_height,
        autosize=True,
        margin=dict(t=90, b=80, l=220, r=160),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            bgcolor='rgba(1, 9, 26, 0.75)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            font=dict(size=11, family="'Geist', sans-serif", color='white'),
        ),
        xaxis=dict(
            title=dict(text=f"<b>{level} Metric Score</b>", font=dict(size=14, color="white")),
            tickfont=dict(size=11, color="white"),
            gridcolor='rgba(245, 246, 247, 0.08)',
            zerolinecolor='rgba(245, 246, 247, 0.18)',
            range=x_range,
        ),
        yaxis=dict(
            tickfont=dict(size=13, color="white"),
            automargin=True,
        ),
        title=dict(
            text=f"<b>{level} Metric Breakdown</b>",
            x=0.5,
            y=0.98,
            font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700),
        ),
    )
    return fig


def _prettify_level_metric_name(metric_key, level_prefix):
    """Turn a ``"<level>_<Metric>"`` column name into a display label."""
    raw = metric_key[len(level_prefix):]
    text = raw.replace('_', ' ')
    # Split camelCase at a lower/digit -> upper boundary, and between an
    # acronym and a following capitalised word, so runs of capitals such as
    # "SR" or "EPR" stay together instead of being broken into letters.
    text = re.sub(r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', text)
    text = text.replace('Avg', 'Average')
    # str.title() lower-cases acronyms ("SR" -> "Sr"); restore the known ones.
    acronyms = {
        'Sr': 'SR',
        'Ac': 'AC',
        'Tsq': 'TSQ',
        'Cvr': 'CVR',
        'Psm': 'PSM',
        'Epr': 'EPR',
    }
    words = [acronyms.get(word, word) for word in text.title().split()]
    # "CallEM" is a single metric name; rejoin it after the word split.
    return ' '.join(words).replace('Call Em', 'CallEM')
def create_empty_level_metric_chart(message):
    """Build the level-metric chart's empty state: a titled, axis-less figure with ``message`` centred."""
    placeholder = go.Figure()

    notice = dict(
        text=f"π§ {message}",
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        xanchor='center',
        yanchor='middle',
        font=dict(size=18, color="white", family="'Geist', sans-serif"),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20,
    )
    placeholder.add_annotation(**notice)

    heading = dict(
        text="<b>Level Metric Breakdown</b>",
        x=0.5,
        y=0.98,
        font=dict(size=20, family="'Geist', sans-serif", color="white", weight=700),
    )
    placeholder.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=420,
        width=1450,
        margin=dict(t=80, b=60, l=80, r=120),
        title=heading,
    )

    # Nothing is plotted, so both axes are hidden.
    placeholder.update_xaxes(visible=False)
    placeholder.update_yaxes(visible=False)
    return placeholder
def create_empty_radar_chart(message):
    """Create the radar chart's empty state with ``message`` centred.

    Args:
        message: Text shown in the middle of the otherwise empty figure.

    Returns:
        A titled Plotly figure containing only the message annotation.
    """
    fig = go.Figure()
    fig.add_annotation(
        text=f"π {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="white",
            family="'Geist', sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )
    # The layout update must not pass ``annotations``: a list given here is
    # merged into the annotation added above, and the text-less entry that
    # used to be passed moved the message to the bottom-right corner in small
    # grey type.
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1450,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        title=dict(
            text="<b>Core Capability Radar</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="white",
                weight=700
            ),
        ),
    )
    return fig
| # NEW VISUALIZATION FUNCTIONS | |
def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create a scatter plot of session cost against a performance metric.

    Each model is one marker: x is ``metric``, y is "Avg Total Cost", marker
    size scales with "Avg Turns", and colour encodes "Model Type". Dashed
    lines mark the medians of both axes.

    Args:
        df: Frame with "Model", "Model Type", "Avg Total Cost", "Avg Turns"
            and ``metric`` columns.
        metric: Name of the performance column plotted on the x axis.

    Returns:
        A Plotly figure; the empty-state chart when no row has both a cost
        and a metric value.
    """
    cost_col = 'Avg Total Cost'
    turns_col = 'Avg Turns'
    empty_message = "No data available for cost-performance analysis"

    # Filter out models without cost or performance data
    df_filtered = df[(df[cost_col] != '') & (df[metric] != '')].copy()
    if df_filtered.empty:
        return create_empty_chart(empty_message)

    # Convert to numeric; cells that cannot be parsed become NaN
    for col in (cost_col, metric, turns_col):
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='coerce')

    # A point needs both coordinates, so drop rows where either failed to parse
    df_filtered = df_filtered.dropna(subset=[cost_col, metric])
    if df_filtered.empty:
        return create_empty_chart(empty_message)

    # Legend names and marker colours per model type
    label_map = {
        'Proprietary': 'API',
        'Open source': 'OSS'
    }
    color_map = {
        'Proprietary': '#1098F7',  # Airglow Blue for Proprietary
        'Open source': '#58BC82'   # Green for Open source
    }

    fig = go.Figure()

    # One trace per model type, in order of first appearance
    for model_type, df_type in df_filtered.groupby('Model Type', sort=False):
        fig.add_trace(go.Scatter(
            x=df_type[metric],
            y=df_type[cost_col],
            mode='markers+text',
            name=label_map.get(model_type, model_type),
            text=df_type['Model'],
            textposition="top center",
            textfont=dict(size=10, color='white'),
            # Carry the real turn count for the hover text; marker.size is
            # the scaled value and would overstate it threefold.
            customdata=df_type[turns_col],
            marker=dict(
                size=df_type[turns_col] * 3,  # Size based on number of turns
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A')
            ),
            hovertemplate="<b>%{text}</b><br>" +
                          f"{metric}: %{{x:.3f}}<br>" +
                          "Cost: $%{y:.3f}<br>" +
                          "Turns: %{customdata:.1f}<br>" +
                          "<extra></extra>"
        ))

    # Add quadrant lines at the medians
    median_x = df_filtered[metric].median()
    median_y = df_filtered[cost_col].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Add quadrant labels
    fig.add_annotation(x=0.95, y=0.05, text="π High Performance<br>Low Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="white"), bgcolor="rgba(245, 246, 247, 0.1)")
    fig.add_annotation(x=0.05, y=0.95, text="β οΈ Low Performance<br>High Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#ffd21e"), bgcolor="rgba(255, 210, 30, 0.1)")

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Cost ($)</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='white'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1
        ),
        margin=dict(t=100, b=80, l=80, r=80)
    )
    return fig
def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Create a scatter plot of session duration against a performance metric.

    Each model is one marker: x is ``metric``, y is "Avg Session Duration",
    and marker colour encodes "Avg Total Cost". Dashed lines mark the medians
    of both axes.

    Args:
        df: Frame with "Model", "Avg Session Duration", "Avg Total Cost" and
            ``metric`` columns.
        metric: Name of the performance column plotted on the x axis.

    Returns:
        A Plotly figure; the empty-state chart when no row has both a
        duration and a metric value.
    """
    duration_col = 'Avg Session Duration'
    cost_col = 'Avg Total Cost'

    # Keep only models that report both a duration and the metric.
    has_duration = df[duration_col] != ''
    has_metric = df[metric] != ''
    points = df[has_duration & has_metric].copy()
    if points.empty:
        return create_empty_chart("No data available for speed-accuracy analysis")

    # Numeric coercion: the two plotted axes first, then the colour channel.
    for name in (duration_col, metric, cost_col):
        points[name] = pd.to_numeric(points[name], errors='coerce')

    cost_colorbar = dict(
        title=dict(text="Cost ($)", font=dict(color="white")),
        tickfont=dict(color="white"),
        bgcolor="rgba(1, 9, 26, 0.8)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        x=1.02,
    )
    marker_style = dict(
        size=12,
        color=points[cost_col],
        colorscale=[[0, '#0A0A0A'], [0.5, '#B8660A'], [1, '#ffd21e']],
        showscale=True,
        colorbar=cost_colorbar,
        line=dict(width=2, color='#01091A'),
    )
    hover_template = "".join([
        "<b>%{text}</b><br>",
        f"{metric}: %{{x:.3f}}<br>",
        "Duration: %{y:.1f}s<br>",
        "Cost: $%{marker.color:.3f}<br>",
        "<extra></extra>",
    ])

    figure = go.Figure()
    figure.add_trace(go.Scatter(
        x=points[metric],
        y=points[duration_col],
        mode='markers+text',
        text=points['Model'],
        textposition="top center",
        textfont=dict(size=9, color='white'),
        marker=marker_style,
        hovertemplate=hover_template,
    ))

    # Median guides split the plot into quadrants.
    metric_median = points[metric].median()
    duration_median = points[duration_col].median()
    figure.add_hline(y=duration_median, line_dash="dash", line_color="#64748B", opacity=0.5)
    figure.add_vline(x=metric_median, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Corner captions for the best and worst quadrants.
    figure.add_annotation(
        x=0.95, y=0.05, text="β‘ Fast & Accurate",
        showarrow=False, xref="paper", yref="paper",
        font=dict(size=12, color="white", weight=600),
    )
    figure.add_annotation(
        x=0.05, y=0.95, text="π Slow & Inaccurate",
        showarrow=False, xref="paper", yref="paper",
        font=dict(size=12, color="#ffd21e", weight=600),
    )

    if metric == "Avg AC":
        metric_display = "Action Completion"
    else:
        metric_display = "Tool Selection Quality"

    def axis_style(caption):
        # Both axes share tick and grid styling; only the caption differs.
        return dict(
            title=dict(text=caption, font=dict(size=16, color="white")),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)",
        )

    figure.update_layout(
        title=dict(
            text=f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700),
        ),
        xaxis=axis_style(f"<b>{metric_display}</b>"),
        yaxis=axis_style("<b>Average Session Duration (seconds)</b>"),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=120),
    )
    return figure
def create_domain_specialization_matrix(df, metric_type="AC"):
    """Create a bubble chart showing domain specialization per model.

    One bubble is drawn for every (model, domain) pair that has a numeric
    score. Bubble size follows the domain score itself; bubble colour follows
    the "specialization", i.e. the domain score minus that model's own
    average for the same metric.

    Args:
        df: Leaderboard DataFrame. Columns read: ``Model``,
            ``Avg <metric_type>`` and ``<domain> <metric_type>`` for each of
            the five business domains. ``Model Type`` is carried along when
            present but is not required.
        metric_type: Column suffix selecting the metric. ``"AC"`` is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        The populated ``go.Figure``, or whatever ``create_empty_chart``
        returns when there is nothing to plot.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    avg_col = f'Avg {metric_type}'
    empty_message = "No domain specialization data available"

    # Without the average column there is no baseline to measure
    # specialization against, so fall back to the placeholder chart
    # instead of raising KeyError on the first row.
    if avg_col not in df.columns:
        return create_empty_chart(empty_message)

    # Prepare data: one record per (model, domain) pair with a usable score
    data = []
    for _, model in df.iterrows():
        model_name = model['Model']
        if pd.isna(model_name) or model_name == '':
            continue

        model_avg = pd.to_numeric(model[avg_col], errors='coerce')
        if pd.isna(model_avg):
            continue

        for domain in domains:
            domain_col = f'{domain} {metric_type}'
            if domain_col not in model or model[domain_col] == '':
                continue

            domain_val = pd.to_numeric(model[domain_col], errors='coerce')
            if pd.isna(domain_val):
                continue

            data.append({
                'Model': model_name,
                'Domain': domain,
                'Performance': domain_val,
                # Specialization strength = deviation from the model average
                'Specialization': domain_val - model_avg,
                # Not used by the chart below, so a missing column must not
                # abort rendering; .get() yields None in that case.
                'Model Type': model.get('Model Type'),
            })

    if not data:
        return create_empty_chart(empty_message)

    df_plot = pd.DataFrame(data)

    # Plotly rejects negative marker sizes, so clamp before scaling.
    bubble_sizes = df_plot['Performance'].clip(lower=0) * 30

    # Create bubble chart
    fig = go.Figure()

    # Color based on specialization strength
    fig.add_trace(go.Scatter(
        x=df_plot['Domain'],
        y=df_plot['Model'],
        mode='markers',
        marker=dict(
            size=bubble_sizes,  # Size based on absolute performance
            color=df_plot['Specialization'],
            colorscale=[[0, '#B8660A'], [0.5, '#E6B800'], [1, '#ffd21e']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Specialization<br>Strength",
                    font=dict(color="white")
                ),
                tickfont=dict(color="white"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1
            ),
            line=dict(width=2, color='#01091A'),
            opacity=0.8
        ),
        text=[f"Performance: {p:.3f}<br>Specialization: {s:+.3f}"
              for p, s in zip(df_plot['Performance'], df_plot['Specialization'])],
        hovertemplate="<b>%{y}</b><br>" +
                      "Domain: %{x}<br>" +
                      "%{text}<br>" +
                      "<extra></extra>"
    ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"
    fig.update_layout(
        title=dict(
            text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text="<b>Business Domains</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=13, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Models</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=11, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1100,
        width=1450,
        # Wide left margin leaves room for the model names on the y axis
        margin=dict(t=100, b=80, l=220, r=120)
    )
    return fig
def create_performance_gap_analysis(df, metric_type="AC"):
    """Create a range plot showing the spread of scores within each domain.

    For every business domain whose ``<domain> <metric_type>`` column exists
    and holds at least one numeric value, the chart draws the min-max range,
    the Q1-Q3 interquartile span, a diamond at the median, and white dots at
    the two extremes. Domains are ordered by ascending gap (max - min).

    Args:
        df: Leaderboard DataFrame with ``<domain> <metric_type>`` columns.
            Non-numeric cells are coerced to NaN and ignored.
        metric_type: Column suffix selecting the metric. ``"AC"`` is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality". The x axis is fixed to ``[0, 1]`` for
            ``"AC"`` and ``"TSQ"`` and auto-ranged otherwise.

    Returns:
        The populated ``go.Figure``, or whatever ``create_empty_chart``
        returns when no domain has numeric data.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    # Calculate min, max, median and quartiles for each domain
    gap_data = []
    for domain in domains:
        domain_col = f'{domain} {metric_type}'
        if domain_col not in df.columns:
            continue

        domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
        if domain_values.empty:
            continue

        lowest = domain_values.min()
        highest = domain_values.max()
        gap_data.append({
            'Domain': domain,
            'Min': lowest,
            'Max': highest,
            'Median': domain_values.median(),
            'Q1': domain_values.quantile(0.25),
            'Q3': domain_values.quantile(0.75),
            'Gap': highest - lowest
        })

    if not gap_data:
        return create_empty_chart("No data available for gap analysis")

    df_gap = pd.DataFrame(gap_data).sort_values('Gap', ascending=True)

    fig = go.Figure()

    # Add range bars
    for _, row in df_gap.iterrows():
        domain_name = row['Domain']

        # Add full range line
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[domain_name, domain_name],
            mode='lines',
            line=dict(color='#64748B', width=2),
            showlegend=False,
            hoverinfo='skip'
        ))

        # Add IQR box
        # NOTE(review): all five vertices share the same y category, so the
        # closed path has no height and the fill encloses no area; this is
        # effectively a Q1-Q3 segment rather than a box.
        fig.add_trace(go.Scatter(
            x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']],
            y=[domain_name] * 5,
            fill='toself',
            fillcolor='rgba(255, 210, 30, 0.3)',
            line=dict(color='#ffd21e', width=2),
            showlegend=False,
            hoverinfo='skip',
            mode='lines'
        ))

        # Add median marker; it is the only trace that carries hover text
        fig.add_trace(go.Scatter(
            x=[row['Median']],
            y=[domain_name],
            mode='markers',
            marker=dict(
                size=12,
                color='#ffd21e',
                symbol='diamond',
                line=dict(width=2, color='#01091A')
            ),
            showlegend=False,
            hovertemplate=f"<b>{domain_name}</b><br>" +
                          f"Min: {row['Min']:.3f}<br>" +
                          f"Q1: {row['Q1']:.3f}<br>" +
                          f"Median: {row['Median']:.3f}<br>" +
                          f"Q3: {row['Q3']:.3f}<br>" +
                          f"Max: {row['Max']:.3f}<br>" +
                          f"Gap: {row['Gap']:.3f}<br>" +
                          "<extra></extra>"
        ))

    # Add min/max points in a second pass so these traces are added after
    # every range/IQR/median trace
    for _, row in df_gap.iterrows():
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='markers',
            marker=dict(size=8, color='white', line=dict(width=2, color='#01091A')),
            showlegend=False,
            hoverinfo='skip'
        ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"
    fig.update_layout(
        title=dict(
            text=f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="white", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display} Score</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=12, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
        ),
        yaxis=dict(
            title=dict(
                text="<b>Business Domain</b>",
                font=dict(size=16, color="white")
            ),
            tickfont=dict(size=13, color="white"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1450,
        margin=dict(t=100, b=80, l=140, r=80),
        showlegend=False
    )

    # Add legend manually (the built-in legend is disabled above). The
    # glyphs were mis-encoded into three identical characters; restored so
    # each entry has a distinct symbol, with the diamond matching the
    # median marker.
    fig.add_annotation(
        text="◆ Median ▬ IQR ─ Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=12, color='white'),
        showarrow=False
    )
    return fig
| def create_empty_chart(message): | |
| """Create an empty chart with a message""" | |
| fig = go.Figure() | |
| fig.add_annotation( | |
| text=f"π {message}", | |
| xref="paper", yref="paper", | |
| x=0.5, y=0.5, | |
| xanchor='center', yanchor='middle', | |
| font=dict( | |
| size=18, | |
| color="white", | |
| family="'Geist', sans-serif" | |
| ), | |
| showarrow=False, | |
| bgcolor="rgba(245, 246, 247, 0.05)", | |
| bordercolor="rgba(245, 246, 247, 0.2)", | |
| borderwidth=1, | |
| borderpad=20 | |
| ) | |
| fig.update_layout( | |
| paper_bgcolor="#01091A", | |
| plot_bgcolor="rgba(245, 246, 247, 0.02)", | |
| height=700, | |
| width=1450, | |
| margin=dict(t=80, b=80, l=80, r=80) | |
| ) | |