Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import gradio as gr | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| # Import components and styles from modular files | |
| from components.leaderboard_components import ( | |
| get_chart_colors, get_rank_badge, get_type_badge, | |
| get_output_type_badge, get_score_bar, get_metric_tooltip, | |
| get_responsive_styles, get_faq_section, SORT_COLUMN_MAP | |
| ) | |
| from components.prediction_components import create_ac_prediction_chart | |
| from styles.leaderboard_styles import get_leaderboard_css | |
| def create_leaderboard_v2_tab(): | |
| """Create the main leaderboard v2 tab with interactive table""" | |
def load_leaderboard_data():
    """Read ``results_v2.csv`` and return it as a display-ready DataFrame.

    The averaged metric columns that are present in the file are coerced to
    numbers (cells that cannot be parsed become NaN) and rounded to three
    decimals. Afterwards every missing value in the frame is replaced by an
    empty string.

    Returns:
        pandas.DataFrame: the prepared leaderboard data.
    """
    frame = pd.read_csv('results_v2.csv')

    # Averaged metrics that are rounded for display; absent columns are skipped.
    rounded_metrics = (
        'Avg AC',
        'Avg TSQ',
        'Avg Total Cost',
        'Avg Session Duration',
        'Avg Turns',
    )
    present_metrics = [name for name in rounded_metrics if name in frame.columns]
    for metric in present_metrics:
        as_numbers = pd.to_numeric(frame[metric], errors='coerce')
        frame[metric] = as_numbers.round(3)

    # Blank out the remaining NaN cells so callers can compare against ''.
    return frame.fillna('')
def generate_html_table(filtered_df, domain_filter):
    """Render leaderboard rows as a styled HTML table.

    Args:
        filtered_df: DataFrame whose rows are already filtered and sorted.
            Rank is the 1-based row position. Each row must provide
            'Model', 'Model Type' and 'Vendor'; 'Output Type' falls back to
            'Normal' when the column is absent.
        domain_filter: "All" to show the overall averages, otherwise a domain
            name whose "<domain> AC", "<domain> TSQ", "<domain> Cost",
            "<domain> Duration" and "<domain> Turns" columns are shown.

    Returns:
        str: a ``<style>`` block followed by the table markup.

    Raises:
        ValueError: if a non-blank metric cell cannot be converted to float.
    """

    def score_cell(row, column):
        """Return a score-bar cell, or a dash cell when the value is blank/absent."""
        if column in row and row[column] != '':
            return f'<td class="score-cell">{get_score_bar(float(row[column]))}</td>'
        return '<td class="numeric-cell">-</td>'

    def metric_text(value, spec, prefix=''):
        """Format a metric with ``spec``; blank values render as a bare dash."""
        if value == '':
            # No prefix here, so a missing cost reads "-" instead of "$-".
            return '-'
        return f'{prefix}{float(value):{spec}}'

    # Fragments are collected and joined once instead of growing one string
    # with += for every row.
    fragments = ["""
    <style>
    /* Dark theme table styling */
    .v2-table-container {
    background: var(--bg-card);
    border-radius: 16px;
    overflow: hidden;
    border: 1px solid var(--border-subtle);
    margin-top: 20px;
    }
    .v2-styled-table {
    width: 100%;
    border-collapse: collapse;
    font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif;
    background: var(--bg-card);
    color: var(--text-primary);
    }
    .v2-styled-table thead {
    position: sticky;
    top: 0;
    background: rgba(227, 84, 84, 0.1);
    z-index: 1;
    }
    .v2-styled-table th {
    padding: 14px 12px;
    text-align: left;
    font-weight: 600;
    color: var(--text-primary);
    border-bottom: 2px solid var(--accent-primary);
    font-size: 13px;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    }
    .v2-styled-table td {
    padding: 12px;
    border-bottom: 1px solid var(--border-subtle);
    color: var(--text-primary);
    transition: all 0.2s ease;
    }
    .v2-styled-table tbody tr {
    transition: all 0.3s ease;
    }
    .v2-styled-table tbody tr:hover {
    background: rgba(227, 84, 84, 0.15) !important;
    box-shadow: 0 0 20px rgba(227, 84, 84, 0.3), inset 0 0 20px rgba(227, 84, 84, 0.1);
    transform: scale(1.01);
    }
    .v2-styled-table tbody tr:nth-child(even) {
    background: var(--bg-secondary);
    }
    .model-name {
    font-weight: 500;
    color: var(--accent-primary);
    transition: color 0.2s ease;
    }
    /* Keep model name color consistent on hover to emphasize row highlight */
    .v2-styled-table tr:hover .model-name {
    color: var(--accent-secondary);
    }
    .numeric-cell {
    font-family: 'Geist Mono', monospace;
    font-size: 13px;
    text-align: center;
    }
    /* Score bar specific styling */
    .score-cell {
    min-width: 180px;
    }
    </style>
    <div class="v2-table-container">
    <table class="v2-styled-table">
    <thead>
    <tr>
    <th style="width: 80px;">Rank</th>
    <th>Model</th>
    <th style="width: 120px;">Type</th>
    <th style="width: 120px;">Output Type</th>
    <th>Vendor</th>
    <th style="width: 200px;" title="Action Completion (AC): Measures how well the agent accomplishes user goals and completes tasks successfully. Higher is better (0-1 scale).">
    <span class="metric-header">Avg Action Completion <span class="info-icon">β</span></span>
    </th>
    <th style="width: 200px;" title="Tool Selection Quality (TSQ): Evaluates the accuracy of selecting the right tools and using them with correct parameters. Higher is better (0-1 scale).">
    <span class="metric-header">Avg Tool Selection Quality <span class="info-icon">β</span></span>
    </th>
    <th title="Average cost per conversation session in USD, including all API calls and processing. Lower is better.">
    <span class="metric-header">Avg Cost ($) <span class="info-icon">β</span></span>
    </th>
    <th title="Average time taken to complete a full conversation session from start to finish, measured in seconds. Lower is generally better.">
    <span class="metric-header">Avg Duration (s) <span class="info-icon">β</span></span>
    </th>
    <th title="Average number of back-and-forth exchanges needed to complete a task. Lower typically indicates more efficient task completion.">
    <span class="metric-header">Avg Turns <span class="info-icon">β</span></span>
    </th>
    </tr>
    </thead>
    <tbody>
    """]

    # Column names depend only on the domain, so resolve them once.
    per_domain = domain_filter != "All"
    if per_domain:
        ac_col = f'{domain_filter} AC'
        tsq_col = f'{domain_filter} TSQ'
    else:
        ac_col, tsq_col = 'Avg AC', 'Avg TSQ'

    for rank, (_, row) in enumerate(filtered_df.iterrows(), start=1):
        if per_domain:
            cost = row.get(f'{domain_filter} Cost', '')
            duration = row.get(f'{domain_filter} Duration', '')
            turns = row.get(f'{domain_filter} Turns', '')
        else:
            # Overall averages, falling back to the alternative column names.
            cost = row.get('Avg Total Cost', row.get('Cost ($)', ''))
            duration = row.get('Avg Session Duration', row.get('Duration (s)', ''))
            turns = row.get('Avg Turns', row.get('Turns', ''))

        # NOTE(review): 'Model' and 'Vendor' are interpolated without HTML
        # escaping; confirm the CSV is trusted or escape these values.
        fragments.append(f"""
    <tr>
    <td>{get_rank_badge(rank)}</td>
    <td class="model-name">{row['Model']}</td>
    <td>{get_type_badge(row['Model Type'])}</td>
    <td>{get_output_type_badge(row.get('Output Type', 'Normal'))}</td>
    <td>{row['Vendor']}</td>
    {score_cell(row, ac_col)}
    {score_cell(row, tsq_col)}
    <td class="numeric-cell">{metric_text(cost, '.3f', prefix='$')}</td>
    <td class="numeric-cell">{metric_text(duration, '.1f')}</td>
    <td class="numeric-cell">{metric_text(turns, '.1f')}</td>
    </tr>
    """)

    fragments.append("""
    </tbody>
    </table>
    </div>
    """)
    return "".join(fragments)
def update_leaderboard_title(domain_filter):
    """Build the opening HTML of the leaderboard section for a domain.

    Args:
        domain_filter: Domain label as chosen in the UI: either a bare domain
            name ("Banking") or a name preceded by a decorative icon.

    Returns:
        str: an HTML fragment containing the section header with the title
        "Agent Leaderboard for <domain>". The outer container and the
        ``dataframe-container`` div are opened here and not closed by this
        function.
    """
    # Resolve the domain from the name at the end of the label instead of
    # from the leading icon characters. Prefix tests on the icon depend on
    # branch order and on the file's encoding: when one icon prefix is also a
    # prefix of the others, every label resolves to the first branch ("All").
    known_domains = ("All", "Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    label = domain_filter.strip()
    domain_filter_clean = next(
        (name for name in known_domains if label.endswith(name)),
        domain_filter,  # unrecognised labels are shown unchanged
    )

    return f"""
    <div class="dark-container pulse" style="margin-bottom: 24px;">
    <div class="section-header">
    <span class="section-icon" style="color: var(--accent-primary);">π</span>
    <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;">
    Agent Leaderboard for {domain_filter_clean}
    </h3>
    </div>
    <div class="dataframe-container">
    """
def filter_and_sort_data(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Filter and sort the leaderboard and return it as an HTML table.

    Args:
        domain_filter: Domain label from the UI: a bare domain name or a name
            preceded by a decorative icon. "All" keeps every row.
        model_type_filter: "Open Source" or "Proprietary" restricts rows by
            'Model Type'; any other value (e.g. "All") keeps every row.
        reasoning_filter: "Reasoning" or "Normal" restricts rows by
            'Output Type'; any other value (e.g. "All") keeps every row.
        sort_by: Display name of the sort column. It is translated through
            SORT_COLUMN_MAP and used verbatim when it has no entry there.
        sort_order: "Ascending" sorts ascending; any other value descending.

    Returns:
        str: the HTML produced by ``generate_html_table`` for the resulting
        rows and the resolved domain name.
    """
    # Every step below builds a new frame, so no defensive copy is needed.
    filtered_df = load_leaderboard_data()

    # Resolve the domain from the name at the end of the label instead of
    # from the leading icon characters. Prefix tests on the icon depend on
    # branch order and on the file's encoding: when one icon prefix is also a
    # prefix of the others, every label resolves to the first branch ("All").
    known_domains = ("All", "Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    label = domain_filter.strip()
    domain_filter_clean = next(
        (name for name in known_domains if label.endswith(name)),
        domain_filter,  # unrecognised labels pass through unchanged
    )

    # Domain filtering: keep only models that have an AC value for the domain.
    if domain_filter_clean != "All" and domain_filter_clean in known_domains:
        domain_col = f"{domain_filter_clean} AC"
        # Skip the filter when the CSV lacks the column rather than raising.
        if domain_col in filtered_df.columns:
            filtered_df = filtered_df[filtered_df[domain_col] != '']

    # Model type filtering (UI label -> value stored in 'Model Type').
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    wanted_type = model_type_values.get(model_type_filter)
    if wanted_type is not None:
        filtered_df = filtered_df[filtered_df['Model Type'] == wanted_type]

    # Reasoning filtering.
    if reasoning_filter in ("Reasoning", "Normal"):
        filtered_df = filtered_df[filtered_df['Output Type'] == reasoning_filter]

    # Map display name to actual column name using the shared mapping.
    actual_sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)

    # With a domain selected, the averaged metrics map to the domain's columns.
    if domain_filter_clean != "All":
        domain_suffixes = {
            "Avg AC": "AC",
            "Avg TSQ": "TSQ",
            "Avg Total Cost": "Cost",
            "Avg Session Duration": "Duration",
            "Avg Turns": "Turns",
        }
        suffix = domain_suffixes.get(actual_sort_column)
        if suffix is not None:
            actual_sort_column = f"{domain_filter_clean} {suffix}"

    if actual_sort_column and actual_sort_column in filtered_df.columns:

        def as_numbers(column):
            """Sort key: numeric view of a column, blanks become NaN."""
            return pd.to_numeric(column, errors='coerce')

        # The loader replaces NaN with '', so a metric column can mix floats
        # and empty strings; sorting that directly raises TypeError. Sort by
        # the numeric view instead so blanks land last. Columns with no
        # numeric values (e.g. names) keep the plain sort.
        has_numbers = as_numbers(filtered_df[actual_sort_column]).notna().any()
        filtered_df = filtered_df.sort_values(
            by=actual_sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
            key=as_numbers if has_numbers else None,
        )

    # Generate HTML table
    return generate_html_table(filtered_df, domain_filter_clean)
| # Load initial data | |
| initial_table = filter_and_sort_data("π All", "All", "All", "Avg AC", "Descending") | |
| initial_df = load_leaderboard_data() # Load raw data for model selector | |
| # Load custom CSS and responsive styles | |
| custom_css = get_leaderboard_css() + get_responsive_styles() + """ | |
| <style> | |
| /* Page-specific styles for leaderboard v2 */ | |
| /* Metric header styles with info icons */ | |
| .metric-header { | |
| cursor: help; | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 6px; | |
| } | |
| .info-icon { | |
| color: var(--accent-secondary); | |
| font-size: 1em; | |
| opacity: 0.8; | |
| transition: opacity 0.2s ease; | |
| font-weight: normal; | |
| } | |
| .metric-header:hover .info-icon { | |
| opacity: 1; | |
| } | |
| /* Native tooltip styling */ | |
| .v2-styled-table th[title] { | |
| cursor: help; | |
| } | |
| /* Custom tooltip using CSS only */ | |
| [data-tooltip] { | |
| position: relative; | |
| cursor: help; | |
| } | |
| [data-tooltip]::before { | |
| content: attr(data-tooltip); | |
| position: absolute; | |
| bottom: 100%; | |
| left: 50%; | |
| transform: translateX(-50%); | |
| background: rgba(26, 26, 46, 0.95); | |
| color: #f5f6f7; | |
| padding: 8px 12px; | |
| border-radius: 6px; | |
| font-size: 12px; | |
| white-space: nowrap; | |
| max-width: 300px; | |
| z-index: 10000; | |
| opacity: 0; | |
| pointer-events: none; | |
| transition: opacity 0.3s; | |
| margin-bottom: 5px; | |
| border: 1px solid rgba(16, 152, 247, 0.3); | |
| box-shadow: 0 4px 12px rgba(0, 0, 0, 0.8); | |
| } | |
| [data-tooltip]:hover::before { | |
| opacity: 1; | |
| } | |
| /* Dark theme table styling */ | |
| .v2-table-container { | |
| background: var(--bg-card); | |
| border-radius: 16px; | |
| overflow: visible; /* Changed from hidden to visible for tooltips */ | |
| border: 1px solid var(--border-subtle); | |
| margin-top: 20px; | |
| position: relative; | |
| } | |
| .v2-styled-table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif; | |
| background: var(--bg-card); | |
| color: var(--text-primary); | |
| } | |
| .v2-styled-table thead { | |
| position: sticky; | |
| top: 0; | |
| background: rgba(227, 84, 84, 0.1); | |
| z-index: 1; | |
| } | |
| .v2-styled-table th { | |
| padding: 14px 12px; | |
| text-align: left; | |
| font-weight: 600; | |
| color: var(--text-primary); | |
| border-bottom: 2px solid var(--accent-primary); | |
| font-size: 14px; | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| position: relative; /* Added for tooltip positioning */ | |
| } | |
| .v2-styled-table td { | |
| padding: 12px; | |
| border-bottom: 1px solid var(--border-subtle); | |
| color: var(--text-primary); | |
| font-size: 14px; | |
| transition: all 0.2s ease; | |
| } | |
| .v2-styled-table tbody tr { | |
| transition: all 0.3s ease; | |
| } | |
| .v2-styled-table tbody tr:hover { | |
| background: rgba(227, 84, 84, 0.15) !important; | |
| box-shadow: 0 0 20px rgba(227, 84, 84, 0.3), inset 0 0 20px rgba(227, 84, 84, 0.1); | |
| transform: scale(1.01); | |
| } | |
| .v2-styled-table tbody tr:nth-child(even) { | |
| background: var(--bg-secondary); | |
| } | |
| .model-name { | |
| font-weight: 500; | |
| color: var(--accent-primary); | |
| font-size: 14px; | |
| transition: color 0.2s ease; | |
| } | |
| .v2-styled-table tr:hover .model-name { | |
| color: var(--accent-secondary); | |
| } | |
| .numeric-cell { | |
| font-family: 'Geist Mono', monospace; | |
| font-size: 14px; | |
| text-align: center; | |
| } | |
| .score-cell { | |
| min-width: 180px; | |
| } | |
| </style> | |
| <script> | |
| // Function to update radio button styling | |
| function updateRadioStyling() { | |
| // Remove selected class from all labels first | |
| document.querySelectorAll('.selected').forEach(function(label) { | |
| label.classList.remove('selected'); | |
| }); | |
| // Apply selected class to checked radio buttons | |
| document.querySelectorAll('input[type="radio"]:checked').forEach(function(input) { | |
| var label = input.closest('label'); | |
| if (label) { | |
| label.classList.add('selected'); | |
| // For domain radio buttons, apply special styling | |
| if (label.closest('.domain-radio')) { | |
| label.style.background = 'linear-gradient(145deg, rgba(227, 84, 84, 0.2), rgba(227, 84, 84, 0.1))'; | |
| label.style.borderColor = 'var(--accent-primary)'; | |
| label.style.transform = 'scale(1.05)'; | |
| label.style.fontWeight = '600'; | |
| } | |
| } | |
| }); | |
| } | |
| // Wait for Gradio to initialize | |
| function initializeRadioStyles() { | |
| updateRadioStyling(); | |
| // Create observer to watch for changes | |
| var observer = new MutationObserver(function(mutations) { | |
| mutations.forEach(function(mutation) { | |
| if (mutation.type === 'attributes' && mutation.attributeName === 'checked') { | |
| updateRadioStyling(); | |
| } | |
| }); | |
| }); | |
| // Observe all radio inputs | |
| document.querySelectorAll('input[type="radio"]').forEach(function(radio) { | |
| observer.observe(radio, { attributes: true }); | |
| }); | |
| } | |
| // Try multiple initialization strategies | |
| document.addEventListener('DOMContentLoaded', function() { | |
| setTimeout(initializeRadioStyles, 100); | |
| setTimeout(initializeRadioStyles, 500); | |
| setTimeout(initializeRadioStyles, 1000); | |
| }); | |
| // Also check when window loads | |
| window.addEventListener('load', function() { | |
| setTimeout(initializeRadioStyles, 100); | |
| }); | |
| // Listen for Gradio's custom events | |
| document.addEventListener('gradio:loaded', initializeRadioStyles); | |
| </script> | |
| """ | |
| gr.HTML(custom_css) | |
| # Header button above title | |
| gr.HTML(""" | |
| <style> | |
| /* Enhanced button styling with better gradio compatibility */ | |
| .custom-button-container { | |
| text-align: center; | |
| padding: 20px 0 10px 0; | |
| margin-bottom: 10px; | |
| } | |
| .header-action-button { | |
| display: inline-block !important; | |
| padding: 14px 28px !important; | |
| background: linear-gradient(135deg, #E35454 0%, #C84545 100%) !important; | |
| color: #FFFFFF !important; | |
| text-decoration: none !important; | |
| border-radius: 16px !important; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
| font-weight: 700 !important; | |
| font-size: 1.1rem !important; | |
| transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; | |
| border: none !important; | |
| cursor: pointer !important; | |
| box-shadow: 0 8px 24px rgba(227, 84, 84, 0.4), 0 4px 12px rgba(0, 0, 0, 0.3) !important; | |
| position: relative !important; | |
| overflow: hidden !important; | |
| text-shadow: 0 1px 2px rgba(0, 0, 0, 0.3) !important; | |
| } | |
| .header-action-button::before { | |
| content: ''; | |
| position: absolute; | |
| top: 0; | |
| left: -100%; | |
| width: 100%; | |
| height: 100%; | |
| background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent); | |
| transition: left 0.6s; | |
| } | |
| .header-action-button:hover::before { | |
| left: 100%; | |
| } | |
| .header-action-button:hover { | |
| transform: translateY(-3px) !important; | |
| box-shadow: 0 12px 32px rgba(227, 84, 84, 0.5), 0 8px 16px rgba(0, 0, 0, 0.4) !important; | |
| background: linear-gradient(135deg, #F46464 0%, #D84F4F 100%) !important; | |
| color: #FFFFFF !important; | |
| text-decoration: none !important; | |
| } | |
| .header-action-button:active { | |
| transform: translateY(-1px) !important; | |
| } | |
| .action-button-icon { | |
| font-size: 1.2rem !important; | |
| margin-right: 8px !important; | |
| filter: drop-shadow(0 0 8px rgba(255, 255, 255, 0.3)); | |
| } | |
| /* Navigation buttons styling */ | |
| .nav-buttons-container { | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| gap: 16px; | |
| flex-wrap: wrap; | |
| margin: 24px 0; | |
| padding: 0 20px; | |
| } | |
| .nav-link-button { | |
| display: inline-flex !important; | |
| align-items: center !important; | |
| gap: 8px !important; | |
| padding: 12px 20px !important; | |
| background: rgba(1, 9, 26, 0.8) !important; | |
| color: #F5F6F7 !important; | |
| text-decoration: none !important; | |
| border-radius: 12px !important; | |
| font-family: 'Geist', -apple-system, BlinkMacSystemFont, sans-serif !important; | |
| font-weight: 600 !important; | |
| font-size: 0.95rem !important; | |
| transition: all 0.3s ease !important; | |
| border: 2px solid rgba(245, 246, 247, 0.15) !important; | |
| backdrop-filter: blur(10px) !important; | |
| -webkit-backdrop-filter: blur(10px) !important; | |
| position: relative !important; | |
| overflow: hidden !important; | |
| box-shadow: 0 4px 16px rgba(0, 0, 0, 0.3) !important; | |
| } | |
| .nav-link-button::before { | |
| content: ''; | |
| position: absolute; | |
| top: 0; | |
| left: 0; | |
| right: 0; | |
| bottom: 0; | |
| background: linear-gradient(135deg, rgba(227, 84, 84, 0.1) 0%, rgba(16, 152, 247, 0.1) 100%); | |
| opacity: 0; | |
| transition: opacity 0.3s ease; | |
| } | |
| .nav-link-button:hover::before { | |
| opacity: 1; | |
| } | |
| .nav-link-button:hover { | |
| transform: translateY(-3px) scale(1.02) !important; | |
| border-color: #E35454 !important; | |
| box-shadow: 0 8px 24px rgba(227, 84, 84, 0.3), 0 4px 12px rgba(0, 0, 0, 0.4) !important; | |
| text-decoration: none !important; | |
| color: #FFFFFF !important; | |
| } | |
| .nav-link-button.primary-nav { | |
| background: linear-gradient(135deg, #1098F7 0%, #0A6BC4 100%) !important; | |
| border-color: #1098F7 !important; | |
| color: #FFFFFF !important; | |
| font-weight: 700 !important; | |
| } | |
| .nav-link-button.primary-nav:hover { | |
| background: linear-gradient(135deg, #2AA8FF 0%, #0550A0 100%) !important; | |
| border-color: #2AA8FF !important; | |
| box-shadow: 0 8px 24px rgba(16, 152, 247, 0.4), 0 4px 12px rgba(0, 0, 0, 0.4) !important; | |
| color: #FFFFFF !important; | |
| } | |
| .nav-button-icon { | |
| font-size: 1.1rem !important; | |
| filter: drop-shadow(0 0 6px currentColor); | |
| } | |
| /* Responsive design */ | |
| @media (max-width: 768px) { | |
| .nav-buttons-container { | |
| gap: 12px; | |
| padding: 0 10px; | |
| } | |
| .nav-link-button { | |
| font-size: 0.85rem !important; | |
| padding: 10px 16px !important; | |
| } | |
| .header-action-button { | |
| font-size: 1rem !important; | |
| padding: 12px 24px !important; | |
| } | |
| } | |
| @media (max-width: 480px) { | |
| .nav-buttons-container { | |
| flex-direction: column; | |
| gap: 8px; | |
| } | |
| .nav-link-button { | |
| width: 200px; | |
| justify-content: center; | |
| } | |
| } | |
| </style> | |
| <div class="custom-button-container"> | |
| <a href="https://app.galileo.ai/sign-up?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="header-action-button"> | |
| <span class="action-button-icon">π</span>Evaluate your GenAI for free | |
| </a> | |
| </div> | |
| """) | |
| gr.HTML(""" | |
| <div style="text-align: center; padding: 20px 0;"> | |
| <h1 style="font-size: 3rem; margin-bottom: 12px; color: var(--text-primary); | |
| text-shadow: 0 0 20px rgba(227, 84, 84, 0.3); font-family: 'Geist', sans-serif; font-weight: 800;"> | |
| π Galileo Agent Leaderboard v2 | |
| </h1> | |
| <p style="color: var(--text-secondary); font-size: 1.2rem; margin-top: 0; font-family: 'Geist', sans-serif;"> | |
| Comprehensive performance metrics for LLM agents across business domains | |
| </p> | |
| </div> | |
| """) | |
| # Links section below title | |
| gr.HTML(""" | |
| <div class="nav-buttons-container"> | |
| <a href="http://galileo.ai/blog/agent-leaderboard-v2" target="_blank" class="nav-link-button"> | |
| <span class="nav-button-icon">π</span> | |
| Blog | |
| </a> | |
| <a href="https://galileo.ai/mastering-agents-ebook?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="nav-link-button"> | |
| <span class="nav-button-icon">π</span> | |
| eBook | |
| </a> | |
| <a href="https://github.com/rungalileo/agent-leaderboard" target="_blank" class="nav-link-button"> | |
| <span class="nav-button-icon">π</span> | |
| GitHub | |
| </a> | |
| <a href="https://huggingface.co/datasets/galileo-ai/agent-leaderboard-v2" target="_blank" class="nav-link-button"> | |
| <span class="nav-button-icon">π€</span> | |
| Dataset | |
| </a> | |
| <a href="https://huggingface.co/spaces/galileo-ai/agent-leaderboard/discussions/new" target="_blank" class="nav-link-button"> | |
| <span class="nav-button-icon">β</span> | |
| Add Model | |
| </a> | |
| </div> | |
| """) | |
| # Metrics overview cards with insights | |
| gr.HTML(""" | |
| <div style="margin-bottom: 40px;"> | |
| <!-- Ultra-modern metric cards with advanced styling --> | |
| <style> | |
| .insight-card { | |
| background: linear-gradient(145deg, rgba(245, 246, 247, 0.03) 0%, rgba(227, 84, 84, 0.08) 100%); | |
| border-radius: 16px; | |
| padding: 20px; | |
| position: relative; | |
| border: 1px solid var(--border-subtle); | |
| transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1); | |
| overflow: hidden; | |
| backdrop-filter: blur(20px); | |
| -webkit-backdrop-filter: blur(20px); | |
| } | |
| .insight-card::before { | |
| content: ''; | |
| position: absolute; | |
| inset: 0; | |
| border-radius: 24px; | |
| padding: 1px; | |
| background: linear-gradient(145deg, var(--border-subtle), var(--border-default)); | |
| -webkit-mask: linear-gradient(#fff 0 0) content-box, linear-gradient(#fff 0 0); | |
| -webkit-mask-composite: source-out; | |
| mask-composite: subtract; | |
| pointer-events: none; | |
| } | |
| .insight-card::after { | |
| content: ''; | |
| position: absolute; | |
| top: -100%; | |
| left: -100%; | |
| width: 300%; | |
| height: 300%; | |
| background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
| opacity: 0; | |
| transition: opacity 0.6s ease, transform 0.6s ease; | |
| pointer-events: none; | |
| } | |
| .insight-card:hover::after { | |
| opacity: 0.15; | |
| transform: translate(50%, 50%); | |
| } | |
| .insight-card:hover { | |
| transform: translateY(-8px); | |
| border-color: var(--accent-primary); | |
| box-shadow: | |
| 0 24px 48px rgba(227, 84, 84, 0.2), | |
| 0 12px 24px rgba(0, 0, 0, 0.3), | |
| inset 0 1px 0 rgba(255, 255, 255, 0.1); | |
| } | |
| .insight-card.secondary-accent:hover { | |
| border-color: var(--accent-primary); | |
| } | |
| .insight-card.secondary-accent::after { | |
| background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
| } | |
| .insight-card.tertiary-accent:hover { | |
| border-color: var(--accent-primary); | |
| } | |
| .insight-card.tertiary-accent::after { | |
| background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
| } | |
| .card-header { | |
| display: flex; | |
| align-items: center; | |
| gap: 8px; | |
| margin-bottom: 12px; | |
| } | |
| .card-icon { | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| font-size: 2rem; | |
| margin-right: 8px; | |
| } | |
| .card-title { | |
| flex: 1; | |
| } | |
| .card-label { | |
| font-family: 'Geist Mono', monospace; | |
| font-size: 0.7rem; | |
| letter-spacing: 0.05em; | |
| text-transform: uppercase; | |
| color: var(--text-secondary); | |
| margin-bottom: 2px; | |
| } | |
| .card-value { | |
| font-family: 'Geist', sans-serif; | |
| font-size: 1.1rem; | |
| font-weight: 700; | |
| color: var(--text-primary); | |
| line-height: 1.1; | |
| } | |
| .insight-list { | |
| list-style: none; | |
| padding: 0; | |
| margin: 0; | |
| } | |
| .insight-list li { | |
| margin-bottom: 8px; | |
| } | |
| .insight-item { | |
| display: flex; | |
| align-items: center; | |
| gap: 8px; | |
| padding: 8px 10px; | |
| background: rgba(245, 246, 247, 0.03); | |
| border-radius: 8px; | |
| border: 1px solid var(--border-subtle); | |
| transition: all 0.3s ease; | |
| } | |
| .insight-item:hover { | |
| background: rgba(227, 84, 84, 0.1); | |
| border-color: var(--accent-primary); | |
| transform: translateX(4px); | |
| } | |
| .insight-icon { | |
| font-size: 1rem; | |
| flex-shrink: 0; | |
| } | |
| .insight-text { | |
| flex: 1; | |
| font-size: 0.85rem; | |
| line-height: 1.3; | |
| color: var(--text-secondary); | |
| } | |
| .highlight { | |
| color: var(--text-primary); | |
| font-weight: 600; | |
| } | |
| .badge-row { | |
| display: flex; | |
| gap: 6px; | |
| margin-top: 10px; | |
| flex-wrap: wrap; | |
| } | |
| .badge { | |
| padding: 4px 10px; | |
| background: rgba(245, 246, 247, 0.05); | |
| border: 1px solid var(--border-subtle); | |
| border-radius: 16px; | |
| font-size: 0.75rem; | |
| color: var(--text-secondary); | |
| transition: all 0.2s ease; | |
| display: flex; | |
| align-items: center; | |
| gap: 4px; | |
| } | |
| .badge:hover { | |
| background: rgba(227, 84, 84, 0.15); | |
| border-color: var(--accent-primary); | |
| color: var(--text-primary); | |
| transform: scale(1.05); | |
| } | |
| .badge-icon { | |
| font-size: 0.85rem; | |
| } | |
| @keyframes float { | |
| 0%, 100% { transform: translateY(0); } | |
| 50% { transform: translateY(-5px); } | |
| } | |
| .floating-icon { | |
| animation: float 3s ease-in-out infinite; | |
| } | |
| /* Tertiary color for special elements */ | |
| .tertiary-color { | |
| color: var(--accent-tertiary); | |
| } | |
| </style> | |
| <!-- First row: Five key insight cards --> | |
| <div class="insight-card-grid"> | |
| <div class="insight-card"> | |
| <div class="card-header"> | |
| <div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
| π― | |
| </div> | |
| </div> | |
| <div class="card-value">Task Completion</div> | |
| <div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
| Compare models based on their ability to complete real-world business tasks accurately and efficiently | |
| </div> | |
| </div> | |
| <div class="insight-card"> | |
| <div class="card-header"> | |
| <div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
| π‘ | |
| </div> | |
| </div> | |
| <div class="card-value">Tool Selection</div> | |
| <div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
| Analyze how precisely models choose the right tools for each task and make optimal decisions | |
| </div> | |
| </div> | |
| <div class="insight-card"> | |
| <div class="card-header"> | |
| <div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
| π° | |
| </div> | |
| </div> | |
| <div class="card-value">Cost Efficiency</div> | |
| <div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
| Find models that deliver the best performance per dollar spent and optimize your ROI | |
| </div> | |
| </div> | |
| <div class="insight-card"> | |
| <div class="card-header"> | |
| <div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
| ποΈ | |
| </div> | |
| </div> | |
| <div class="card-value">Domain Coverage</div> | |
| <div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
| Banking, Healthcare, Insurance, Investment, and Telecom industries analyzed for specialized performance | |
| </div> | |
| </div> | |
| <div class="insight-card"> | |
| <div class="card-header"> | |
| <div class="card-icon floating-icon" style="color: var(--accent-primary);"> | |
| π | |
| </div> | |
| </div> | |
| <div class="card-value">Speed vs Accuracy</div> | |
| <div class="insight-text" style="margin-top: 16px; color: var(--text-secondary); font-size: 0.9rem; line-height: 1.5;"> | |
| Understand the trade-offs between response time and accuracy to find the right balance | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Second row: Key features showcase --> | |
| <div class="metric-card-grid" style="margin-top: 16px;"> | |
| <div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);"> | |
| <div class="card-value">Model Capabilities</div> | |
| <div class="badge-row" style="margin-top: 16px;"> | |
| <div class="badge"> | |
| <span class="badge-icon">π</span> | |
| <span>Open Source</span> | |
| </div> | |
| <div class="badge"> | |
| <span class="badge-icon">π</span> | |
| <span>Proprietary</span> | |
| </div> | |
| <div class="badge"> | |
| <span class="badge-icon">π§ </span> | |
| <span>Reasoning</span> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);"> | |
| <div class="card-value">Interactive Visualizations</div> | |
| <div class="badge-row" style="margin-top: 16px;"> | |
| <div class="badge"> | |
| <span class="badge-icon">πΈοΈ</span> | |
| <span>Radar Charts</span> | |
| </div> | |
| <div class="badge"> | |
| <span class="badge-icon">π</span> | |
| <span>Heatmaps</span> | |
| </div> | |
| <div class="badge"> | |
| <span class="badge-icon">π</span> | |
| <span>Scatter Plots</span> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="insight-card" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%);"> | |
| <div class="card-value">Real-World Performance</div> | |
| <div class="badge-row" style="margin-top: 16px;"> | |
| <div class="badge"> | |
| <span class="badge-icon">πΌ</span> | |
| <span>Business Tasks</span> | |
| </div> | |
| <div class="badge"> | |
| <span class="badge-icon">π</span> | |
| <span>Multi-Turn</span> | |
| </div> | |
| <div class="badge"> | |
| <span class="badge-icon">π</span> | |
| <span>Benchmarks</span> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| """) | |
| # Domain filter section with enhanced styling | |
| gr.HTML(""" | |
| <style> | |
| /* Enhanced domain selector styling */ | |
| .domain-selector-container { | |
| background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%); | |
| border-radius: 20px; | |
| padding: 32px; | |
| margin-bottom: 32px; | |
| border: 1px solid var(--border-subtle); | |
| position: relative; | |
| overflow: hidden; | |
| box-shadow: | |
| 0 8px 32px rgba(0, 0, 0, 0.3), | |
| inset 0 1px 0 rgba(255, 255, 255, 0.05); | |
| } | |
| .domain-selector-container::before { | |
| content: ''; | |
| position: absolute; | |
| top: -50%; | |
| left: -50%; | |
| width: 200%; | |
| height: 200%; | |
| background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
| opacity: 0.1; | |
| animation: pulse 4s ease-in-out infinite; | |
| } | |
| .domain-header { | |
| text-align: center; | |
| margin-bottom: 28px; | |
| position: relative; | |
| z-index: 1; | |
| } | |
| .domain-title { | |
| font-size: 2rem; | |
| font-weight: 800; | |
| background: linear-gradient(90deg, var(--accent-primary), var(--accent-secondary)); | |
| -webkit-background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| margin-bottom: 8px; | |
| text-shadow: 0 0 30px var(--glow-primary); | |
| } | |
| .domain-subtitle { | |
| color: var(--text-secondary); | |
| font-size: 1.2rem; | |
| font-family: 'Geist', sans-serif; | |
| } | |
| /* Custom radio button styling */ | |
| .domain-radio { | |
| display: flex !important; | |
| gap: 12px !important; | |
| flex-wrap: wrap !important; | |
| justify-content: center !important; | |
| position: relative; | |
| z-index: 1; | |
| } | |
| /* Gradio radio button wrapper */ | |
| .domain-radio .wrap { | |
| display: flex !important; | |
| gap: 12px !important; | |
| flex-wrap: wrap !important; | |
| justify-content: center !important; | |
| width: 100% !important; | |
| } | |
| .domain-radio label, | |
| .domain-radio .wrap > label { | |
| flex: 1 !important; | |
| min-width: 160px !important; | |
| max-width: 200px !important; | |
| padding: 16px 24px !important; | |
| background: var(--bg-card) !important; | |
| border: 2px solid var(--border-default) !important; | |
| border-radius: 16px !important; | |
| cursor: pointer !important; | |
| transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; | |
| text-align: center !important; | |
| position: relative !important; | |
| overflow: hidden !important; | |
| } | |
| .domain-radio label::before { | |
| content: ''; | |
| position: absolute; | |
| top: 0; | |
| left: 0; | |
| right: 0; | |
| bottom: 0; | |
| background: linear-gradient(145deg, transparent, var(--glow-primary)); | |
| opacity: 0; | |
| transition: opacity 0.3s ease; | |
| pointer-events: none; | |
| } | |
| .domain-radio label:hover { | |
| transform: translateY(-2px) !important; | |
| border-color: var(--accent-primary) !important; | |
| box-shadow: | |
| 0 8px 24px rgba(227, 84, 84, 0.3), | |
| inset 0 0 20px rgba(227, 84, 84, 0.1) !important; | |
| } | |
| .domain-radio label:hover::before { | |
| opacity: 0.1; | |
| } | |
| .domain-radio input[type="radio"] { | |
| display: none !important; | |
| } | |
| .domain-radio input[type="radio"]:checked + label, | |
| .domain-radio .wrap > label:has(input[type="radio"]:checked), | |
| .domain-radio label.selected { | |
| background: transparent !important; | |
| border-color: var(--accent-primary) !important; | |
| color: var(--text-primary) !important; | |
| font-weight: 600 !important; | |
| transform: scale(1.05) !important; | |
| box-shadow: | |
| 0 12px 32px rgba(227, 84, 84, 0.4), | |
| 0 0 60px rgba(227, 84, 84, 0.2) !important; | |
| } | |
| .domain-radio input[type="radio"]:checked + label::before { | |
| opacity: 0.2; | |
| } | |
| /* Domain icons */ | |
| .domain-icon { | |
| font-size: 1.5rem; | |
| margin-bottom: 4px; | |
| display: block; | |
| filter: drop-shadow(0 0 10px currentColor); | |
| } | |
| .domain-name { | |
| font-size: 0.95rem; | |
| font-weight: 500; | |
| margin-top: 4px; | |
| } | |
| /* Badge for domain counts */ | |
| .domain-count { | |
| position: absolute; | |
| top: 8px; | |
| right: 8px; | |
| background: var(--accent-primary); | |
| color: white; | |
| font-size: 0.75rem; | |
| padding: 2px 8px; | |
| border-radius: 12px; | |
| font-weight: 600; | |
| opacity: 0.8; | |
| } | |
| /* Filter radio buttons styling - smaller for better fit */ | |
| .filter-radio { | |
| max-width: 100% !important; | |
| } | |
| .filter-radio .gr-row { | |
| gap: 8px !important; | |
| } | |
| .filter-radio .gr-column { | |
| min-width: 0 !important; | |
| flex: 1 !important; | |
| } | |
| .filter-radio .gr-form { | |
| min-width: 0 !important; | |
| } | |
| .filter-radio .gr-radio-group { | |
| gap: 4px !important; | |
| } | |
| .filter-radio .domain-radio { | |
| display: flex !important; | |
| gap: 4px !important; | |
| flex-wrap: nowrap !important; | |
| justify-content: center !important; | |
| } | |
| .filter-radio .domain-radio label { | |
| min-width: auto !important; | |
| max-width: 120px !important; | |
| padding: 8px 12px !important; | |
| font-size: 0.8rem !important; | |
| white-space: nowrap !important; | |
| overflow: hidden !important; | |
| text-overflow: ellipsis !important; | |
| } | |
| /* Additional targeting for the specific filter components */ | |
| .filter-radio .gr-box { | |
| padding: 8px !important; | |
| } | |
| .filter-radio .gr-radio { | |
| gap: 4px !important; | |
| } | |
| .filter-radio .gr-input-label { | |
| font-size: 0.85rem !important; | |
| margin-bottom: 4px !important; | |
| } | |
| /* Force compact layout for the filters */ | |
| @media (max-width: 1400px) { | |
| .filter-radio .domain-radio label { | |
| padding: 6px 10px !important; | |
| font-size: 0.75rem !important; | |
| } | |
| } | |
| /* Compact filter row styling */ | |
| .compact-filter-row { | |
| margin-bottom: 20px !important; | |
| } | |
| .compact-filter-row .gr-column { | |
| padding: 0 8px !important; | |
| } | |
| .compact-filter-row .gr-box { | |
| padding: 0 !important; | |
| } | |
| /* Compact radio button styling */ | |
| .compact-radio { | |
| width: 100% !important; | |
| } | |
| .compact-radio > label { | |
| font-size: 0.85rem !important; | |
| margin-bottom: 8px !important; | |
| font-weight: 600 !important; | |
| color: var(--text-primary) !important; | |
| display: block !important; | |
| } | |
| .compact-radio .wrap { | |
| display: flex !important; | |
| flex-wrap: nowrap !important; | |
| gap: 4px !important; | |
| justify-content: center !important; | |
| } | |
| .compact-radio .wrap > label { | |
| display: inline-flex !important; | |
| align-items: center !important; | |
| justify-content: center !important; | |
| padding: 6px 10px !important; | |
| margin: 0 !important; | |
| background: var(--bg-card) !important; | |
| border: 1px solid var(--border-default) !important; | |
| border-radius: 8px !important; | |
| cursor: pointer !important; | |
| transition: all 0.2s ease !important; | |
| font-size: 0.75rem !important; | |
| white-space: nowrap !important; | |
| flex: 1 !important; | |
| min-width: 0 !important; | |
| overflow: hidden !important; | |
| text-overflow: ellipsis !important; | |
| } | |
| .compact-radio .wrap > label:has(input[type="radio"]:checked) { | |
| background: transparent !important; | |
| border-color: var(--accent-primary) !important; | |
| color: var(--text-primary) !important; | |
| font-weight: 600 !important; | |
| } | |
| .compact-radio .wrap > label:hover { | |
| background: rgba(227, 84, 84, 0.1) !important; | |
| border-color: var(--accent-primary) !important; | |
| transform: scale(1.02) !important; | |
| } | |
| .compact-radio input[type="radio"] { | |
| display: none !important; | |
| } | |
| /* Target Gradio's data attributes for selected state */ | |
| .compact-radio label[data-selected="true"], | |
| .compact-radio label[aria-checked="true"], | |
| .domain-radio label[data-selected="true"], | |
| .domain-radio label[aria-checked="true"] { | |
| background: transparent !important; | |
| border-color: var(--accent-primary) !important; | |
| color: var(--text-primary) !important; | |
| font-weight: 600 !important; | |
| } | |
| /* Sort by radio buttons */ | |
| .sort-by-radio .domain-radio { | |
| display: flex !important; | |
| gap: 10px !important; | |
| flex-wrap: wrap !important; | |
| justify-content: flex-start !important; | |
| } | |
| .sort-by-radio .domain-radio .wrap { | |
| display: flex !important; | |
| gap: 10px !important; | |
| flex-wrap: wrap !important; | |
| justify-content: flex-start !important; | |
| width: 100% !important; | |
| } | |
| .sort-by-radio .domain-radio label, | |
| .sort-by-radio .domain-radio .wrap > label { | |
| min-width: 180px !important; | |
| max-width: 220px !important; | |
| padding: 12px 20px !important; | |
| font-size: 0.95rem !important; | |
| } | |
| </style> | |
| <div class="domain-selector-container"> | |
| <div class="domain-header"> | |
| <h2 class="domain-title">ποΈ Select Business Domain</h2> | |
| <p class="domain-subtitle">Choose a domain to see specialized agent performance</p> | |
| </div> | |
| """) | |
# Domain selector. Each entry is (filter key, icon, display name); the radio
# labels are derived from this list so the icons and names are written once.
# NOTE(review): the icon glyphs look mis-encoded in this copy of the file;
# they are reproduced unchanged — confirm against the original source.
domain_choices = [
    ("All", "π", "All Domains"),
    ("Banking", "π¦", "Banking"),
    ("Healthcare", "π₯", "Healthcare"),
    ("Insurance", "π‘οΈ", "Insurance"),
    ("Investment", "π°", "Investment"),
    ("Telecom", "π±", "Telecom"),
]
# Labels have the form "<icon> <key>", e.g. the first one is the "All" entry,
# which is also the default selection.
domain_labels = [f"{icon} {key}" for key, icon, _display in domain_choices]
with gr.Row():
    domain_filter = gr.Radio(
        choices=domain_labels,
        value=domain_labels[0],
        label="",
        interactive=True,
        elem_classes=["domain-radio"]
    )
| gr.HTML(""" | |
| </div> | |
| """) | |
| # Filter controls with enhanced styling | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-bottom: 24px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-secondary);">π</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Filters & Sorting | |
| </h3> | |
| </div> | |
| """) | |
| # First row: Model filters and sort order | |
| with gr.Row(elem_classes=["compact-filter-row"]): | |
| with gr.Column(scale=1): | |
| model_type_filter = gr.Radio( | |
| choices=["All", "Open Source", "Proprietary"], | |
| value="All", | |
| label="π Model Access", | |
| elem_classes=["compact-radio"] | |
| ) | |
| with gr.Column(scale=1): | |
| reasoning_filter = gr.Radio( | |
| choices=["All", "Reasoning", "Normal"], | |
| value="All", | |
| label="π§ Output Type", | |
| elem_classes=["compact-radio"] | |
| ) | |
| with gr.Column(scale=1): | |
| sort_order = gr.Radio( | |
| choices=["Descending", "Ascending"], | |
| value="Descending", | |
| label="π Sort Order", | |
| elem_classes=["compact-radio"] | |
| ) | |
| # Second row: Sort by options | |
| gr.HTML("""<div style="margin-top: 20px; margin-bottom: 10px;"> | |
| <h4 style="color: var(--text-primary); font-size: 1.1rem; font-weight: 600; margin: 0;">π Sort By</h4> | |
| </div>""") | |
| gr.HTML('<div class="sort-by-radio">') | |
| sort_by = gr.Radio( | |
| choices=["Avg Action Completion", "Avg Tool Selection Quality", "Avg Session Cost", "Avg Session Duration", "Avg Turns"], | |
| value="Avg Action Completion", | |
| label="", | |
| elem_classes=["domain-radio"] | |
| ) | |
| gr.HTML('</div>') | |
| gr.HTML("</div>") | |
| # Main leaderboard table with dynamic title | |
| leaderboard_title = gr.HTML(""" | |
| <div class="dark-container pulse" style="margin-bottom: 24px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-primary);">π</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Agent Leaderboard for All | |
| </h3> | |
| </div> | |
| <div class="dataframe-container"> | |
| """) | |
| leaderboard_table = gr.HTML(initial_table) | |
| gr.HTML(""" | |
| </div> | |
| </div>""") | |
| # Evaluate Your Agents Button | |
| gr.HTML(""" | |
| <div style="text-align: center; margin-top: 32px; margin-bottom: 32px;"> | |
| <a href="https://app.galileo.ai/sign-up?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" | |
| style="display: inline-flex; align-items: center; gap: 12px; padding: 16px 40px; | |
| background: linear-gradient(135deg, #E35454 0%, #F06B6B 100%); | |
| border: none; | |
| border-radius: 16px; | |
| color: white; | |
| text-decoration: none; | |
| font-size: 1.1rem; | |
| font-family: 'Geist', sans-serif; | |
| font-weight: 600; | |
| transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1); | |
| box-shadow: 0 8px 24px rgba(227, 84, 84, 0.35), 0 2px 8px rgba(0, 0, 0, 0.1); | |
| transform: translateY(0);"> | |
| <span style="font-size: 1.3rem;">π</span> | |
| <span>Evaluate Your Agents</span> | |
| <span style="font-size: 0.9rem;">β</span> | |
| </a> | |
| </div> | |
| <style> | |
| .dataframe-container + div a:hover { | |
| background: linear-gradient(135deg, #D94444 0%, #E05555 100%) !important; | |
| transform: translateY(-3px) !important; | |
| box-shadow: 0 12px 32px rgba(227, 84, 84, 0.45), 0 4px 12px rgba(0, 0, 0, 0.15) !important; | |
| } | |
| </style> | |
| """) | |
| # Column Info Section | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-top: 24px; margin-bottom: 32px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-secondary);">π</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Column Explanations | |
| </h3> | |
| </div> | |
| <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin-top: 20px;"> | |
| <!-- Performance Metrics --> | |
| <div class="info-box" style="background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.03) 100%);"> | |
| <h4 style="color: var(--accent-primary); margin-top: 0; margin-bottom: 16px; font-size: 1.1rem; font-family: 'Geist', sans-serif; font-weight: 600; display: flex; align-items: center; gap: 8px;"> | |
| <span style="font-size: 1.3rem;">π―</span> | |
| Performance Metrics | |
| </h4> | |
| <div style="space-y: 12px;"> | |
| <div style="margin-bottom: 12px;"> | |
| <div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
| π Action Completion | |
| </div> | |
| <div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4; margin-bottom: 6px;"> | |
| Measures how well the agent accomplishes user goals and completes tasks successfully. | |
| </div> | |
| <a href="https://v2docs.galileo.ai/concepts/metrics/agentic/action-completion" target="_blank" | |
| style="color: var(--accent-primary); text-decoration: none; font-size: 0.85rem; display: inline-flex; align-items: center; gap: 4px;"> | |
| π Learn more about Action Completion | |
| <span style="font-size: 0.7rem;">β</span> | |
| </a> | |
| </div> | |
| <div style="border-top: 1px solid var(--border-subtle); padding-top: 12px;"> | |
| <div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
| π οΈ Tool Selection Quality | |
| </div> | |
| <div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4; margin-bottom: 6px;"> | |
| Evaluates the accuracy of selecting the right tools and using them with correct parameters. | |
| </div> | |
| <a href="https://v2docs.galileo.ai/concepts/metrics/agentic/tool-selection-quality" target="_blank" | |
| style="color: var(--accent-primary); text-decoration: none; font-size: 0.85rem; display: inline-flex; align-items: center; gap: 4px;"> | |
| π Learn more about Tool Selection Quality | |
| <span style="font-size: 0.7rem;">β</span> | |
| </a> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Session-Level Metrics --> | |
| <div class="info-box" style="background: linear-gradient(145deg, rgba(16, 152, 247, 0.05) 0%, rgba(245, 246, 247, 0.03) 100%);"> | |
| <h4 style="color: var(--accent-secondary); margin-top: 0; margin-bottom: 16px; font-size: 1.1rem; font-family: 'Geist', sans-serif; font-weight: 600; display: flex; align-items: center; gap: 8px;"> | |
| <span style="font-size: 1.3rem;">π</span> | |
| Session-Level Metrics | |
| </h4> | |
| <div style="space-y: 10px;"> | |
| <div style="margin-bottom: 10px;"> | |
| <div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
| π° Avg Cost ($) | |
| </div> | |
| <div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;"> | |
| Average cost per conversation session, including all API calls and processing. | |
| </div> | |
| </div> | |
| <div style="margin-bottom: 10px; border-top: 1px solid var(--border-subtle); padding-top: 10px;"> | |
| <div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
| β‘ Avg Duration (s) | |
| </div> | |
| <div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;"> | |
| Average time taken to complete a full conversation session from start to finish. | |
| </div> | |
| </div> | |
| <div style="border-top: 1px solid var(--border-subtle); padding-top: 10px;"> | |
| <div style="font-weight: 600; color: var(--text-primary); margin-bottom: 4px; font-family: 'Geist Mono', monospace; font-size: 0.9rem;"> | |
| π¬ Avg Turns | |
| </div> | |
| <div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;"> | |
| Average number of back-and-forth exchanges needed to complete a task. | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Additional Notes --> | |
| <div style="margin-top: 24px; padding: 16px; background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle); border-radius: 12px;"> | |
| <div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;"> | |
| <span style="font-size: 1.1rem;">π‘</span> | |
| <span style="font-weight: 600; color: var(--text-primary); font-size: 0.95rem;">Default Sorting</span> | |
| </div> | |
| <div style="color: var(--text-secondary); font-size: 0.9rem; line-height: 1.4;"> | |
| The table is sorted by <strong style="color: var(--text-primary);">Action Completion</strong> in descending order by default, showing the best-performing models first. You can change the sorting using the filters above. | |
| </div> | |
| </div> | |
| </div> | |
| """) | |
| # AI Agent Reliability Prediction Section | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-bottom: 24px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-secondary);">π</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Enterprise Readiness Prediction | |
| </h3> | |
| </div> | |
| <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
| When will AI agents reach 99% reliability for enterprise deployment? | |
| </p> | |
| """) | |
| # Create initial prediction with default filters | |
| initial_prediction_chart, initial_date_99, initial_months_to_99, initial_best_ac = create_ac_prediction_chart( | |
| load_leaderboard_data(), domain_filter="All", model_type_filter="All" | |
| ) | |
| # Add dynamic insights section with visual cards | |
def generate_insight_html(date_99, months_to_99, domain_filter="All", model_type_filter="All", current_best_ac=None):
    """Generate the insight HTML (cards/badges) for the prediction section.

    Exactly one of three fragments is returned:

    * "Key Predictions" - ``date_99`` is set and ``months_to_99`` is positive.
    * "Achievement Status" - ``date_99`` is set and ``months_to_99`` is zero
      or negative.
    * "Prediction Status" - ``date_99`` is falsy or ``months_to_99`` is None.

    Args:
        date_99: Object exposing ``strftime`` (formatted with ``'%b %Y'``),
            or a falsy value when no prediction is available.
        months_to_99: Number of months until the 99% threshold, or None.
        domain_filter: Domain label. Either a bare name ("All", "Banking")
            or a radio label of the form "<icon> <name>".
        model_type_filter: Model access filter; "All" means unfiltered.
        current_best_ac: Best AC score as a fraction (it is multiplied by 100
            for display), or None when unknown.

    Returns:
        str: HTML fragment.
    """
    # NOTE(review): the icon glyphs in the templates below look mis-encoded in
    # this copy of the file; they are reproduced unchanged — confirm against
    # the original source.

    # Normalise the domain label. Labels are "<icon> <name>", so key off the
    # trailing name rather than the icon prefix: this also accepts bare names
    # and maps a plain "All" (the default argument) to "All Domains" so the
    # unfiltered view is not reported as filtered.
    known_domains = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    raw_domain = domain_filter or "All"
    stripped_domain = raw_domain.strip()
    trailing_name = stripped_domain.rsplit(" ", 1)[-1]
    if trailing_name == "All" or stripped_domain == "All Domains":
        domain_clean = "All Domains"
    elif trailing_name in known_domains:
        domain_clean = trailing_name
    else:
        domain_clean = raw_domain

    is_filtered = domain_clean != "All Domains" or model_type_filter != "All"

    # Badge next to the heading that names the active filters (empty when
    # nothing is filtered).
    filter_badge = ""
    if is_filtered:
        parts = []
        if domain_clean != "All Domains":
            parts.append(domain_clean)
        if model_type_filter != "All":
            parts.append(f"{model_type_filter} Models")
        badge_text = ' β’ '.join(parts)
        filter_badge = f"""
        <span style="
            display: inline-flex;
            align-items: center;
            gap: 4px;
            padding: 4px 10px;
            background: rgba(16, 152, 247, 0.15);
            border: 1px solid rgba(16, 152, 247, 0.3);
            border-radius: 20px;
            font-size: 0.85rem;
            font-weight: 600;
            color: var(--accent-primary);
            margin-left: 12px;
        ">
            <span style="font-size: 0.9rem;">π</span>
            {badge_text}
        </span>
        """

    # Compare months_to_99 against None explicitly: a value of 0 is a valid
    # prediction and must reach the "Enterprise Ready" branch below.
    if date_99 and months_to_99 is not None:
        if months_to_99 > 0:
            if current_best_ac is not None:
                # Scale 0-99% AC to 0-100% progress, clamped to the bar range.
                current_progress = max(0, min(100, int((current_best_ac / 0.99) * 100)))
                current_best_display = f"{current_best_ac * 100:.0f}%"
            else:
                # No score supplied: keep the fallback progress and show a
                # placeholder instead of formatting None.
                current_progress = 85
                current_best_display = "N/A"
            return f"""
            <div style="margin-bottom: 24px;">
                <div style="display: flex; align-items: center; gap: 8px; margin-bottom: 16px;">
                    <h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-weight: 700;">
                        Key Predictions
                    </h3>
                    {filter_badge}
                </div>
                <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 16px;">
                    <!-- Enterprise Ready Date Card -->
                    <div style="
                        padding: 20px;
                        background: linear-gradient(135deg, rgba(16, 152, 247, 0.1) 0%, rgba(16, 152, 247, 0.05) 100%);
                        border: 1px solid rgba(16, 152, 247, 0.2);
                        border-radius: 16px;
                        position: relative;
                        overflow: hidden;
                    ">
                        <div style="
                            position: absolute;
                            top: -20px;
                            right: -20px;
                            width: 60px;
                            height: 60px;
                            background: radial-gradient(circle, rgba(16, 152, 247, 0.2) 0%, transparent 70%);
                            border-radius: 50%;
                        "></div>
                        <div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;">
                            <span style="font-size: 1.8rem;">π </span>
                            <span style="color: var(--text-secondary); font-size: 0.9rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Target Date</span>
                        </div>
                        <div style="font-size: 1.8rem; font-weight: 800; color: var(--text-primary); margin-bottom: 4px;">
                            {date_99.strftime('%b %Y')}
                        </div>
                        <div style="color: var(--text-secondary); font-size: 0.95rem;">
                            99% AC Threshold
                        </div>
                    </div>
                    <!-- Time Remaining Card -->
                    <div style="
                        padding: 20px;
                        background: linear-gradient(135deg, rgba(227, 84, 84, 0.1) 0%, rgba(227, 84, 84, 0.05) 100%);
                        border: 1px solid rgba(227, 84, 84, 0.2);
                        border-radius: 16px;
                        position: relative;
                        overflow: hidden;
                    ">
                        <div style="
                            position: absolute;
                            top: -20px;
                            right: -20px;
                            width: 60px;
                            height: 60px;
                            background: radial-gradient(circle, rgba(227, 84, 84, 0.2) 0%, transparent 70%);
                            border-radius: 50%;
                        "></div>
                        <div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;">
                            <span style="font-size: 1.8rem;">β±οΈ</span>
                            <span style="color: var(--text-secondary); font-size: 0.9rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Timeline</span>
                        </div>
                        <div style="font-size: 1.8rem; font-weight: 800; color: var(--text-primary); margin-bottom: 4px;">
                            ~{months_to_99:.0f}
                        </div>
                        <div style="color: var(--text-secondary); font-size: 0.95rem;">
                            Months to 99% AC
                        </div>
                    </div>
                    <!-- Current Performance Card -->
                    <div style="
                        padding: 20px;
                        background: linear-gradient(135deg, rgba(40, 167, 69, 0.1) 0%, rgba(40, 167, 69, 0.05) 100%);
                        border: 1px solid rgba(40, 167, 69, 0.2);
                        border-radius: 16px;
                        position: relative;
                        overflow: hidden;
                    ">
                        <div style="
                            position: absolute;
                            top: -20px;
                            right: -20px;
                            width: 60px;
                            height: 60px;
                            background: radial-gradient(circle, rgba(40, 167, 69, 0.2) 0%, transparent 70%);
                            border-radius: 50%;
                        "></div>
                        <div style="display: flex; align-items: center; gap: 8px; margin-bottom: 8px;">
                            <span style="font-size: 1.8rem;">π</span>
                            <span style="color: var(--text-secondary); font-size: 0.9rem; font-weight: 600; text-transform: uppercase; letter-spacing: 0.5px;">Current Best</span>
                        </div>
                        <div style="font-size: 1.8rem; font-weight: 800; color: var(--text-primary); margin-bottom: 4px;">
                            {current_best_display}
                        </div>
                        <div style="color: var(--text-secondary); font-size: 0.95rem;">
                            AC Score Achieved
                        </div>
                    </div>
                </div>
                <!-- Progress Bar -->
                <div style="margin-top: 20px; padding: 16px; background: rgba(255, 255, 255, 0.02); border: 1px solid var(--border-subtle); border-radius: 12px;">
                    <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
                        <span style="color: var(--text-secondary); font-size: 0.9rem; font-weight: 600;">PROGRESS TO ENTERPRISE READY</span>
                        <span style="color: var(--accent-primary); font-size: 0.9rem; font-weight: 700;">{current_progress}%</span>
                    </div>
                    <div style="width: 100%; height: 8px; background: rgba(255, 255, 255, 0.1); border-radius: 4px; overflow: hidden;">
                        <div style="width: {current_progress}%; height: 100%; background: linear-gradient(90deg, var(--accent-primary) 0%, var(--accent-secondary) 100%); border-radius: 4px; transition: width 0.3s ease;"></div>
                    </div>
                </div>
            </div>
            """
        else:
            # Zero or negative months remaining: the threshold is already met.
            return f"""
            <div style="margin-bottom: 24px;">
                <div style="display: flex; align-items: center; gap: 8px; margin-bottom: 16px;">
                    <h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-weight: 700;">
                        Achievement Status
                    </h3>
                    {filter_badge}
                </div>
                <div style="
                    padding: 24px;
                    background: linear-gradient(135deg, rgba(40, 167, 69, 0.15) 0%, rgba(40, 167, 69, 0.05) 100%);
                    border: 2px solid rgba(40, 167, 69, 0.3);
                    border-radius: 16px;
                    text-align: center;
                ">
                    <span style="font-size: 3rem; margin-bottom: 12px; display: block;">π</span>
                    <h2 style="margin: 0 0 12px 0; color: var(--text-primary); font-size: 1.8rem; font-weight: 800;">
                        Enterprise Ready!
                    </h2>
                    <p style="color: var(--text-secondary); font-size: 1.1rem; margin: 0 0 20px 0;">
                        Models have achieved enterprise-grade reliability
                    </p>
                    <div style="display: flex; justify-content: center; gap: 12px; flex-wrap: wrap;">
                        <span style="
                            display: inline-flex;
                            align-items: center;
                            gap: 6px;
                            padding: 8px 16px;
                            background: rgba(40, 167, 69, 0.1);
                            border: 1px solid rgba(40, 167, 69, 0.3);
                            border-radius: 24px;
                            font-size: 0.95rem;
                            font-weight: 600;
                            color: #28a745;
                        ">
                            β 99% AC Achieved
                        </span>
                        <span style="
                            display: inline-flex;
                            align-items: center;
                            gap: 6px;
                            padding: 8px 16px;
                            background: rgba(16, 152, 247, 0.1);
                            border: 1px solid rgba(16, 152, 247, 0.3);
                            border-radius: 24px;
                            font-size: 0.95rem;
                            font-weight: 600;
                            color: var(--accent-primary);
                        ">
                            β‘ Ready for Scaling
                        </span>
                        <span style="
                            display: inline-flex;
                            align-items: center;
                            gap: 6px;
                            padding: 8px 16px;
                            background: rgba(227, 84, 84, 0.1);
                            border: 1px solid rgba(227, 84, 84, 0.3);
                            border-radius: 24px;
                            font-size: 0.95rem;
                            font-weight: 600;
                            color: var(--accent-secondary);
                        ">
                            π‘οΈ Focus on Guardrails
                        </span>
                    </div>
                </div>
            </div>
            """
    else:
        # No usable prediction: tailor the explanation to whether a filter
        # narrowed the data set.
        if is_filtered:
            explanation = 'More models need to be evaluated in this category to generate reliable predictions.'
        else:
            explanation = 'As more models are released and evaluated, our predictions will become more accurate.'
        return f"""
        <div style="margin-bottom: 24px;">
            <div style="display: flex; align-items: center; gap: 8px; margin-bottom: 16px;">
                <h3 style="margin: 0; color: var(--text-primary); font-size: 1.3rem; font-weight: 700;">
                    Prediction Status
                </h3>
                {filter_badge}
            </div>
            <div style="
                padding: 20px;
                background: rgba(255, 193, 7, 0.08);
                border: 1px solid rgba(255, 193, 7, 0.2);
                border-radius: 16px;
                display: flex;
                align-items: center;
                gap: 16px;
            ">
                <span style="font-size: 2.5rem;">π</span>
                <div>
                    <h4 style="margin: 0 0 8px 0; color: var(--text-primary); font-size: 1.1rem; font-weight: 700;">
                        Insufficient Data for Predictions
                    </h4>
                    <p style="color: var(--text-secondary); margin: 0; font-size: 0.95rem; line-height: 1.5;">
                        {explanation}
                    </p>
                </div>
            </div>
        </div>
        """
| # Create the insights HTML component ABOVE the chart | |
| prediction_insights = gr.HTML( | |
| generate_insight_html(initial_date_99, initial_months_to_99, "All", "All", initial_best_ac) | |
| ) | |
| # Add prediction chart - make it reactive | |
| gr.HTML('<div class="chart-container">') | |
| prediction_plot = gr.Plot( | |
| label="", | |
| value=initial_prediction_chart, | |
| elem_classes=["prediction-chart", "plot-container"] | |
| ) | |
| gr.HTML('</div>') | |
| # Add methodology note | |
| gr.HTML(""" | |
| <div style=" | |
| margin-top: 20px; | |
| margin-bottom: 24px; | |
| padding: 16px; | |
| background: rgba(16, 152, 247, 0.05); | |
| border-left: 3px solid var(--accent-primary); | |
| border-radius: 8px; | |
| "> | |
| <div style="display: flex; align-items: flex-start; gap: 12px;"> | |
| <span style="font-size: 1.2rem; color: var(--accent-primary); margin-top: 2px;">π</span> | |
| <div> | |
| <h4 style="margin: 0 0 8px 0; color: var(--text-primary); font-size: 1rem; font-weight: 600;"> | |
| Methodology Note | |
| </h4> | |
| <p style="color: var(--text-secondary); margin: 0; font-size: 0.95rem; line-height: 1.6;"> | |
| Our current prediction uses a <strong>conservative linear projection with a 50% diminishing returns factor</strong>. | |
| This simple approach assumes that future AI improvements will occur at half the current rate, accounting for | |
| increasing technical challenges as models approach higher performance levels. | |
| </p> | |
| <p style="color: var(--text-secondary); margin: 8px 0 0 0; font-size: 0.95rem; line-height: 1.6;"> | |
| <strong>Why this approach?</strong> With limited data points currently available, complex curve fitting | |
| (exponential, logistic) would lead to overfitting. As we evaluate more models and gather additional data points, | |
| we will refine our methodology to incorporate more sophisticated growth models that better capture the true | |
| trajectory of AI agent capabilities. | |
| </p> | |
| <p style="color: var(--accent-primary); margin: 8px 0 0 0; font-size: 0.9rem; font-weight: 600;"> | |
| π This projection will automatically update as new models are added to the leaderboard. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| """) | |
| gr.HTML("</div>") | |
| # Radar Chart Section | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-bottom: 24px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-primary);">πΈοΈ</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Domain Performance Analysis | |
| </h3> | |
| </div> | |
| <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;">Compare model performance across different business domains</p> | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| model_selector = gr.Dropdown( | |
| choices=initial_df['Model'].tolist()[:10], | |
| value=initial_df['Model'].tolist()[:5], | |
| multiselect=True, | |
| label="π― Select Models for Comparison", | |
| info="Choose up to 5 models to visualize", | |
| elem_classes=["dropdown"] | |
| ) | |
| # Radar chart plot - wrapped in centered container | |
| gr.HTML('<div class="chart-container">') | |
| radar_chart = gr.Plot( | |
| label="", | |
| value=create_domain_radar_chart( | |
| load_leaderboard_data(), | |
| "Avg AC", | |
| initial_df['Model'].tolist()[:5] | |
| ), | |
| elem_classes=["radar-chart", "plot-container"] | |
| ) | |
| gr.HTML('</div>') | |
| gr.HTML("</div>") | |
| # Update functions | |
def get_optimal_sort_order(sort_by_value):
    """Return the natural sort direction for a leaderboard metric.

    Args:
        sort_by_value: Display name of the metric selected in the "sort by"
            control.

    Returns:
        ``"Ascending"`` for metrics where a lower value is better (session
        cost, session duration, turns); ``"Descending"`` for every other
        value, including names that are not recognised.
    """
    # Only the "lower is better" metrics need listing: every other metric,
    # known or unknown, falls back to descending order.
    lower_is_better = ("Avg Session Cost", "Avg Session Duration", "Avg Turns")
    return "Ascending" if sort_by_value in lower_is_better else "Descending"
def update_sort_order_automatically(sort_by_value):
    """Return the sort direction to apply when the sort metric changes.

    Args:
        sort_by_value: Display name of the newly selected sort metric.

    Returns:
        The value produced by ``get_optimal_sort_order`` for that metric.
    """
    return get_optimal_sort_order(sort_by_value)
def update_table(*args):
    """Rebuild the leaderboard title and table for the current filter values.

    Args:
        *args: Filter values forwarded unchanged to ``filter_and_sort_data``.
            The first positional value is the domain filter, which is also
            used to build the title.

    Returns:
        A ``(title_html, table_html)`` tuple.
    """
    domain_filter = args[0]
    title_html = update_leaderboard_title(domain_filter)
    table_html = filter_and_sort_data(*args)
    return title_html, table_html
def update_radar_chart(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Refresh the radar model selector and chart after a filter change.

    Args:
        domain_filter: Domain choice; may carry a decorative emoji prefix.
        model_type_filter: "All", "Open Source" or "Proprietary".
        reasoning_filter: "All", "Reasoning" or "Normal".
        sort_by: Display name of the sort metric.
        sort_order: "Ascending" or anything else for descending.
        selected_models: Models currently selected in the dropdown (may be
            empty or None).

    Returns:
        A ``(dropdown_update, chart)`` tuple: the dropdown offers the top 15
        models of the filtered, sorted data and keeps whichever selected
        models are still offered (falling back to the top 5).
    """
    domain_names = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    output_type_values = {"Reasoning": "Reasoning", "Normal": "Normal"}
    # Average-metric column -> suffix of the matching per-domain column.
    domain_metric_suffix = {
        "Avg AC": "AC",
        "Avg TSQ": "TSQ",
        "Avg Total Cost": "Cost",
        "Avg Session Duration": "Duration",
        "Avg Turns": "Turns",
    }

    def _numeric_where_possible(column):
        # Missing metrics are stored as '' by load_leaderboard_data; comparing
        # '' with floats raises TypeError, so sort on a numeric view instead.
        coerced = pd.to_numeric(column, errors='coerce')
        return coerced if coerced.notna().any() else column

    df = load_leaderboard_data()
    filtered_df = df

    # Recognise the domain by its name rather than by its emoji prefix, so the
    # lookup does not depend on how the emoji bytes were encoded.
    domain_label = str(domain_filter or "")
    domain = next((name for name in domain_names if name in domain_label), None)

    if domain is not None:
        # Only keep models that have data for this domain.
        domain_scores = filtered_df[f"{domain} AC"]
        filtered_df = filtered_df[domain_scores.notna() & (domain_scores != '')]

    wanted_model_type = model_type_values.get(model_type_filter)
    if wanted_model_type is not None:
        filtered_df = filtered_df[filtered_df['Model Type'] == wanted_model_type]

    wanted_output_type = output_type_values.get(reasoning_filter)
    if wanted_output_type is not None:
        filtered_df = filtered_df[filtered_df['Output Type'] == wanted_output_type]

    # Map the display name to the real column, then switch to the per-domain
    # column when a single domain is selected.
    sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)
    if domain is not None and sort_column in domain_metric_suffix:
        sort_column = f"{domain} {domain_metric_suffix[sort_column]}"

    if sort_column and sort_column in filtered_df.columns:
        filtered_df = filtered_df.sort_values(
            by=sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
            key=_numeric_where_possible,
        )

    # Offer the top 15 models of the filtered results.
    available_models = filtered_df['Model'].tolist()[:15]

    # Keep still-valid selections; otherwise fall back to the top 5.
    valid_selected = [m for m in (selected_models or []) if m in available_models]
    if not valid_selected:
        valid_selected = available_models[:5]

    # The chart is drawn from the unfiltered data already loaded above.
    # NOTE(review): sort_by is passed as the display name, while the initial
    # chart is built with "Avg AC" - confirm create_domain_radar_chart accepts both.
    chart = create_domain_radar_chart(df, sort_by, valid_selected)
    return gr.Dropdown(choices=available_models, value=valid_selected), chart
def update_radar_only(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order, selected_models):
    """Redraw the radar chart after the model selection changes.

    Args:
        domain_filter: Domain choice; may carry a decorative emoji prefix.
        model_type_filter: "All", "Open Source" or "Proprietary".
        reasoning_filter: "All", "Reasoning" or "Normal".
        sort_by: Display name of the sort metric.
        sort_order: "Ascending" or anything else for descending.
        selected_models: Models currently selected in the dropdown (may be
            empty or None).

    Returns:
        The radar chart for the selected models that survive the filters, or
        for the top 5 filtered models when none of the selection survives.
    """
    domain_names = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    output_type_values = {"Reasoning": "Reasoning", "Normal": "Normal"}
    # Average-metric column -> suffix of the matching per-domain column.
    domain_metric_suffix = {
        "Avg AC": "AC",
        "Avg TSQ": "TSQ",
        "Avg Total Cost": "Cost",
        "Avg Session Duration": "Duration",
        "Avg Turns": "Turns",
    }

    def _numeric_where_possible(column):
        # Missing metrics are stored as '' by load_leaderboard_data; comparing
        # '' with floats raises TypeError, so sort on a numeric view instead.
        coerced = pd.to_numeric(column, errors='coerce')
        return coerced if coerced.notna().any() else column

    df = load_leaderboard_data()
    filtered_df = df

    # Recognise the domain by its name rather than by its emoji prefix, so the
    # lookup does not depend on how the emoji bytes were encoded.
    domain_label = str(domain_filter or "")
    domain = next((name for name in domain_names if name in domain_label), None)

    if domain is not None:
        domain_scores = filtered_df[f"{domain} AC"]
        filtered_df = filtered_df[domain_scores.notna() & (domain_scores != '')]

    wanted_model_type = model_type_values.get(model_type_filter)
    if wanted_model_type is not None:
        filtered_df = filtered_df[filtered_df['Model Type'] == wanted_model_type]

    wanted_output_type = output_type_values.get(reasoning_filter)
    if wanted_output_type is not None:
        filtered_df = filtered_df[filtered_df['Output Type'] == wanted_output_type]

    # Rank by the per-domain column when a domain is selected, so the top-5
    # fallback agrees with the ordering update_radar_chart uses for the
    # dropdown choices.
    sort_column = SORT_COLUMN_MAP.get(sort_by, sort_by)
    if domain is not None and sort_column in domain_metric_suffix:
        sort_column = f"{domain} {domain_metric_suffix[sort_column]}"

    if sort_column and sort_column in filtered_df.columns:
        filtered_df = filtered_df.sort_values(
            by=sort_column,
            ascending=(sort_order == "Ascending"),
            na_position='last',
            key=_numeric_where_possible,
        )

    filtered_models = filtered_df['Model'].tolist()
    known_models = set(filtered_models)

    valid_selected = [m for m in (selected_models or []) if m in known_models]
    if not valid_selected:
        valid_selected = filtered_models[:5]

    # The chart is drawn from the unfiltered data already loaded above.
    return create_domain_radar_chart(df, sort_by, valid_selected)
| # Function to update prediction chart and insights | |
def update_prediction_chart_and_insights(domain_filter, model_type_filter):
    """Rebuild the prediction chart and its insight panel for the given filters.

    Args:
        domain_filter: Current value of the domain filter control.
        model_type_filter: Current value of the model-type filter control.

    Returns:
        A ``(chart, insights_html)`` tuple.
    """
    chart, date_99, months_to_99, current_best_ac = create_ac_prediction_chart(
        load_leaderboard_data(),
        domain_filter=domain_filter,
        model_type_filter=model_type_filter,
    )

    # The current best AC is forwarded so the insight panel can be generated
    # from the same figures as the chart.
    insights_html = generate_insight_html(
        date_99, months_to_99, domain_filter, model_type_filter, current_best_ac
    )
    return chart, insights_html
# Controls whose value change must refresh the table and the radar chart.
filter_inputs = [domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order]

for input_component in filter_inputs:
    # Update table when filters change
    input_component.change(
        fn=update_table,
        inputs=filter_inputs,
        outputs=[leaderboard_title, leaderboard_table]
    )
    # Also update radar chart (and its model choices) when filters change
    input_component.change(
        fn=update_radar_chart,
        inputs=filter_inputs + [model_selector],
        outputs=[model_selector, radar_chart]
    )

# The prediction chart only depends on the domain and model-type filters,
# so it does not react to the other controls.
for prediction_trigger in (domain_filter, model_type_filter):
    prediction_trigger.change(
        fn=update_prediction_chart_and_insights,
        inputs=[domain_filter, model_type_filter],
        outputs=[prediction_plot, prediction_insights]
    )

# Update radar chart when model selection changes
model_selector.change(
    fn=update_radar_only,
    inputs=filter_inputs + [model_selector],
    outputs=[radar_chart]
)

# Automatically update sort order when sort_by changes
sort_by.change(
    fn=update_sort_order_automatically,
    inputs=[sort_by],
    outputs=[sort_order]
)
| # Performance insights section | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-top: 32px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-secondary);">π</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Key Insights | |
| </h3> | |
| </div> | |
| <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 24px; margin-top: 24px;"> | |
| <div class="info-box"> | |
| <h4 style="color: var(--accent-primary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">π Top Performers</h4> | |
| <ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;"> | |
| <li>Highest AC scores indicate best action completion</li> | |
| <li>Superior TSQ shows optimal tool selection</li> | |
| <li>Balance cost-effectiveness with performance</li> | |
| </ul> | |
| </div> | |
| <div class="info-box"> | |
| <h4 style="color: var(--accent-secondary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">π Filter Features</h4> | |
| <ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;"> | |
| <li>Domain-specific performance analysis</li> | |
| <li>Compare open source vs private models</li> | |
| <li>Reasoning vs standard model comparison</li> | |
| </ul> | |
| </div> | |
| <div class="info-box"> | |
| <h4 style="color: var(--accent-primary); margin-top: 0; font-size: 1.2rem; font-family: 'Geist', sans-serif; font-weight: 600;">π Visualization</h4> | |
| <ul style="color: var(--text-secondary); margin: 0; padding-left: 20px; line-height: 1.6; font-family: 'Geist', sans-serif;"> | |
| <li>Interactive radar charts for domain breakdown</li> | |
| <li>Compare up to 5 models simultaneously</li> | |
| <li>Hover for detailed performance metrics</li> | |
| </ul> | |
| </div> | |
| </div> | |
| </div> | |
| """) | |
| # NEW VISUALIZATIONS START HERE | |
| # 1. Cost-Performance Efficiency Scatter Plot | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-top: 32px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-primary);">π‘</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Cost-Performance Efficiency Analysis | |
| </h3> | |
| </div> | |
| <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
| Identify models that deliver the best performance per dollar spent | |
| </p> | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| efficiency_metric = gr.Dropdown( | |
| choices=["Avg Action Completion", "Avg Tool Selection Quality"], | |
| value="Avg Action Completion", | |
| label="π Performance Metric", | |
| info="Select which performance metric to analyze against cost", | |
| elem_classes=["dropdown"] | |
| ) | |
| gr.HTML('<div class="chart-container">') | |
| cost_performance_plot = gr.Plot( | |
| label="", | |
| value=create_cost_performance_scatter(load_leaderboard_data(), "Avg AC"), | |
| elem_classes=["efficiency-chart", "plot-container"] | |
| ) | |
| gr.HTML('</div>') | |
| gr.HTML("</div>") | |
| # 2. Speed vs Accuracy Trade-off Chart | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-top: 32px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-secondary);">β‘</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Speed vs Accuracy Trade-off | |
| </h3> | |
| </div> | |
| <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
| Find the sweet spot between response time and accuracy | |
| </p> | |
| """) | |
| gr.HTML('<div class="chart-container">') | |
| speed_accuracy_plot = gr.Plot( | |
| label="", | |
| value=create_speed_accuracy_plot(load_leaderboard_data(), "Avg AC"), | |
| elem_classes=["speed-accuracy-chart", "plot-container"] | |
| ) | |
| gr.HTML('</div>') | |
| gr.HTML("</div>") | |
| # 3. Performance Heatmap | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-top: 32px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-primary);">π₯</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Comprehensive Performance Heatmap | |
| </h3> | |
| </div> | |
| <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
| All metrics at a glance - darker colors indicate better performance | |
| </p> | |
| """) | |
| gr.HTML('<div class="chart-container">') | |
| performance_heatmap = gr.Plot( | |
| label="", | |
| value=create_performance_heatmap(load_leaderboard_data()), | |
| elem_classes=["heatmap-chart", "plot-container"] | |
| ) | |
| gr.HTML('</div>') | |
| gr.HTML("</div>") | |
| # 4. Domain Specialization Matrix | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-top: 32px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-primary);">π―</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Domain Specialization Matrix | |
| </h3> | |
| </div> | |
| <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
| Bubble size shows performance level, color intensity shows specialization strength | |
| </p> | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| specialization_metric = gr.Dropdown( | |
| choices=["AC (Action Completion)", "TSQ (Tool Selection Quality)"], | |
| value="AC (Action Completion)", | |
| label="π Metric Type", | |
| info="Choose which metric to analyze for domain specialization", | |
| elem_classes=["dropdown"] | |
| ) | |
| gr.HTML('<div class="chart-container">') | |
| domain_specialization_plot = gr.Plot( | |
| label="", | |
| value=create_domain_specialization_matrix(load_leaderboard_data(), "AC"), | |
| elem_classes=["specialization-chart", "plot-container"] | |
| ) | |
| gr.HTML('</div>') | |
| gr.HTML("</div>") | |
| # 5. Performance Gap Analysis | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-top: 32px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-secondary);">π</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Performance Gap Analysis by Domain | |
| </h3> | |
| </div> | |
| <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
| Visualize the performance range across models for each domain | |
| </p> | |
| """) | |
| gr.HTML('<div class="chart-container">') | |
| performance_gap_plot = gr.Plot( | |
| label="", | |
| value=create_performance_gap_analysis(load_leaderboard_data(), "AC"), | |
| elem_classes=["gap-analysis-chart", "plot-container"] | |
| ) | |
| gr.HTML('</div>') | |
| gr.HTML("</div>") | |
| # Update functions for new visualizations | |
def update_cost_performance(efficiency_metric):
    """Redraw the cost-performance scatter for the chosen performance metric.

    Args:
        efficiency_metric: Display name of the metric; translated through
            ``SORT_COLUMN_MAP`` and used as-is when it has no mapping.

    Returns:
        The figure built by ``create_cost_performance_scatter`` from freshly
        loaded leaderboard data.
    """
    actual_metric = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    return create_cost_performance_scatter(load_leaderboard_data(), actual_metric)
def update_speed_accuracy(efficiency_metric):
    """Redraw the speed-vs-accuracy plot for the chosen performance metric.

    Args:
        efficiency_metric: Display name of the metric; translated through
            ``SORT_COLUMN_MAP`` and used as-is when it has no mapping.

    Returns:
        The figure built by ``create_speed_accuracy_plot`` from freshly
        loaded leaderboard data.
    """
    actual_metric = SORT_COLUMN_MAP.get(efficiency_metric, efficiency_metric)
    return create_speed_accuracy_plot(load_leaderboard_data(), actual_metric)
def update_domain_specialization(specialization_metric):
    """Redraw the domain specialization matrix for the chosen metric type.

    Args:
        specialization_metric: Dropdown label; it selects "AC" when the label
            contains "AC" and "TSQ" otherwise.

    Returns:
        The figure built by ``create_domain_specialization_matrix`` from
        freshly loaded leaderboard data.
    """
    metric_type = "AC" if "AC" in specialization_metric else "TSQ"
    return create_domain_specialization_matrix(load_leaderboard_data(), metric_type)
def update_performance_gap(specialization_metric):
    """Redraw the per-domain performance gap analysis for the chosen metric type.

    Args:
        specialization_metric: Dropdown label; it selects "AC" when the label
            contains "AC" and "TSQ" otherwise.

    Returns:
        The figure built by ``create_performance_gap_analysis`` from freshly
        loaded leaderboard data.
    """
    metric_type = "AC" if "AC" in specialization_metric else "TSQ"
    return create_performance_gap_analysis(load_leaderboard_data(), metric_type)
def update_all_visualizations(domain_filter, model_type_filter, reasoning_filter, sort_by, sort_order):
    """Rebuild the filter-dependent visualizations after a filter change.

    Args:
        domain_filter: Current value of the domain filter control.
        model_type_filter: Current value of the model-type filter control.
        reasoning_filter: Current value of the reasoning filter control.
        sort_by: Display name of the sort metric; it drives the charts only
            when it is one of the two performance metrics.
        sort_order: Unused here; accepted so the signature matches the
            shared list of filter inputs.

    Returns:
        A ``(cost_performance, speed_accuracy, heatmap)`` tuple of figures.
    """
    filtered_df = apply_filters(
        load_leaderboard_data(), domain_filter, model_type_filter, reasoning_filter
    )

    # Cost, duration and turns are not performance metrics, so any sort other
    # than AC/TSQ falls back to plotting Avg AC.
    performance_sorts = ("Avg Action Completion", "Avg Tool Selection Quality")
    if sort_by in performance_sorts:
        actual_metric = SORT_COLUMN_MAP.get(sort_by, sort_by)
    else:
        actual_metric = "Avg AC"

    cost_perf = create_cost_performance_scatter(filtered_df, actual_metric)
    speed_acc = create_speed_accuracy_plot(filtered_df, actual_metric)
    heatmap = create_performance_heatmap(filtered_df)
    return cost_perf, speed_acc, heatmap
def apply_filters(df, domain_filter, model_type_filter, reasoning_filter):
    """Return a filtered copy of the leaderboard dataframe.

    Args:
        df: Leaderboard dataframe; it is never modified.
        domain_filter: Domain choice, optionally carrying a decorative emoji
            prefix. A value naming no known domain means "all domains".
        model_type_filter: "Open Source" or "Proprietary"; any other value
            disables the model-type filter.
        reasoning_filter: "Reasoning" or "Normal"; any other value disables
            the output-type filter.

    Returns:
        A new dataframe holding only the rows that pass every active filter.
    """
    domain_names = ("Banking", "Healthcare", "Insurance", "Investment", "Telecom")
    model_type_values = {"Open Source": "Open source", "Proprietary": "Proprietary"}
    output_type_values = {"Reasoning": "Reasoning", "Normal": "Normal"}

    filtered_df = df.copy()

    # Recognise the domain by its name rather than by its emoji prefix, so the
    # lookup does not depend on how the emoji bytes were encoded.
    domain_label = str(domain_filter or "")
    domain = next((name for name in domain_names if name in domain_label), None)

    # Domain filtering: keep models that have an AC score for the domain.
    # Missing scores may be stored as '' or as NaN.
    if domain is not None:
        domain_scores = filtered_df[f"{domain} AC"]
        filtered_df = filtered_df[domain_scores.notna() & (domain_scores != '')]

    # Model type filtering
    wanted_model_type = model_type_values.get(model_type_filter)
    if wanted_model_type is not None:
        filtered_df = filtered_df[filtered_df['Model Type'] == wanted_model_type]

    # Reasoning filtering
    wanted_output_type = output_type_values.get(reasoning_filter)
    if wanted_output_type is not None:
        filtered_df = filtered_df[filtered_df['Output Type'] == wanted_output_type]

    return filtered_df
# Connect update functions to components.
# The efficiency metric drives both the cost and the speed chart.
efficiency_metric.change(
    fn=update_cost_performance,
    inputs=[efficiency_metric],
    outputs=[cost_performance_plot]
)
efficiency_metric.change(
    fn=update_speed_accuracy,
    inputs=[efficiency_metric],
    outputs=[speed_accuracy_plot]
)

# The specialization metric drives both the matrix and the gap analysis.
specialization_metric.change(
    fn=update_domain_specialization,
    inputs=[specialization_metric],
    outputs=[domain_specialization_plot]
)
specialization_metric.change(
    fn=update_performance_gap,
    inputs=[specialization_metric],
    outputs=[performance_gap_plot]
)

# Update new visualizations when main filters change
for input_component in filter_inputs:
    input_component.change(
        fn=update_all_visualizations,
        inputs=filter_inputs,
        outputs=[cost_performance_plot, speed_accuracy_plot, performance_heatmap]
    )
| # Define generate_performance_card function before using it | |
def generate_performance_card(model_name):
    """Generate HTML for the model performance card.

    Args:
        model_name: Name of the model as it appears in the ``Model`` column.
            A falsy value yields a "please select" placeholder.

    Returns:
        An HTML string: the full card, or a short placeholder when no model
        is given or the model is not present in the leaderboard data.
    """
    import html  # local import: only this function needs it

    def placeholder(message):
        return f"""<div style="text-align: center; color: var(--text-secondary); padding: 40px;">
            {message}
        </div>"""

    def is_missing(value):
        # Missing metrics are stored as '' by load_leaderboard_data.
        return pd.isna(value) or value == ''

    def format_value(val, decimals=3, prefix='', suffix=''):
        if is_missing(val):
            return 'N/A'
        return f"{prefix}{float(val):.{decimals}f}{suffix}"

    def get_performance_stars(value, max_val=1.0):
        # One to five stars by score band; no stars when the score is missing.
        if is_missing(value):
            return ''
        score = float(value) / max_val
        for threshold, stars in ((0.9, 5), (0.7, 4), (0.5, 3), (0.3, 2)):
            if score >= threshold:
                return '⭐' * stars
        return '⭐'

    if not model_name:
        return placeholder("Please select a model to generate its performance card")

    # Get model data
    df = load_leaderboard_data()
    model_data = df[df['Model'] == model_name]
    if model_data.empty:
        return placeholder("Model not found in the database")
    row = model_data.iloc[0]

    # Overall rank = 1-based position when ordered by Avg AC, best first.
    ranked = df[df['Avg AC'] != ''].copy()
    ranked['Avg AC'] = pd.to_numeric(ranked['Avg AC'], errors='coerce')
    ranked = ranked.sort_values('Avg AC', ascending=False).reset_index(drop=True)
    positions = ranked.index[ranked['Model'] == model_name]
    # A model without an Avg AC score has no rank; show "N/A", not "#N/A".
    rank_display = f"#{positions[0] + 1}" if len(positions) else "N/A"

    # The name is data, not markup: escape it before embedding it in HTML.
    safe_name = html.escape(str(model_name))

    # NOTE(review): icons were restored from mis-decoded bytes. The trophy
    # (Overall Rank) and building (Domain Performance) glyphs are best-effort
    # reconstructions - confirm against the upstream file.
    parts = [f"""
    <div class="performance-card">
        <div class="card-header">
            <h1 class="card-model-name">{safe_name}</h1>
            <div class="card-stars">
                {get_performance_stars(row['Avg AC'])}
            </div>
        </div>
        <div class="metrics-grid" style="margin-bottom: 24px;">
            <div class="metric-item">
                <div class="metric-icon" style="color: var(--accent-primary);">🏆</div>
                <div class="metric-label">Overall Rank</div>
                <div class="metric-value">{rank_display}</div>
            </div>
            <div class="metric-item">
                <div class="metric-icon" style="color: var(--accent-primary);">🎯</div>
                <div class="metric-label">Action Completion</div>
                <div class="metric-value">{format_value(row['Avg AC'])}</div>
            </div>
            <div class="metric-item">
                <div class="metric-icon" style="color: var(--accent-secondary);">🛠️</div>
                <div class="metric-label">Tool Selection</div>
                <div class="metric-value">{format_value(row['Avg TSQ'])}</div>
            </div>
            <div class="metric-item">
                <div class="metric-icon" style="color: #F5F6F7;">💰</div>
                <div class="metric-label">Avg Cost</div>
                <div class="metric-value">{format_value(row['Avg Total Cost'], 3, '$')}</div>
            </div>
            <div class="metric-item">
                <div class="metric-icon" style="color: #F5F6F7;">⚡</div>
                <div class="metric-label">Avg Duration</div>
                <div class="metric-value">{format_value(row['Avg Session Duration'], 1, '', 's')}</div>
            </div>
            <div class="metric-item">
                <div class="metric-icon" style="color: #F5F6F7;">💬</div>
                <div class="metric-label">Avg Turns</div>
                <div class="metric-value">{format_value(row['Avg Turns'], 1)}</div>
            </div>
        </div>
        <div class="domains-section" style="margin-top: 24px;">
            <h3 class="domains-title">🏛️ Domain Performance</h3>
            <div class="domains-grid">
    """]

    # Add domain scores
    domains = [
        ('🏦', 'Banking'),
        ('🏥', 'Healthcare'),
        ('🛡️', 'Insurance'),
        ('💰', 'Investment'),
        ('📱', 'Telecom'),
    ]
    for domain_icon, domain_name in domains:
        ac_value = row.get(f'{domain_name} AC', '')
        if is_missing(ac_value):
            score_display = "N/A"
            score_color = "var(--text-muted)"
        else:
            score_display = f"{float(ac_value):.3f}"
            score_color = "var(--accent-primary)"
        parts.append(f"""
                <div class="domain-item">
                    <div class="domain-name">{domain_icon}</div>
                    <div style="font-size: 0.7rem; color: var(--text-secondary); margin-bottom: 2px;">{domain_name}</div>
                    <div class="domain-score" style="color: {score_color};">{score_display}</div>
                </div>
        """)

    parts.append("""
            </div>
        </div>
        <div class="card-footer">
            <div class="card-url">
                <strong>https://galileo.ai/agent-leaderboard</strong>
            </div>
        </div>
    </div>
    """)
    return "".join(parts)
| # MODEL PERFORMANCE CARD SECTION | |
| gr.HTML(""" | |
| <div class="dark-container" style="margin-top: 32px;"> | |
| <div class="section-header"> | |
| <span class="section-icon" style="color: var(--accent-primary);">π―</span> | |
| <h3 style="margin: 0; color: var(--text-primary); font-size: 1.5rem; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Model Performance Card | |
| </h3> | |
| </div> | |
| <p style="color: var(--text-secondary); margin-bottom: 20px; font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
| Comprehensive performance card for any model - perfect for presentations and reports | |
| </p> | |
| <div style="display: flex; gap: 24px; align-items: flex-start;"> | |
| <!-- Controls Column --> | |
| <div style="flex: 0 0 280px;"> | |
| <div style="background: rgba(245, 246, 247, 0.03); border: 1px solid var(--border-subtle); | |
| border-radius: 16px; padding: 20px; position: sticky; top: 20px;"> | |
| """) | |
| card_model_selector = gr.Dropdown( | |
| choices=initial_df['Model'].tolist(), | |
| value=initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None, | |
| label="π€ Select Model", | |
| info="Choose a model to view its performance card", | |
| elem_classes=["dropdown"] | |
| ) | |
| download_card_btn = gr.Button( | |
| "π₯ Download Card as PNG", | |
| variant="secondary", | |
| elem_classes=["download-button"], | |
| elem_id="download-card-btn" | |
| ) | |
| gr.HTML(""" | |
| </div> | |
| </div> | |
| <!-- Card Display Column --> | |
| <div style="flex: 1; min-width: 0;" id="card-display-container"> | |
| """) | |
| # Card display area - generate initial card | |
| initial_model = initial_df['Model'].tolist()[0] if len(initial_df) > 0 else None | |
| initial_card_html = generate_performance_card(initial_model) if initial_model else "" | |
| card_display = gr.HTML(value=initial_card_html, elem_id="performance-card-html") | |
| gr.HTML(""" | |
| </div> | |
| </div> | |
| </div>""") | |
| # Add custom CSS for the performance card | |
| gr.HTML(""" | |
| <style> | |
| /* Performance Card Styles */ | |
| .performance-card { | |
| background: linear-gradient(145deg, rgba(1, 9, 26, 0.98) 0%, rgba(227, 84, 84, 0.05) 100%); | |
| border: 2px solid var(--accent-primary); | |
| border-radius: 24px; | |
| padding: 32px; | |
| max-width: 700px; | |
| margin: 0 auto; | |
| position: relative; | |
| overflow: hidden; | |
| box-shadow: | |
| 0 20px 40px rgba(0, 0, 0, 0.5), | |
| 0 0 80px rgba(227, 84, 84, 0.2), | |
| inset 0 0 120px rgba(227, 84, 84, 0.05); | |
| } | |
| .performance-card::before { | |
| content: ''; | |
| position: absolute; | |
| top: -50%; | |
| left: -50%; | |
| width: 200%; | |
| height: 200%; | |
| background: radial-gradient(circle at center, var(--glow-primary) 0%, transparent 70%); | |
| opacity: 0.1; | |
| animation: pulse 4s ease-in-out infinite; | |
| } | |
| .card-header { | |
| text-align: center; | |
| margin-bottom: 24px; | |
| position: relative; | |
| z-index: 1; | |
| } | |
| .card-badges { | |
| display: flex; | |
| justify-content: center; | |
| gap: 12px; | |
| margin-bottom: 16px; | |
| } | |
| .card-model-name { | |
| font-size: 2rem; | |
| font-weight: 800; | |
| background: linear-gradient(135deg, var(--accent-primary) 0%, var(--accent-secondary) 100%); | |
| -webkit-background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| margin-bottom: 8px; | |
| text-shadow: 0 0 40px var(--glow-primary); | |
| line-height: 1.2; | |
| } | |
| .card-stars { | |
| font-size: 1.2rem; | |
| margin: 8px 0; | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| gap: 2px; | |
| } | |
| .card-vendor { | |
| font-size: 1.2rem; | |
| color: var(--text-secondary); | |
| font-weight: 500; | |
| margin-top: 4px; | |
| } | |
| .metrics-grid { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); | |
| gap: 16px; | |
| margin-bottom: 24px; | |
| position: relative; | |
| z-index: 1; | |
| } | |
| .metric-item { | |
| background: rgba(245, 246, 247, 0.05); | |
| border: 1px solid var(--border-subtle); | |
| border-radius: 16px; | |
| padding: 16px; | |
| text-align: center; | |
| transition: all 0.3s ease; | |
| } | |
| .metric-item:hover { | |
| transform: translateY(-4px); | |
| border-color: var(--accent-primary); | |
| box-shadow: 0 8px 24px rgba(227, 84, 84, 0.2); | |
| } | |
| .metric-icon { | |
| font-size: 1.5rem; | |
| margin-bottom: 6px; | |
| filter: drop-shadow(0 0 20px currentColor); | |
| } | |
| .metric-label { | |
| font-size: 0.75rem; | |
| color: var(--text-secondary); | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| margin-bottom: 4px; | |
| } | |
| .metric-value { | |
| font-size: 1.4rem; | |
| font-weight: 700; | |
| color: var(--text-primary); | |
| font-family: 'Geist Mono', monospace; | |
| } | |
| .domains-section { | |
| margin-top: 32px; | |
| position: relative; | |
| z-index: 1; | |
| } | |
| .domains-title { | |
| font-size: 1.1rem; | |
| font-weight: 600; | |
| color: var(--text-primary); | |
| margin-bottom: 16px; | |
| text-align: center; | |
| } | |
| .domains-grid { | |
| display: grid; | |
| grid-template-columns: repeat(5, 1fr); | |
| gap: 12px; | |
| } | |
| .domain-item { | |
| background: rgba(245, 246, 247, 0.05); | |
| border: 1px solid var(--border-subtle); | |
| border-radius: 12px; | |
| padding: 12px; | |
| text-align: center; | |
| } | |
| .domain-name { | |
| font-size: 1.4rem; | |
| margin-bottom: 4px; | |
| } | |
| .domain-score { | |
| font-size: 1rem; | |
| font-weight: 600; | |
| color: var(--accent-primary); | |
| } | |
| .card-footer { | |
| text-align: center; | |
| margin-top: 24px; | |
| padding-top: 20px; | |
| border-top: 1px solid var(--border-subtle); | |
| position: relative; | |
| z-index: 1; | |
| } | |
| .card-badge { | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 8px; | |
| padding: 8px 16px; | |
| background: rgba(245, 246, 247, 0.05); | |
| border: 1px solid var(--border-subtle); | |
| border-radius: 20px; | |
| font-size: 0.9rem; | |
| color: var(--text-secondary); | |
| margin: 0 4px; | |
| } | |
| .card-url { | |
| margin-top: 12px; | |
| font-size: 0.75rem; | |
| color: var(--text-muted); | |
| font-family: 'Geist Mono', monospace; | |
| } | |
| .primary-button { | |
| background: linear-gradient(135deg, var(--accent-primary) 0%, #B94545 100%) !important; | |
| color: white !important; | |
| border: none !important; | |
| padding: 10px 20px !important; | |
| font-weight: 600 !important; | |
| transition: all 0.3s ease !important; | |
| } | |
| .primary-button:hover { | |
| transform: translateY(-2px) !important; | |
| box-shadow: 0 8px 24px rgba(227, 84, 84, 0.4) !important; | |
| } | |
| /* Download button styling */ | |
| .download-button { | |
| background: linear-gradient(135deg, var(--accent-secondary) 0%, #0A6BC4 100%) !important; | |
| color: white !important; | |
| border: none !important; | |
| padding: 10px 20px !important; | |
| font-weight: 600 !important; | |
| transition: all 0.3s ease !important; | |
| } | |
| .download-button:hover { | |
| transform: translateY(-2px) !important; | |
| box-shadow: 0 8px 24px rgba(16, 152, 247, 0.4) !important; | |
| } | |
| /* Responsive layout for performance card section */ | |
| @media (max-width: 1200px) { | |
| .performance-card { | |
| padding: 24px !important; | |
| } | |
| .card-model-name { | |
| font-size: 1.7rem !important; | |
| } | |
| .metric-value { | |
| font-size: 1.2rem !important; | |
| } | |
| } | |
| @media (max-width: 900px) { | |
| /* Stack the controls above the card on smaller screens */ | |
| #card-display-container { | |
| margin-top: 20px; | |
| } | |
| .performance-card { | |
| padding: 20px !important; | |
| } | |
| .card-model-name { | |
| font-size: 1.5rem !important; | |
| } | |
| .metric-value { | |
| font-size: 1.1rem !important; | |
| } | |
| .domains-grid { | |
| grid-template-columns: repeat(3, 1fr) !important; | |
| } | |
| } | |
| /* Button states */ | |
| .download-button:disabled { | |
| opacity: 0.6 !important; | |
| cursor: not-allowed !important; | |
| } | |
| </style> | |
| <!-- Include html2canvas library --> | |
| <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/html2canvas.min.js"></script> | |
| """) | |
| # Wire up the card generator to selection change | |
| card_model_selector.change( | |
| fn=generate_performance_card, | |
| inputs=[card_model_selector], | |
| outputs=[card_display] | |
| ) | |
| # Wire up download button with improved functionality | |
| download_card_btn.click( | |
| fn=None, | |
| js=""" | |
| () => { | |
| // Wait a bit to ensure the card is fully rendered | |
| setTimeout(() => { | |
| const card = document.querySelector('.performance-card'); | |
| if (!card) { | |
| alert('Performance card not found. Please select a model first.'); | |
| return; | |
| } | |
| // Check if html2canvas is loaded | |
| if (typeof html2canvas === 'undefined') { | |
| // Try to load html2canvas dynamically | |
| const script = document.createElement('script'); | |
| script.src = 'https://cdn.jsdelivr.net/npm/[email protected]/dist/html2canvas.min.js'; | |
| script.onload = () => { | |
| captureCard(); | |
| }; | |
| script.onerror = () => { | |
| alert('Failed to load html2canvas library. Please try again.'); | |
| }; | |
| document.head.appendChild(script); | |
| } else { | |
| captureCard(); | |
| } | |
| function captureCard() { | |
| // Show loading indicator | |
| const btn = document.getElementById('download-card-btn'); | |
| const originalText = btn.textContent; | |
| btn.textContent = 'Generating...'; | |
| btn.disabled = true; | |
| html2canvas(card, { | |
| backgroundColor: '#01091A', | |
| scale: 2, | |
| logging: false, | |
| useCORS: true, | |
| allowTaint: true | |
| }).then(canvas => { | |
| // Create download link | |
| const link = document.createElement('a'); | |
| const modelName = card.querySelector('.card-model-name')?.textContent || 'model'; | |
| const timestamp = new Date().toISOString().slice(0,10); | |
| const fileName = `${modelName.replace(/[^a-z0-9]/gi, '-').toLowerCase()}-performance-${timestamp}.png`; | |
| link.download = fileName; | |
| link.href = canvas.toDataURL('image/png'); | |
| document.body.appendChild(link); | |
| link.click(); | |
| document.body.removeChild(link); | |
| // Restore button | |
| btn.textContent = originalText; | |
| btn.disabled = false; | |
| }).catch(error => { | |
| console.error('Error capturing card:', error); | |
| alert('Failed to capture performance card. Please try again.'); | |
| btn.textContent = originalText; | |
| btn.disabled = false; | |
| }); | |
| } | |
| }, 100); | |
| } | |
| """ | |
| ) | |
| # Also update card when filters change to keep model selector in sync | |
| for input_component in filter_inputs: | |
| def update_dropdown_and_card(*args): | |
| filtered_df = apply_filters(load_leaderboard_data(), args[0], args[1], args[2]) | |
| choices = filtered_df['Model'].tolist() | |
| # Select first model from filtered list | |
| value = choices[0] if choices else None | |
| return gr.Dropdown(choices=choices, value=value) | |
| input_component.change( | |
| fn=update_dropdown_and_card, | |
| inputs=filter_inputs, | |
| outputs=[card_model_selector] | |
| ) | |
| # Footer CTAs | |
| gr.HTML(""" | |
| <div style="margin-top: 60px; padding: 40px 20px; background: linear-gradient(145deg, rgba(227, 84, 84, 0.05) 0%, rgba(245, 246, 247, 0.05) 100%); border-radius: 20px; border: 1px solid var(--border-subtle);"> | |
| <div style="text-align: center; margin-bottom: 30px;"> | |
| <h3 style="font-size: 2rem; color: var(--text-primary); margin-bottom: 10px; font-family: 'Geist', sans-serif; font-weight: 700;"> | |
| Ready to Take Your AI to the Next Level? | |
| </h3> | |
| <p style="color: var(--text-secondary); font-size: 1.1rem; font-family: 'Geist', sans-serif;"> | |
| Learn more about building better agents and evaluating your models | |
| </p> | |
| </div> | |
| <div style="display: flex; justify-content: center; gap: 16px; flex-wrap: wrap;"> | |
| <a href="https://galileo.ai/mastering-agents-ebook?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="header-action-button"> | |
| <span class="action-button-icon">π</span>Mastering Agents eBook | |
| </a> | |
| <a href="https://app.galileo.ai/sign-up?utm_medium=referral&utm_source=HF&utm_campaign=agent_leaderboard_v2" target="_blank" class="header-action-button"> | |
| <span class="action-button-icon">π</span>Evaluate your GenAI for free | |
| </a> | |
| </div> | |
| </div> | |
| """) | |
| # Add FAQ section at the end | |
| gr.HTML(get_faq_section()) | |
| return leaderboard_table | |
def create_leaderboard_v2_interface():
    """Build the complete leaderboard v2 interface.

    Thin entry point that delegates to ``create_leaderboard_v2_tab`` and
    hands back whatever that builder returns.
    """
    interface = create_leaderboard_v2_tab()
    return interface
def create_domain_radar_chart(df, metric_type, selected_models=None, max_models=5):
    """Create a radar chart of per-domain performance for the selected metric.

    Args:
        df: Leaderboard DataFrame with a 'Model' column and per-domain metric
            columns named '<Domain> <suffix>' (e.g. 'Banking AC'). Missing
            values may be NaN or the empty string.
        metric_type: Metric label chosen by the caller. It is translated
            through ``SORT_COLUMN_MAP`` when a mapping exists, otherwise used
            as the column name directly.
        selected_models: Model names to plot. When None or empty, the top
            ``max_models`` models by the selected metric are used (or the
            first ``max_models`` rows if that column is absent).
        max_models: Maximum number of models drawn, for readability.

    Returns:
        A ``plotly.graph_objects.Figure``. If the metric has no per-domain
        breakdown, the placeholder from ``create_empty_radar_chart`` is
        returned instead.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    # Suffix used by the per-domain columns of each aggregate metric.
    domain_suffix = {
        'Avg AC': 'AC',
        'Avg TSQ': 'TSQ',
        'Avg Total Cost': 'Cost',
        'Avg Session Duration': 'Duration',
        'Avg Turns': 'Turns',
    }

    def _as_float(raw):
        """Return ``raw`` as a float, or None when missing or non-numeric."""
        number = pd.to_numeric(raw, errors='coerce')
        return None if pd.isna(number) else float(number)

    # Map the metric_type to the actual column name using the shared mapping
    actual_metric_type = SORT_COLUMN_MAP.get(metric_type, metric_type)

    if selected_models is None or len(selected_models) == 0:
        if actual_metric_type in df.columns:
            # Coerce before ranking: the column can be object dtype (blank
            # cells stored as ''), on which nlargest raises TypeError.
            ranked = df.assign(
                **{actual_metric_type: pd.to_numeric(df[actual_metric_type], errors='coerce')}
            )
            selected_models = ranked.nlargest(max_models, actual_metric_type)['Model'].tolist()
        else:
            selected_models = df.head(max_models)['Model'].tolist()

    # Limit to max_models for readability
    selected_models = selected_models[:max_models]

    # Only metrics with per-domain columns can be drawn
    if actual_metric_type not in domain_suffix:
        return create_empty_radar_chart(f"Domain breakdown not available for {metric_type}")

    suffix = domain_suffix[actual_metric_type]
    domain_columns = [f'{domain} {suffix}' for domain in domains]

    fig = go.Figure()

    # Galileo dark theme color scheme
    galileo_dark_colors = [
        {'fill': 'rgba(227, 84, 84, 0.25)', 'line': '#E35454', 'name': 'Vanguard'},    # Vanguard Red
        {'fill': 'rgba(16, 152, 247, 0.15)', 'line': '#1098F7', 'name': 'Airglow'},    # Airglow Blue
        {'fill': 'rgba(245, 246, 247, 0.15)', 'line': '#F5F6F7', 'name': 'Mercury'},   # Light Mercury
        {'fill': 'rgba(227, 84, 84, 0.35)', 'line': '#B94545', 'name': 'Deep Red'},    # Darker Vanguard
        {'fill': 'rgba(16, 152, 247, 0.25)', 'line': '#0A6BC4', 'name': 'Deep Blue'},  # Darker Airglow
    ]

    # Real (non-missing) values seen while plotting; used to size the axis.
    observed = []

    for idx, model_name in enumerate(selected_models):
        model_data = df[df['Model'] == model_name]
        if model_data.empty:
            continue
        model_row = model_data.iloc[0]

        # Missing or unparseable domain values are plotted as 0
        values = []
        for col in domain_columns:
            number = _as_float(model_row[col]) if col in df.columns else None
            if number is None:
                values.append(0)
            else:
                values.append(number)
                observed.append(number)

        # Close the radar polygon by repeating the first point
        values_plot = values + [values[0]]
        domains_plot = domains + [domains[0]]

        colors = galileo_dark_colors[idx % len(galileo_dark_colors)]
        fig.add_trace(
            go.Scatterpolar(
                r=values_plot,
                theta=domains_plot,
                fill='toself',
                fillcolor=colors['fill'],
                line=dict(
                    color=colors['line'],
                    width=3,
                    shape='spline',
                    smoothing=0.8
                ),
                marker=dict(
                    size=10,
                    color=colors['line'],
                    symbol='circle',
                    line=dict(width=2, color='#01091A')
                ),
                name=model_name,
                mode="lines+markers",
                hovertemplate="<b>%{fullData.name}</b><br>" +
                              "<span style='color: #94A3B8'>%{theta}</span><br>" +
                              "<b style='font-size: 14px; color: #F5F6F7'>%{r:.3f}</b><br>" +
                              "<extra></extra>",
                hoverlabel=dict(
                    bgcolor="rgba(1, 9, 26, 0.95)",
                    bordercolor=colors['line'],
                    font=dict(color="#F5F6F7", size=12, family="'Geist', sans-serif")
                )
            )
        )

    # Determine appropriate range based on metric type
    if actual_metric_type in ('Avg AC', 'Avg TSQ'):
        max_range = 1.0
    else:
        # Cost, Duration, Turns: scale to the data with 10% headroom. Fall
        # back to 1.0 when there is nothing positive to scale to, so the
        # radial axis never collapses to [0, 0].
        peak = max(observed, default=0)
        max_range = peak * 1.1 if peak > 0 else 1.0

    # Create custom tick values for better readability
    tick_vals = [i * max_range / 5 for i in range(6)]
    tick_text = [f"{val:.2f}" for val in tick_vals]

    fig.update_layout(
        polar=dict(
            bgcolor='rgba(245, 246, 247, 0.03)',
            radialaxis=dict(
                visible=True,
                range=[0, max_range],
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.1)',
                gridwidth=1,
                tickvals=tick_vals,
                ticktext=tick_text,
                tickfont=dict(
                    size=11,
                    color='#94A3B8',
                    family="'Geist Mono', monospace"
                ),
                tickangle=0
            ),
            angularaxis=dict(
                showline=True,
                linewidth=2,
                linecolor='rgba(245, 246, 247, 0.2)',
                gridcolor='rgba(245, 246, 247, 0.08)',
                tickfont=dict(
                    size=14,
                    family="'Geist', sans-serif",
                    color='#F5F6F7',
                    weight=600
                ),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="v",
            yanchor="middle",
            y=0.5,
            xanchor="left",
            x=1.05,
            font=dict(
                size=12,
                family="'Geist', sans-serif",
                color='#F5F6F7'
            ),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1,
            itemsizing='constant',
            itemwidth=30
        ),
        title=dict(
            text=f"<b>Domain Performance: {metric_type}</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        annotations=[
            dict(
                text="Galileo Agent Leaderboard",
                xref="paper", yref="paper",
                x=0.98, y=0.02,
                xanchor='right', yanchor='bottom',
                font=dict(size=10, color='#64748B'),
                showarrow=False
            )
        ]
    )
    return fig
def create_empty_radar_chart(message):
    """Create a placeholder figure that shows ``message`` instead of a radar chart.

    Args:
        message: Text displayed in the centre of the otherwise empty figure.

    Returns:
        A ``plotly.graph_objects.Figure`` styled like the radar chart, with
        the message in the centre and the leaderboard watermark in the
        bottom-right corner.
    """
    fig = go.Figure()

    # Style the canvas first, without touching layout.annotations.
    fig.update_layout(
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=200),
        title=dict(
            text="<b>Domain Performance Chart</b>",
            x=0.5,
            y=0.97,
            font=dict(
                size=22,
                family="'Geist', sans-serif",
                color="#F5F6F7",
                weight=700
            ),
        ),
    )

    # Both annotations are appended with add_annotation. Passing the
    # watermark through update_layout(annotations=[...]) after the message
    # already exists would merge it element-wise into the message annotation
    # and replace the message text and position.
    fig.add_annotation(
        text=f"📊 {message}",
        xref="paper", yref="paper",
        x=0.5, y=0.5,
        xanchor='center', yanchor='middle',
        font=dict(
            size=18,
            color="#94A3B8",
            family="'Geist', sans-serif"
        ),
        showarrow=False,
        bgcolor="rgba(245, 246, 247, 0.05)",
        bordercolor="rgba(245, 246, 247, 0.2)",
        borderwidth=1,
        borderpad=20
    )
    fig.add_annotation(
        text="Galileo Agent Leaderboard",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=10, color='#64748B'),
        showarrow=False
    )
    return fig
| # NEW VISUALIZATION FUNCTIONS | |
def create_cost_performance_scatter(df, metric="Avg AC"):
    """Create a scatter plot of session cost against a performance metric.

    Each model is one bubble: x is ``metric``, y is 'Avg Total Cost', the
    bubble area scales with 'Avg Turns', and the colour encodes 'Model Type'.
    Dashed lines mark the medians of both axes.

    Args:
        df: Leaderboard DataFrame with 'Model', 'Model Type',
            'Avg Total Cost', 'Avg Turns' and ``metric`` columns. Missing
            values may be NaN or the empty string.
        metric: Column plotted on the x axis. "Avg AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        A ``plotly.graph_objects.Figure``, or the result of
        ``create_empty_chart`` when no model has both cost and metric data.
    """
    df_filtered = df.copy()
    for column in ('Avg Total Cost', metric, 'Avg Turns'):
        df_filtered[column] = pd.to_numeric(df_filtered[column], errors='coerce')

    # Drop models lacking cost or performance. Doing this after coercion also
    # catches NaN and non-numeric cells, not only empty strings.
    df_filtered = df_filtered.dropna(subset=['Avg Total Cost', metric])
    if df_filtered.empty:
        return create_empty_chart("No data available for cost-performance analysis")

    # Colour per model type; unknown types fall back to light grey
    color_map = {
        'Proprietary': '#1098F7',  # Airglow Blue for Proprietary
        'Open source': '#58BC82'   # Green for Open source
    }

    fig = go.Figure()

    # One trace per model type so each gets its own legend entry
    for model_type in df_filtered['Model Type'].unique():
        df_type = df_filtered[df_filtered['Model Type'] == model_type]
        fig.add_trace(go.Scatter(
            x=df_type[metric],
            y=df_type['Avg Total Cost'],
            mode='markers+text',
            name=model_type,
            text=df_type['Model'],
            textposition="top center",
            textfont=dict(size=10, color='#94A3B8'),
            # Raw turn counts for the tooltip; marker.size holds turns * 3,
            # so it must not be shown to the user as "Turns".
            customdata=df_type['Avg Turns'],
            marker=dict(
                size=df_type['Avg Turns'] * 3,  # Size based on number of turns
                color=color_map.get(model_type, '#F5F6F7'),
                opacity=0.8,
                line=dict(width=2, color='#01091A')
            ),
            hovertemplate="<b>%{text}</b><br>" +
                          f"{metric}: %{{x:.3f}}<br>" +
                          "Cost: $%{y:.3f}<br>" +
                          "Turns: %{customdata:.1f}<br>" +
                          "<extra></extra>"
        ))

    # Add quadrant lines at the medians
    median_x = df_filtered[metric].median()
    median_y = df_filtered['Avg Total Cost'].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Add quadrant labels (paper coordinates: bottom-right and top-left)
    fig.add_annotation(x=0.95, y=0.05, text="🚀 High Performance<br>Low Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7"), bgcolor="rgba(245, 246, 247, 0.1)")
    fig.add_annotation(x=0.05, y=0.95, text="⚠️ Low Performance<br>High Cost",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#E35454"), bgcolor="rgba(227, 84, 84, 0.1)")

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Cost-Performance Efficiency: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Cost ($)</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(size=12, family="'Geist', sans-serif", color='#F5F6F7'),
            bgcolor='rgba(1, 9, 26, 0.8)',
            bordercolor='rgba(245, 246, 247, 0.2)',
            borderwidth=1
        ),
        margin=dict(t=100, b=80, l=80, r=80)
    )
    return fig
def create_speed_accuracy_plot(df, metric="Avg AC"):
    """Create a scatter plot of session duration against a performance metric.

    Each model is one point: x is ``metric``, y is 'Avg Session Duration',
    and the marker colour encodes 'Avg Total Cost'. Dashed lines mark the
    medians of both axes.

    Args:
        df: Leaderboard DataFrame with 'Model', 'Avg Session Duration',
            'Avg Total Cost' and ``metric`` columns. Missing values may be
            NaN or the empty string.
        metric: Column plotted on the x axis. "Avg AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        A ``plotly.graph_objects.Figure``, or the result of
        ``create_empty_chart`` when no model has both duration and metric
        data.
    """
    df_filtered = df.copy()
    # Cost is only used for the colour scale, so it is coerced but a missing
    # cost does not exclude a model from the plot.
    for column in ('Avg Session Duration', metric, 'Avg Total Cost'):
        df_filtered[column] = pd.to_numeric(df_filtered[column], errors='coerce')

    # Drop models lacking duration or performance. Doing this after coercion
    # also catches NaN and non-numeric cells, so an all-missing frame reaches
    # the empty-chart fallback instead of producing NaN median lines.
    df_filtered = df_filtered.dropna(subset=['Avg Session Duration', metric])
    if df_filtered.empty:
        return create_empty_chart("No data available for speed-accuracy analysis")

    fig = go.Figure()

    # Add scatter trace
    fig.add_trace(go.Scatter(
        x=df_filtered[metric],
        y=df_filtered['Avg Session Duration'],
        mode='markers+text',
        text=df_filtered['Model'],
        textposition="top center",
        textfont=dict(size=9, color='#94A3B8'),
        marker=dict(
            size=12,
            color=df_filtered['Avg Total Cost'],
            colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Cost ($)",
                    font=dict(color="#F5F6F7")
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1,
                x=1.02
            ),
            line=dict(width=2, color='#01091A')
        ),
        hovertemplate="<b>%{text}</b><br>" +
                      f"{metric}: %{{x:.3f}}<br>" +
                      "Duration: %{y:.1f}s<br>" +
                      "Cost: $%{marker.color:.3f}<br>" +
                      "<extra></extra>"
    ))

    # Add quadrant lines at the medians
    median_x = df_filtered[metric].median()
    median_y = df_filtered['Avg Session Duration'].median()
    fig.add_hline(y=median_y, line_dash="dash", line_color="#64748B", opacity=0.5)
    fig.add_vline(x=median_x, line_dash="dash", line_color="#64748B", opacity=0.5)

    # Add quadrant labels (paper coordinates: bottom-right and top-left)
    fig.add_annotation(x=0.95, y=0.05, text="⚡ Fast & Accurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#F5F6F7", weight=600))
    fig.add_annotation(x=0.05, y=0.95, text="🐌 Slow & Inaccurate",
                       showarrow=False, xref="paper", yref="paper",
                       font=dict(size=12, color="#E35454", weight=600))

    metric_display = "Action Completion" if metric == "Avg AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Speed vs Accuracy Trade-off: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display}</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Average Session Duration (seconds)</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            zerolinecolor="rgba(245, 246, 247, 0.2)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=900,
        width=1450,
        margin=dict(t=100, b=80, l=80, r=120)
    )
    return fig
def create_performance_heatmap(df):
    """Create a heatmap of min-max normalised metrics for every model.

    Each metric is scaled to 0-1 across the models shown. Cost, duration and
    turns are inverted so that a higher score is better on every row.

    Args:
        df: Leaderboard DataFrame with 'Model', 'Avg AC', 'Avg TSQ',
            'Avg Total Cost', 'Avg Session Duration' and 'Avg Turns'
            columns. Missing values may be NaN or the empty string.

    Returns:
        A ``plotly.graph_objects.Figure``, or the result of
        ``create_empty_chart`` when no row has an 'Avg AC' value.
    """
    metrics = ['Avg AC', 'Avg TSQ', 'Avg Total Cost', 'Avg Session Duration', 'Avg Turns']
    # Metrics where a lower raw value is the better outcome
    lower_is_better = {'Avg Total Cost', 'Avg Session Duration', 'Avg Turns'}
    label_map = {
        'Avg AC': 'Action Completion',
        'Avg TSQ': 'Tool Selection',
        'Avg Total Cost': 'Cost Efficiency',
        'Avg Session Duration': 'Speed',
        'Avg Turns': 'Conversation Efficiency'
    }

    # Filter models with data
    df_filtered = df[df['Avg AC'] != ''].copy()
    if df_filtered.empty:
        return create_empty_chart("No data available for performance heatmap")

    normalized_data = []
    for col in metrics:
        series = pd.to_numeric(df_filtered[col], errors='coerce')
        low = series.min()
        span = series.max() - low
        if pd.notna(span) and span > 0:
            normalized = (series - low) / span
        else:
            # Single model or constant column: min-max scaling would divide
            # by zero. Use the neutral midpoint for present values and keep
            # missing ones as NaN.
            normalized = series.where(series.isna(), 0.5)
        if col in lower_is_better:
            normalized = 1 - normalized
        normalized_data.append(normalized.values)

    metric_labels = [label_map.get(col, col) for col in metrics]

    # Cell labels; missing scores are left blank rather than printed as "nan"
    cell_text = [
        ["" if pd.isna(val) else f"{val:.2f}" for val in row]
        for row in normalized_data
    ]

    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=normalized_data,
        x=df_filtered['Model'].tolist(),
        y=metric_labels,
        colorscale=[[0, '#01091A'], [0.5, '#1098F7'], [1, '#E35454']],
        hovertemplate="<b>%{x}</b><br>" +
                      "%{y}: %{z:.2f}<br>" +
                      "<extra></extra>",
        text=cell_text,
        texttemplate="%{text}",
        textfont={"size": 10, "color": "#F5F6F7"},
        showscale=True,
        colorbar=dict(
            title=dict(
                text="Performance<br>Score",
                font=dict(color="#F5F6F7")
            ),
            tickfont=dict(color="#94A3B8"),
            bgcolor="rgba(1, 9, 26, 0.8)",
            bordercolor="rgba(245, 246, 247, 0.2)",
            borderwidth=1
        )
    ))

    fig.update_layout(
        title=dict(
            text="<b>Comprehensive Performance Heatmap</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            side="bottom",
            tickfont=dict(size=11, color="#94A3B8"),
            tickangle=-45
        ),
        yaxis=dict(
            tickfont=dict(size=13, color="#F5F6F7", weight=600)
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=700,
        width=1550,
        margin=dict(t=100, b=120, l=170, r=120)
    )
    return fig
def create_domain_specialization_matrix(df, metric_type="AC"):
    """Create a bubble chart showing how each model specialises by domain.

    For every model/domain pair the bubble size reflects the domain score and
    the colour reflects specialization, defined here as the domain score
    minus the model's 'Avg <metric_type>' value.

    Args:
        df: Leaderboard DataFrame with 'Model', 'Model Type',
            'Avg <metric_type>' and '<Domain> <metric_type>' columns. Missing
            values may be NaN or the empty string.
        metric_type: Column suffix of the metric to analyse. "AC" is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        A ``plotly.graph_objects.Figure``, or the result of
        ``create_empty_chart`` when no model/domain pair has usable data.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']
    avg_col = f'Avg {metric_type}'

    # Prepare one record per (model, domain) pair that has numeric data
    data = []
    for _, model in df.iterrows():
        if model['Model'] == '':
            continue
        model_avg = pd.to_numeric(model[avg_col], errors='coerce')
        if pd.isna(model_avg):
            continue
        for domain in domains:
            domain_col = f'{domain} {metric_type}'
            if domain_col not in model or model[domain_col] == '':
                continue
            domain_val = pd.to_numeric(model[domain_col], errors='coerce')
            if pd.isna(domain_val):
                continue
            data.append({
                'Model': model['Model'],
                'Domain': domain,
                'Performance': domain_val,
                # Specialization strength: deviation from the model average
                'Specialization': domain_val - model_avg,
                'Model Type': model['Model Type']
            })

    if not data:
        return create_empty_chart("No domain specialization data available")

    df_plot = pd.DataFrame(data)

    # Create bubble chart
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_plot['Domain'],
        y=df_plot['Model'],
        mode='markers',
        marker=dict(
            size=df_plot['Performance'] * 30,  # Size based on absolute performance
            color=df_plot['Specialization'],
            colorscale=[[0, '#1098F7'], [0.5, '#F5F6F7'], [1, '#E35454']],
            # Pin the neutral middle colour to zero deviation so blue always
            # means below the model's average and red above it.
            cmid=0,
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Specialization<br>Strength",
                    font=dict(color="#F5F6F7")
                ),
                tickfont=dict(color="#94A3B8"),
                bgcolor="rgba(1, 9, 26, 0.8)",
                bordercolor="rgba(245, 246, 247, 0.2)",
                borderwidth=1
            ),
            line=dict(width=2, color='#01091A'),
            opacity=0.8
        ),
        text=[f"Performance: {p:.3f}<br>Specialization: {s:+.3f}"
              for p, s in zip(df_plot['Performance'], df_plot['Specialization'])],
        hovertemplate="<b>%{y}</b><br>" +
                      "Domain: %{x}<br>" +
                      "%{text}<br>" +
                      "<extra></extra>"
    ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Domain Specialization Matrix: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text="<b>Business Domains</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        yaxis=dict(
            title=dict(
                text="<b>Models</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=11, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=1100,
        width=1450,
        margin=dict(t=100, b=80, l=220, r=120)
    )
    return fig
def create_performance_gap_analysis(df, metric_type="AC"):
    """Create a range plot showing the spread of a metric within each domain.

    For every business domain the column ``"<Domain> <metric_type>"`` is
    coerced to numeric (cells that cannot be parsed are dropped) and
    summarised by its minimum, first quartile, median, third quartile and
    maximum. Domains are ordered by their gap (max - min), ascending.

    Args:
        df: DataFrame that may contain columns named
            ``"<Domain> <metric_type>"`` for Banking, Healthcare, Insurance,
            Investment and Telecom. Missing columns are skipped.
        metric_type: Column suffix to analyse. ``"AC"`` is titled
            "Action Completion"; any other value is titled
            "Tool Selection Quality".

    Returns:
        A plotly figure with, per domain, a full-range line, an IQR outline,
        a median marker (the only hoverable trace) and min/max markers.
        When no domain yields a numeric value, the result of
        ``create_empty_chart`` is returned instead.
    """
    domains = ['Banking', 'Healthcare', 'Insurance', 'Investment', 'Telecom']

    # Summarise the distribution of each domain column that is present.
    gap_data = []
    for domain in domains:
        domain_col = f'{domain} {metric_type}'
        if domain_col not in df.columns:
            continue
        domain_values = pd.to_numeric(df[domain_col], errors='coerce').dropna()
        if domain_values.empty:
            continue
        # Compute the extremes once; they feed both Min/Max and Gap.
        lowest = domain_values.min()
        highest = domain_values.max()
        gap_data.append({
            'Domain': domain,
            'Min': lowest,
            'Max': highest,
            'Median': domain_values.median(),
            'Q1': domain_values.quantile(0.25),
            'Q3': domain_values.quantile(0.75),
            'Gap': highest - lowest
        })

    if not gap_data:
        return create_empty_chart("No data available for gap analysis")

    df_gap = pd.DataFrame(gap_data).sort_values('Gap', ascending=True)
    # Materialise the sorted rows once; both drawing passes reuse them.
    rows = df_gap.to_dict('records')

    fig = go.Figure()

    # First pass: range line, IQR outline and median marker per domain.
    for row in rows:
        domain = row['Domain']

        # Full range line (min -> max).
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[domain, domain],
            mode='lines',
            line=dict(color='#64748B', width=2),
            showlegend=False,
            hoverinfo='skip'
        ))

        # IQR box.
        # NOTE(review): all five vertices share the same y category, so this
        # polygon has zero height - presumably only its outline is visible;
        # confirm the rendering is what was intended.
        fig.add_trace(go.Scatter(
            x=[row['Q1'], row['Q3'], row['Q3'], row['Q1'], row['Q1']],
            y=[domain] * 5,
            fill='toself',
            fillcolor='rgba(227, 84, 84, 0.3)',
            line=dict(color='#E35454', width=2),
            showlegend=False,
            hoverinfo='skip',
            mode='lines'
        ))

        # Median marker; carries the hover text for the whole domain.
        hover_text = (
            f"<b>{domain}</b><br>"
            f"Min: {row['Min']:.3f}<br>"
            f"Q1: {row['Q1']:.3f}<br>"
            f"Median: {row['Median']:.3f}<br>"
            f"Q3: {row['Q3']:.3f}<br>"
            f"Max: {row['Max']:.3f}<br>"
            f"Gap: {row['Gap']:.3f}<br>"
            "<extra></extra>"
        )
        fig.add_trace(go.Scatter(
            x=[row['Median']],
            y=[domain],
            mode='markers',
            marker=dict(
                size=12,
                color='#E35454',
                symbol='diamond',
                line=dict(width=2, color='#01091A')
            ),
            showlegend=False,
            hovertemplate=hover_text
        ))

    # Second pass: min/max points are added after every other trace, which
    # keeps the original trace order.
    for row in rows:
        fig.add_trace(go.Scatter(
            x=[row['Min'], row['Max']],
            y=[row['Domain'], row['Domain']],
            mode='markers',
            marker=dict(size=8, color='#F5F6F7', line=dict(width=2, color='#01091A')),
            showlegend=False,
            hoverinfo='skip'
        ))

    metric_display = "Action Completion" if metric_type == "AC" else "Tool Selection Quality"

    fig.update_layout(
        title=dict(
            text=f"<b>Performance Gap Analysis by Domain: {metric_display}</b>",
            x=0.5,
            y=0.97,
            font=dict(size=22, family="'Geist', sans-serif", color="#F5F6F7", weight=700)
        ),
        xaxis=dict(
            title=dict(
                text=f"<b>{metric_display} Score</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=12, color="#94A3B8"),
            gridcolor="rgba(245, 246, 247, 0.1)",
            # AC and TSQ get a fixed 0-1 axis; other metrics autoscale.
            range=[0, 1] if metric_type in ['AC', 'TSQ'] else None
        ),
        yaxis=dict(
            title=dict(
                text="<b>Business Domain</b>",
                font=dict(size=16, color="#F5F6F7")
            ),
            tickfont=dict(size=13, color="#F5F6F7"),
            gridcolor="rgba(245, 246, 247, 0.1)"
        ),
        paper_bgcolor="#01091A",
        plot_bgcolor="rgba(245, 246, 247, 0.02)",
        height=800,
        width=1450,
        margin=dict(t=100, b=80, l=140, r=80),
        showlegend=False
    )

    # Manual legend (the built-in one is disabled above). Glyphs are written
    # as escapes so they survive re-encoding: U+25C6 diamond for the median,
    # U+25AC bar for the IQR, U+2500 line for the full range.
    fig.add_annotation(
        text="\u25C6 Median \u25AC IQR \u2500 Full Range",
        xref="paper", yref="paper",
        x=0.98, y=0.02,
        xanchor='right', yanchor='bottom',
        font=dict(size=12, color='#94A3B8'),
        showarrow=False
    )

    return fig
| def create_empty_chart(message): | |
| """Create an empty chart with a message""" | |
| fig = go.Figure() | |
| fig.add_annotation( | |
| text=f"π {message}", | |
| xref="paper", yref="paper", | |
| x=0.5, y=0.5, | |
| xanchor='center', yanchor='middle', | |
| font=dict( | |
| size=18, | |
| color="#94A3B8", | |
| family="'Geist', sans-serif" | |
| ), | |
| showarrow=False, | |
| bgcolor="rgba(245, 246, 247, 0.05)", | |
| bordercolor="rgba(245, 246, 247, 0.2)", | |
| borderwidth=1, | |
| borderpad=20 | |
| ) | |
| fig.update_layout( | |
| paper_bgcolor="#01091A", | |
| plot_bgcolor="rgba(245, 246, 247, 0.02)", | |
| height=700, | |
| width=1450, | |
| margin=dict(t=80, b=80, l=80, r=80) | |
| ) |