Spaces:

galileo-ai
/

agent-leaderboard

Running on CPU Upgrade

App Files Files Community

Pratik Bhavsar committed on Feb 5

Commit

a64af65

1 Parent(s): 5172b6b

added 2 models

Browse files

Files changed (3) hide show

results.csv +19 -17
tabs/leaderboard.py +9 -94
utils.py +31 -3

results.csv CHANGED Viewed

@@ -1,17 +1,19 @@
-Model,Model Type,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
-gemini-2.0-flash-exp,Private,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
-gpt-4o-2024-11-20,Private,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
-gemini-1.5-flash,Private,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
-gemini-1.5-pro,Private,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
-o1-2024-12-17,Private,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
-gpt-4o-mini,Private,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835
-qwen2.5-72b-instruct,Open source,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95
-mistral-large-2411,Private,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725
-claude-3-5-sonnet-20241022,Private,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955
-Llama-3.3-70B-Instruct-Turbo,Open source,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
-claude-3-5-haiku-20241022,Private,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
-mistral-small-2409,Private,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
-ministral-8b-2410,Private,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
-Meta-Llama-3.1-8B-Instruct-Turbo,Open source,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
-open-mistral-nemo-2407,Open source,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
-,,,,,0.82,0.78,0.80,0.77,0.75,0.89,0.79,0.95,0.59,0.80,0.80,0.82,0.91,0.87,0.72,0.79

+Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
+gemini-2.0-flash-exp,Private,Normal,Google,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
+gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
+gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
+gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
+o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
+o3-mini-2025-01-31,Private,Reasoning,OpenAI,1.1,4.4,0.847,0.80,0.90,0.87,0.91,0.84,0.72,0.93,0.98,0.63,0.85,0.97,0.84,1,0.43,0.91,0.975
+mistral-small-2501,Open source,Normal,Mistral,0.9,0.9,0.832,0.88,0.78,0.83,0.78,0.92,0.97,0.76,0.99,0.62,0.8,0.82,0.77,0.95,0.92,0.74,0.775
+gpt-4o-mini,Private,Normal,OpenAI,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835
+qwen2.5-72b-instruct,Open source,Normal,Alibaba,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95
+mistral-large-2411,Private,Normal,Mistral,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725
+claude-3-5-sonnet-20241022,Private,Normal,Anthropic,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955
+Llama-3.3-70B-Instruct-Turbo,Open source,Normal,Meta,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
+claude-3-5-haiku-20241022,Private,Normal,Anthropic,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
+mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
+ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
+Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
+open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
+,,,,,,,0.83,0.79,0.81,0.78,0.76,0.89,0.80,0.96,0.60,0.81,0.82,0.82,0.92,0.85,0.73,0.80

tabs/leaderboard.py CHANGED Viewed

@@ -5,98 +5,12 @@ from visualization import (
     get_performance_chart,
     get_performance_cost_chart,
 )
-def get_rank_badge(rank):
-    """Generate HTML for rank badge with appropriate styling"""
-    badge_styles = {
-        1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
-        2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
-        3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
-    }
-    if rank in badge_styles:
-        label, gradient, text_color = badge_styles[rank]
-        return f"""
-            <div style="
-                display: inline-flex;
-                align-items: center;
-                justify-content: center;
-                min-width: 48px;
-                padding: 4px 12px;
-                background: {gradient};
-                color: {text_color};
-                border-radius: 6px;
-                font-weight: 600;
-                font-size: 0.9em;
-                box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
-            ">
-                {label}
-            </div>
-        """
-    return f"""
-        <div style="
-            display: inline-flex;
-            align-items: center;
-            justify-content: center;
-            min-width: 28px;
-            color: #a1a1aa;
-            font-weight: 500;
-        ">
-            {rank}
-        </div>
-    """
-def get_type_badge(model_type):
-    """Generate HTML for model type badge"""
-    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
-    bg_color = colors.get(model_type, "#4F46E5")
-    return f"""
-        <div style="
-            display: inline-flex;
-            align-items: center;
-            padding: 4px 8px;
-            background: {bg_color};
-            color: white;
-            border-radius: 4px;
-            font-size: 0.85em;
-            font-weight: 500;
-        ">
-            {model_type}
-        </div>
-    """
-def get_score_bar(score):
-    """Generate HTML for score bar"""
-    width = score * 100
-    return f"""
-        <div style="display: flex; align-items: center; gap: 12px; width: 100%;">
-            <div style="
-                flex-grow: 1;
-                height: 6px;
-                background: var(--score-bg, rgba(255, 255, 255, 0.1));
-                border-radius: 3px;
-                overflow: hidden;
-                max-width: 200px;
-            ">
-                <div style="
-                    width: {width}%;
-                    height: 100%;
-                    background: var(--accent-color, #4F46E5);
-                    border-radius: 3px;
-                "></div>
-            </div>
-            <span style="
-                font-family: 'SF Mono', monospace;
-                font-weight: 600;
-                color: var(--text-primary, #ffffff);
-                min-width: 60px;
-            ">{score:.3f}</span>
-        </div>
-    """
 def filter_leaderboard(df, model_type, category, sort_by):
     filtered_df = df.copy()
@@ -104,8 +18,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
         filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
     dataset_columns = CATEGORIES.get(category, ["Model Avg"])
-    avg_score = filtered_df[dataset_columns].mean(axis=1)
-    filtered_df["Category Score"] = avg_score
     if sort_by == "Performance":
         filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
@@ -204,6 +117,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
                     <th>Rank</th>
                     <th>Model</th>
                     <th>Type</th>
                     <th>Cost (I/O)</th>
                     <th>Category Score</th>
                 </tr>
@@ -217,6 +131,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
                 <td>{get_rank_badge(row['Rank'])}</td>
                 <td class="model-cell">{row['Model']}</td>
                 <td>{get_type_badge(row['Model Type'])}</td>
                 <td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
                 <td class="score-cell">{get_score_bar(row['Category Score'])}</td>
             </tr>

     get_performance_chart,
     get_performance_cost_chart,
 )
+from utils import (
+    get_rank_badge,
+    get_score_bar,
+    get_type_badge,
+    get_output_type_badge,
+)
 def filter_leaderboard(df, model_type, category, sort_by):
     filtered_df = df.copy()
         filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
     dataset_columns = CATEGORIES.get(category, ["Model Avg"])
+    filtered_df["Category Score"] = filtered_df[dataset_columns].mean(axis=1)
     if sort_by == "Performance":
         filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
                     <th>Rank</th>
                     <th>Model</th>
                     <th>Type</th>
+                    <th>Vendor</th>
                     <th>Cost (I/O)</th>
                     <th>Category Score</th>
                 </tr>
                 <td>{get_rank_badge(row['Rank'])}</td>
                 <td class="model-cell">{row['Model']}</td>
                 <td>{get_type_badge(row['Model Type'])}</td>
+                <td class="vendor-cell">{row['Vendor']}</td>
                 <td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
                 <td class="score-cell">{get_score_bar(row['Category Score'])}</td>
             </tr>

utils.py CHANGED Viewed

@@ -67,7 +67,7 @@ def get_score_bar(score):
             <div style="
                 flex-grow: 1;
                 height: 6px;
-                background: rgba(255, 255, 255, 0.1);
                 border-radius: 3px;
                 overflow: hidden;
                 max-width: 200px;
@@ -75,15 +75,43 @@ def get_score_bar(score):
                 <div style="
                     width: {width}%;
                     height: 100%;
-                    background: #4F46E5;
                     border-radius: 3px;
                 "></div>
             </div>
             <span style="
                 font-family: 'SF Mono', monospace;
                 font-weight: 600;
-                color: #ffffff;
                 min-width: 60px;
             ">{score:.3f}</span>
         </div>
     """

             <div style="
                 flex-grow: 1;
                 height: 6px;
+                background: var(--score-bg, rgba(255, 255, 255, 0.1));
                 border-radius: 3px;
                 overflow: hidden;
                 max-width: 200px;
                 <div style="
                     width: {width}%;
                     height: 100%;
+                    background: var(--accent-color, #4F46E5);
                     border-radius: 3px;
                 "></div>
             </div>
             <span style="
                 font-family: 'SF Mono', monospace;
                 font-weight: 600;
+                color: var(--text-primary, #ffffff);
                 min-width: 60px;
             ">{score:.3f}</span>
         </div>
     """
+def get_output_type_badge(output_type):  # returns an HTML <span> snippet (str) labelled with output_type
+    """Generate HTML for output type badges with different colors, supporting both light and dark themes"""  # NOTE(review): only the "light" palette is interpolated below; dark-theme colours would have to come from the --bg-color / --text-color CSS variables, presumably defined by the host page - confirm
+    type_styles = {  # palette per known output type, keyed by theme name
+        "Normal": {
+            "light": {"bg": "#F3F4F6", "color": "#374151"},
+            "dark": {"bg": "#374151", "color": "#F3F4F6"},  # NOTE(review): never read in this function
+        },
+        "Reasoning": {
+            "light": {"bg": "#DBEAFE", "color": "#1E40AF"},
+            "dark": {"bg": "#1E40AF", "color": "#DBEAFE"},  # NOTE(review): never read in this function
+        },
+    }
+    style = type_styles.get(output_type, type_styles["Normal"])  # unknown types reuse the "Normal" palette; the label still shows the raw output_type
+    return f"""
+        <span style="
+            background: var(--bg-color, {style['light']['bg']});
+            color: var(--text-color, {style['light']['color']});
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 0.875rem;
+            font-weight: 500;
+        ">
+            {output_type}
+        </span>
+    """