Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Pratik Bhavsar
committed on
Commit
·
a64af65
1
Parent(s):
5172b6b
added 2 models
Browse files
- results.csv +19 -17
- tabs/leaderboard.py +9 -94
- utils.py +31 -3
results.csv
CHANGED
@@ -1,17 +1,19 @@
|
|
1 |
-
Model,Model Type,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
|
2 |
-
gemini-2.0-flash-exp,Private,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
|
3 |
-
gpt-4o-2024-11-20,Private,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
|
4 |
-
gemini-1.5-flash,Private,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
|
5 |
-
gemini-1.5-pro,Private,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
|
6 |
-
o1-2024-12-17,Private,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
claude-3-5-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
1 |
+
Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
|
2 |
+
gemini-2.0-flash-exp,Private,Normal,Google,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
|
3 |
+
gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
|
4 |
+
gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
|
5 |
+
gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
|
6 |
+
o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
|
7 |
+
o3-mini-2025-01-31,Private,Reasoning,OpenAI,1.1,4.4,0.847,0.80,0.90,0.87,0.91,0.84,0.72,0.93,0.98,0.63,0.85,0.97,0.84,1,0.43,0.91,0.975
|
8 |
+
mistral-small-2501,Open source,Normal,Mistral,0.9,0.9,0.832,0.88,0.78,0.83,0.78,0.92,0.97,0.76,0.99,0.62,0.8,0.82,0.77,0.95,0.92,0.74,0.775
|
9 |
+
gpt-4o-mini,Private,Normal,OpenAI,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835
|
10 |
+
qwen2.5-72b-instruct,Open source,Normal,Alibaba,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95
|
11 |
+
mistral-large-2411,Private,Normal,Mistral,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725
|
12 |
+
claude-3-5-sonnet-20241022,Private,Normal,Anthropic,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955
|
13 |
+
Llama-3.3-70B-Instruct-Turbo,Open source,Normal,Meta,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
|
14 |
+
claude-3-5-haiku-20241022,Private,Normal,Anthropic,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
|
15 |
+
mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
|
16 |
+
ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
|
17 |
+
Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
|
18 |
+
open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
|
19 |
+
,,,,,,,0.83,0.79,0.81,0.78,0.76,0.89,0.80,0.96,0.60,0.81,0.82,0.82,0.92,0.85,0.73,0.80
|
tabs/leaderboard.py
CHANGED
@@ -5,98 +5,12 @@ from visualization import (
|
|
5 |
get_performance_chart,
|
6 |
get_performance_cost_chart,
|
7 |
)
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
|
15 |
-
3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
|
16 |
-
}
|
17 |
-
|
18 |
-
if rank in badge_styles:
|
19 |
-
label, gradient, text_color = badge_styles[rank]
|
20 |
-
return f"""
|
21 |
-
<div style="
|
22 |
-
display: inline-flex;
|
23 |
-
align-items: center;
|
24 |
-
justify-content: center;
|
25 |
-
min-width: 48px;
|
26 |
-
padding: 4px 12px;
|
27 |
-
background: {gradient};
|
28 |
-
color: {text_color};
|
29 |
-
border-radius: 6px;
|
30 |
-
font-weight: 600;
|
31 |
-
font-size: 0.9em;
|
32 |
-
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
|
33 |
-
">
|
34 |
-
{label}
|
35 |
-
</div>
|
36 |
-
"""
|
37 |
-
return f"""
|
38 |
-
<div style="
|
39 |
-
display: inline-flex;
|
40 |
-
align-items: center;
|
41 |
-
justify-content: center;
|
42 |
-
min-width: 28px;
|
43 |
-
color: #a1a1aa;
|
44 |
-
font-weight: 500;
|
45 |
-
">
|
46 |
-
{rank}
|
47 |
-
</div>
|
48 |
-
"""
|
49 |
-
|
50 |
-
|
51 |
-
def get_type_badge(model_type):
|
52 |
-
"""Generate HTML for model type badge"""
|
53 |
-
colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
|
54 |
-
bg_color = colors.get(model_type, "#4F46E5")
|
55 |
-
return f"""
|
56 |
-
<div style="
|
57 |
-
display: inline-flex;
|
58 |
-
align-items: center;
|
59 |
-
padding: 4px 8px;
|
60 |
-
background: {bg_color};
|
61 |
-
color: white;
|
62 |
-
border-radius: 4px;
|
63 |
-
font-size: 0.85em;
|
64 |
-
font-weight: 500;
|
65 |
-
">
|
66 |
-
{model_type}
|
67 |
-
</div>
|
68 |
-
"""
|
69 |
-
|
70 |
-
|
71 |
-
def get_score_bar(score):
|
72 |
-
"""Generate HTML for score bar"""
|
73 |
-
width = score * 100
|
74 |
-
return f"""
|
75 |
-
<div style="display: flex; align-items: center; gap: 12px; width: 100%;">
|
76 |
-
<div style="
|
77 |
-
flex-grow: 1;
|
78 |
-
height: 6px;
|
79 |
-
background: var(--score-bg, rgba(255, 255, 255, 0.1));
|
80 |
-
border-radius: 3px;
|
81 |
-
overflow: hidden;
|
82 |
-
max-width: 200px;
|
83 |
-
">
|
84 |
-
<div style="
|
85 |
-
width: {width}%;
|
86 |
-
height: 100%;
|
87 |
-
background: var(--accent-color, #4F46E5);
|
88 |
-
border-radius: 3px;
|
89 |
-
"></div>
|
90 |
-
</div>
|
91 |
-
<span style="
|
92 |
-
font-family: 'SF Mono', monospace;
|
93 |
-
font-weight: 600;
|
94 |
-
color: var(--text-primary, #ffffff);
|
95 |
-
min-width: 60px;
|
96 |
-
">{score:.3f}</span>
|
97 |
-
</div>
|
98 |
-
"""
|
99 |
-
|
100 |
|
101 |
def filter_leaderboard(df, model_type, category, sort_by):
|
102 |
filtered_df = df.copy()
|
@@ -104,8 +18,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
|
|
104 |
filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
|
105 |
|
106 |
dataset_columns = CATEGORIES.get(category, ["Model Avg"])
|
107 |
-
|
108 |
-
filtered_df["Category Score"] = avg_score
|
109 |
|
110 |
if sort_by == "Performance":
|
111 |
filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
|
@@ -204,6 +117,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
|
|
204 |
<th>Rank</th>
|
205 |
<th>Model</th>
|
206 |
<th>Type</th>
|
|
|
207 |
<th>Cost (I/O)</th>
|
208 |
<th>Category Score</th>
|
209 |
</tr>
|
@@ -217,6 +131,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
|
|
217 |
<td>{get_rank_badge(row['Rank'])}</td>
|
218 |
<td class="model-cell">{row['Model']}</td>
|
219 |
<td>{get_type_badge(row['Model Type'])}</td>
|
|
|
220 |
<td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
|
221 |
<td class="score-cell">{get_score_bar(row['Category Score'])}</td>
|
222 |
</tr>
|
|
|
5 |
get_performance_chart,
|
6 |
get_performance_cost_chart,
|
7 |
)
|
8 |
+
from utils import (
|
9 |
+
get_rank_badge,
|
10 |
+
get_score_bar,
|
11 |
+
get_type_badge,
|
12 |
+
get_output_type_badge,
|
13 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
def filter_leaderboard(df, model_type, category, sort_by):
|
16 |
filtered_df = df.copy()
|
|
|
18 |
filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
|
19 |
|
20 |
dataset_columns = CATEGORIES.get(category, ["Model Avg"])
|
21 |
+
filtered_df["Category Score"] = filtered_df[dataset_columns].mean(axis=1)
|
|
|
22 |
|
23 |
if sort_by == "Performance":
|
24 |
filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
|
|
|
117 |
<th>Rank</th>
|
118 |
<th>Model</th>
|
119 |
<th>Type</th>
|
120 |
+
<th>Vendor</th>
|
121 |
<th>Cost (I/O)</th>
|
122 |
<th>Category Score</th>
|
123 |
</tr>
|
|
|
131 |
<td>{get_rank_badge(row['Rank'])}</td>
|
132 |
<td class="model-cell">{row['Model']}</td>
|
133 |
<td>{get_type_badge(row['Model Type'])}</td>
|
134 |
+
<td class="vendor-cell">{row['Vendor']}</td>
|
135 |
<td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
|
136 |
<td class="score-cell">{get_score_bar(row['Category Score'])}</td>
|
137 |
</tr>
|
utils.py
CHANGED
@@ -67,7 +67,7 @@ def get_score_bar(score):
|
|
67 |
<div style="
|
68 |
flex-grow: 1;
|
69 |
height: 6px;
|
70 |
-
background: rgba(255, 255, 255, 0.1);
|
71 |
border-radius: 3px;
|
72 |
overflow: hidden;
|
73 |
max-width: 200px;
|
@@ -75,15 +75,43 @@ def get_score_bar(score):
|
|
75 |
<div style="
|
76 |
width: {width}%;
|
77 |
height: 100%;
|
78 |
-
background: #4F46E5;
|
79 |
border-radius: 3px;
|
80 |
"></div>
|
81 |
</div>
|
82 |
<span style="
|
83 |
font-family: 'SF Mono', monospace;
|
84 |
font-weight: 600;
|
85 |
-
color: #ffffff;
|
86 |
min-width: 60px;
|
87 |
">{score:.3f}</span>
|
88 |
</div>
|
89 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
<div style="
|
68 |
flex-grow: 1;
|
69 |
height: 6px;
|
70 |
+
background: var(--score-bg, rgba(255, 255, 255, 0.1));
|
71 |
border-radius: 3px;
|
72 |
overflow: hidden;
|
73 |
max-width: 200px;
|
|
|
75 |
<div style="
|
76 |
width: {width}%;
|
77 |
height: 100%;
|
78 |
+
background: var(--accent-color, #4F46E5);
|
79 |
border-radius: 3px;
|
80 |
"></div>
|
81 |
</div>
|
82 |
<span style="
|
83 |
font-family: 'SF Mono', monospace;
|
84 |
font-weight: 600;
|
85 |
+
color: var(--text-primary, #ffffff);
|
86 |
min-width: 60px;
|
87 |
">{score:.3f}</span>
|
88 |
</div>
|
89 |
"""
|
90 |
+
|
91 |
+
|
92 |
+
def get_output_type_badge(output_type):
|
93 |
+
"""Generate HTML for output type badges with different colors, supporting both light and dark themes"""
|
94 |
+
type_styles = {
|
95 |
+
"Normal": {
|
96 |
+
"light": {"bg": "#F3F4F6", "color": "#374151"},
|
97 |
+
"dark": {"bg": "#374151", "color": "#F3F4F6"},
|
98 |
+
},
|
99 |
+
"Reasoning": {
|
100 |
+
"light": {"bg": "#DBEAFE", "color": "#1E40AF"},
|
101 |
+
"dark": {"bg": "#1E40AF", "color": "#DBEAFE"},
|
102 |
+
},
|
103 |
+
}
|
104 |
+
|
105 |
+
style = type_styles.get(output_type, type_styles["Normal"])
|
106 |
+
return f"""
|
107 |
+
<span style="
|
108 |
+
background: var(--bg-color, {style['light']['bg']});
|
109 |
+
color: var(--text-color, {style['light']['color']});
|
110 |
+
padding: 4px 8px;
|
111 |
+
border-radius: 4px;
|
112 |
+
font-size: 0.875rem;
|
113 |
+
font-weight: 500;
|
114 |
+
">
|
115 |
+
{output_type}
|
116 |
+
</span>
|
117 |
+
"""
|