Pratik Bhavsar committed on
Commit
a64af65
·
1 Parent(s): 5172b6b

added 2 models

Browse files
Files changed (3) hide show
  1. results.csv +19 -17
  2. tabs/leaderboard.py +9 -94
  3. utils.py +31 -3
results.csv CHANGED
@@ -1,17 +1,19 @@
1
- Model,Model Type,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
- gemini-2.0-flash-exp,Private,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
3
- gpt-4o-2024-11-20,Private,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
4
- gemini-1.5-flash,Private,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
5
- gemini-1.5-pro,Private,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
6
- o1-2024-12-17,Private,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
7
- gpt-4o-mini,Private,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835
8
- qwen2.5-72b-instruct,Open source,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95
9
- mistral-large-2411,Private,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725
10
- claude-3-5-sonnet-20241022,Private,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955
11
- Llama-3.3-70B-Instruct-Turbo,Open source,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
12
- claude-3-5-haiku-20241022,Private,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
13
- mistral-small-2409,Private,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
14
- ministral-8b-2410,Private,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
15
- Meta-Llama-3.1-8B-Instruct-Turbo,Open source,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
16
- open-mistral-nemo-2407,Open source,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
17
- ,,,,,0.82,0.78,0.80,0.77,0.75,0.89,0.79,0.95,0.59,0.80,0.80,0.82,0.91,0.87,0.72,0.79
 
 
 
1
+ Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
+ gemini-2.0-flash-exp,Private,Normal,Google,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
3
+ gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
4
+ gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
5
+ gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
6
+ o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
7
+ o3-mini-2025-01-31,Private,Reasoning,OpenAI,1.1,4.4,0.847,0.80,0.90,0.87,0.91,0.84,0.72,0.93,0.98,0.63,0.85,0.97,0.84,1,0.43,0.91,0.975
8
+ mistral-small-2501,Open source,Normal,Mistral,0.9,0.9,0.832,0.88,0.78,0.83,0.78,0.92,0.97,0.76,0.99,0.62,0.8,0.82,0.77,0.95,0.92,0.74,0.775
9
+ gpt-4o-mini,Private,Normal,OpenAI,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835
10
+ qwen2.5-72b-instruct,Open source,Normal,Alibaba,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95
11
+ mistral-large-2411,Private,Normal,Mistral,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725
12
+ claude-3-5-sonnet-20241022,Private,Normal,Anthropic,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955
13
+ Llama-3.3-70B-Instruct-Turbo,Open source,Normal,Meta,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
14
+ claude-3-5-haiku-20241022,Private,Normal,Anthropic,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
15
+ mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
16
+ ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
17
+ Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
18
+ open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
19
+ ,,,,,,,0.83,0.79,0.81,0.78,0.76,0.89,0.80,0.96,0.60,0.81,0.82,0.82,0.92,0.85,0.73,0.80
tabs/leaderboard.py CHANGED
@@ -5,98 +5,12 @@ from visualization import (
5
  get_performance_chart,
6
  get_performance_cost_chart,
7
  )
8
-
9
-
10
- def get_rank_badge(rank):
11
- """Generate HTML for rank badge with appropriate styling"""
12
- badge_styles = {
13
- 1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
14
- 2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
15
- 3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
16
- }
17
-
18
- if rank in badge_styles:
19
- label, gradient, text_color = badge_styles[rank]
20
- return f"""
21
- <div style="
22
- display: inline-flex;
23
- align-items: center;
24
- justify-content: center;
25
- min-width: 48px;
26
- padding: 4px 12px;
27
- background: {gradient};
28
- color: {text_color};
29
- border-radius: 6px;
30
- font-weight: 600;
31
- font-size: 0.9em;
32
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
33
- ">
34
- {label}
35
- </div>
36
- """
37
- return f"""
38
- <div style="
39
- display: inline-flex;
40
- align-items: center;
41
- justify-content: center;
42
- min-width: 28px;
43
- color: #a1a1aa;
44
- font-weight: 500;
45
- ">
46
- {rank}
47
- </div>
48
- """
49
-
50
-
51
- def get_type_badge(model_type):
52
- """Generate HTML for model type badge"""
53
- colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
54
- bg_color = colors.get(model_type, "#4F46E5")
55
- return f"""
56
- <div style="
57
- display: inline-flex;
58
- align-items: center;
59
- padding: 4px 8px;
60
- background: {bg_color};
61
- color: white;
62
- border-radius: 4px;
63
- font-size: 0.85em;
64
- font-weight: 500;
65
- ">
66
- {model_type}
67
- </div>
68
- """
69
-
70
-
71
- def get_score_bar(score):
72
- """Generate HTML for score bar"""
73
- width = score * 100
74
- return f"""
75
- <div style="display: flex; align-items: center; gap: 12px; width: 100%;">
76
- <div style="
77
- flex-grow: 1;
78
- height: 6px;
79
- background: var(--score-bg, rgba(255, 255, 255, 0.1));
80
- border-radius: 3px;
81
- overflow: hidden;
82
- max-width: 200px;
83
- ">
84
- <div style="
85
- width: {width}%;
86
- height: 100%;
87
- background: var(--accent-color, #4F46E5);
88
- border-radius: 3px;
89
- "></div>
90
- </div>
91
- <span style="
92
- font-family: 'SF Mono', monospace;
93
- font-weight: 600;
94
- color: var(--text-primary, #ffffff);
95
- min-width: 60px;
96
- ">{score:.3f}</span>
97
- </div>
98
- """
99
-
100
 
101
  def filter_leaderboard(df, model_type, category, sort_by):
102
  filtered_df = df.copy()
@@ -104,8 +18,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
104
  filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
105
 
106
  dataset_columns = CATEGORIES.get(category, ["Model Avg"])
107
- avg_score = filtered_df[dataset_columns].mean(axis=1)
108
- filtered_df["Category Score"] = avg_score
109
 
110
  if sort_by == "Performance":
111
  filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
@@ -204,6 +117,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
204
  <th>Rank</th>
205
  <th>Model</th>
206
  <th>Type</th>
 
207
  <th>Cost (I/O)</th>
208
  <th>Category Score</th>
209
  </tr>
@@ -217,6 +131,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
217
  <td>{get_rank_badge(row['Rank'])}</td>
218
  <td class="model-cell">{row['Model']}</td>
219
  <td>{get_type_badge(row['Model Type'])}</td>
 
220
  <td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
221
  <td class="score-cell">{get_score_bar(row['Category Score'])}</td>
222
  </tr>
 
5
  get_performance_chart,
6
  get_performance_cost_chart,
7
  )
8
+ from utils import (
9
+ get_rank_badge,
10
+ get_score_bar,
11
+ get_type_badge,
12
+ get_output_type_badge,
13
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def filter_leaderboard(df, model_type, category, sort_by):
16
  filtered_df = df.copy()
 
18
  filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
19
 
20
  dataset_columns = CATEGORIES.get(category, ["Model Avg"])
21
+ filtered_df["Category Score"] = filtered_df[dataset_columns].mean(axis=1)
 
22
 
23
  if sort_by == "Performance":
24
  filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
 
117
  <th>Rank</th>
118
  <th>Model</th>
119
  <th>Type</th>
120
+ <th>Vendor</th>
121
  <th>Cost (I/O)</th>
122
  <th>Category Score</th>
123
  </tr>
 
131
  <td>{get_rank_badge(row['Rank'])}</td>
132
  <td class="model-cell">{row['Model']}</td>
133
  <td>{get_type_badge(row['Model Type'])}</td>
134
+ <td class="vendor-cell">{row['Vendor']}</td>
135
  <td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
136
  <td class="score-cell">{get_score_bar(row['Category Score'])}</td>
137
  </tr>
utils.py CHANGED
@@ -67,7 +67,7 @@ def get_score_bar(score):
67
  <div style="
68
  flex-grow: 1;
69
  height: 6px;
70
- background: rgba(255, 255, 255, 0.1);
71
  border-radius: 3px;
72
  overflow: hidden;
73
  max-width: 200px;
@@ -75,15 +75,43 @@ def get_score_bar(score):
75
  <div style="
76
  width: {width}%;
77
  height: 100%;
78
- background: #4F46E5;
79
  border-radius: 3px;
80
  "></div>
81
  </div>
82
  <span style="
83
  font-family: 'SF Mono', monospace;
84
  font-weight: 600;
85
- color: #ffffff;
86
  min-width: 60px;
87
  ">{score:.3f}</span>
88
  </div>
89
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  <div style="
68
  flex-grow: 1;
69
  height: 6px;
70
+ background: var(--score-bg, rgba(255, 255, 255, 0.1));
71
  border-radius: 3px;
72
  overflow: hidden;
73
  max-width: 200px;
 
75
  <div style="
76
  width: {width}%;
77
  height: 100%;
78
+ background: var(--accent-color, #4F46E5);
79
  border-radius: 3px;
80
  "></div>
81
  </div>
82
  <span style="
83
  font-family: 'SF Mono', monospace;
84
  font-weight: 600;
85
+ color: var(--text-primary, #ffffff);
86
  min-width: 60px;
87
  ">{score:.3f}</span>
88
  </div>
89
  """
90
+
91
+
92
+ def get_output_type_badge(output_type):
93
+ """Generate HTML for output type badges with different colors, supporting both light and dark themes"""
94
+ type_styles = {
95
+ "Normal": {
96
+ "light": {"bg": "#F3F4F6", "color": "#374151"},
97
+ "dark": {"bg": "#374151", "color": "#F3F4F6"},
98
+ },
99
+ "Reasoning": {
100
+ "light": {"bg": "#DBEAFE", "color": "#1E40AF"},
101
+ "dark": {"bg": "#1E40AF", "color": "#DBEAFE"},
102
+ },
103
+ }
104
+
105
+ style = type_styles.get(output_type, type_styles["Normal"])
106
+ return f"""
107
+ <span style="
108
+ background: var(--bg-color, {style['light']['bg']});
109
+ color: var(--text-color, {style['light']['color']});
110
+ padding: 4px 8px;
111
+ border-radius: 4px;
112
+ font-size: 0.875rem;
113
+ font-weight: 500;
114
+ ">
115
+ {output_type}
116
+ </span>
117
+ """