Commit 59bf951 · Parent(s): b189b91
added gemini 2.5 and llama 4

- data_loader.py +6 -6
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_irrelevance.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_base_multi_func_call.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_base_single_func_call.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_composite.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_long_context.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_miss_func.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_miss_param.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/tau_long_context.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/toolace_single_func_call_1.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/toolace_single_func_call_2.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_multiple_tool_multiple_call.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_multiple_tool_single_call.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_single_tool_multiple_call.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_single_tool_single_call.parquet +3 -0
- output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_tool_miss.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_irrelevance.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_base_multi_func_call.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_base_single_func_call.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_composite.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_long_context.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_miss_func.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_miss_param.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/tau_long_context.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/toolace_single_func_call_1.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/toolace_single_func_call_2.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/xlam_multiple_tool_multiple_call.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/xlam_multiple_tool_single_call.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/xlam_single_tool_multiple_call.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/xlam_single_tool_single_call.parquet +3 -0
- output/Llama-4-Scout-17B-16E-Instruct/xlam_tool_miss.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/BFCL_v3_irrelevance.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_base_multi_func_call.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_base_single_func_call.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_composite.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_long_context.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_miss_func.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_miss_param.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/tau_long_context.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/toolace_single_func_call_1.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/toolace_single_func_call_2.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/xlam_multiple_tool_multiple_call.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/xlam_multiple_tool_single_call.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/xlam_single_tool_multiple_call.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/xlam_single_tool_single_call.parquet +3 -0
- output/gemini-2.5-pro-preview-03-25/xlam_tool_miss.parquet +3 -0
- results.csv +4 -1
- tabs/leaderboard.py +2 -2
data_loader.py
CHANGED
@@ -646,10 +646,10 @@ HEADER_CONTENT = (
 
 CARDS = """ <div class="metrics-grid">
             <div class="metric-card">
-                <div class="metric-number metric-blue">
+                <div class="metric-number metric-blue">28</div>
                 <div class="metric-label">Total Models</div>
-                <div class="metric-detail primary">
-                <div class="metric-detail primary">
+                <div class="metric-detail primary">20 Private</div>
+                <div class="metric-detail primary">8 Open Source</div>
             </div>
 
             <div class="metric-card">
@@ -1045,7 +1045,7 @@ METHODOLOGY = """
             <tbody>
                 <tr>
                     <td>Performance Champion</td>
-                    <td>Claude 3.7 Sonnet
+                    <td>Claude 3.7 Sonnet comes at the top but Gemini models dominate at a very affordable cost, excelling in both complex tasks and safety features.</td>
                 </tr>
                 <tr>
                     <td>Price-Performance Paradox</td>
@@ -1320,8 +1320,8 @@ evaluate_handler.finish()
             </div>
             <h3 class="feature-title">Updated Periodically</h3>
             <ul class="feature-list">
-                <li>
-                <li>
+                <li>20 private models evaluated</li>
+                <li>8 open source models included</li>
                 <li>Monthly model additions</li>
             </ul>
         </div>
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_irrelevance.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02cb26bb44497965c398d84ccaec97e563f3459e89ec2d4677124b6fa7f393b8
+size 36119
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_base_multi_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aecff94a1dce47b8748e3f21d3e1bc9c6c43b9517e72856b5bdbf7a2915f19b0
+size 24922
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_base_single_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74423c33ba802e73ceb4a2b4d4d7534e1732342bdde93df6a2422cc1120b19d7
+size 21012
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_composite.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc56603045d6cc6cd71d610b4c087112a93edb1acd20a75cb650f7fe3e50b999
+size 42393
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:789add35bce05c67c7eea5fb2b6793eb576f0a849bde71acad842b81c1f001f1
+size 37585
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_miss_func.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4890dbf795c05382a4b3c98f630001e996adce7b00aaa5af79239971ec95fa5e
+size 41670
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_miss_param.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7be236e67c891e045e5ba141ebfcfc698e01dc11bde9031970b47133bd261684
+size 42734
output/Llama-4-Maverick-17B-128E-Instruct-FP8/tau_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e5d94d45a51c1059438b93e569b1784a25d063061c2d2f7631deffcdac6eedd
+size 43679
output/Llama-4-Maverick-17B-128E-Instruct-FP8/toolace_single_func_call_1.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8ccd92df79f1b6e150dfd8beb1a3eab6d0477c9844177f6378f9ecddb3e57b2
+size 16111
output/Llama-4-Maverick-17B-128E-Instruct-FP8/toolace_single_func_call_2.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6632846c6e5acb8efe3a641157393b4d1243f2258e3de9ea7b734b4ce79f54b7
+size 12208
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_multiple_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53798c43389a89012cc5ab3b8d9bf543f4dfd5ae04682856ed97f0ff209e5fab
+size 106727
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_multiple_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1338ecd75c5987ca5f447cc216b1448df54a5e778052c448bb14daf7d4f216c9
+size 39563
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_single_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92a50804042fbbf689542b7e095b12b450767e3b6934a02bd528d248d81f852e
+size 31406
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_single_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bc4cf2ed6a5e1f996f972df5f4e875621ce1e8c901da39e13f9a901715c648c
+size 44549
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_tool_miss.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c22ea53f1f29f1fb515148caa2740571e55ad77b237f89849d6337f93bcd6d55
+size 52992
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_irrelevance.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67f87c6121bb037614cc0a8a5d18e0d242461b2993b3cee9cca32aa129854a02
+size 46288
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_base_multi_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee6c5ed66a9a9bf99cb30cfd1d83c71625bb4cc723e980fb5c38bf46d48bad73
+size 24185
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_base_single_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ef51513bd4ecf18b6544d76e5a876b1ea6348690fab5c5c65a0b45213816806
+size 22140
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_composite.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dae25b9e4fa5c6f9ccdee504d658c7305b694341af31e3f4239ec9764f8b5fdc
+size 42456
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a1bd1b36d18b756ca72404f709be520f344e32b088d132cb1b7ab5b951a46f4
+size 38709
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_miss_func.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e14a98362bd85f9581c79e4abd7f30ee540a85a1058306d4d8dbdc649087c935
+size 42909
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_miss_param.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d79011429b4d72f14fe28081a2dbc4e372ad83795820a8ab7eece0108c83b319
+size 43349
output/Llama-4-Scout-17B-16E-Instruct/tau_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:065fced70b9ab190c70e9a1f0f309754bedc661a58922fb8773302ba52ae4d4f
+size 48909
output/Llama-4-Scout-17B-16E-Instruct/toolace_single_func_call_1.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9a4493a4396c3053b2504e97cd4218fc63bf789a01f056d622ffa8c94c2b4f8
+size 16843
output/Llama-4-Scout-17B-16E-Instruct/toolace_single_func_call_2.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c04d3ebe45de11b8f19d421a0a9879cd93ad3c5e8d0b1626b6d45f3bc8f6955
+size 11905
output/Llama-4-Scout-17B-16E-Instruct/xlam_multiple_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcd3243554880561378d8f61133d07e6d2d765b060f5bce07e965b3b05ec1be0
+size 106855
output/Llama-4-Scout-17B-16E-Instruct/xlam_multiple_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c06aa2bbed6058d56ad6b37d4345bab5a6c7f827940751445705ccc2c95235f3
+size 41590
output/Llama-4-Scout-17B-16E-Instruct/xlam_single_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb0ef8890ee26fc917fa14527de84de167c4f88f5d49bf6b921bd7543d5206ca
+size 31509
output/Llama-4-Scout-17B-16E-Instruct/xlam_single_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e75f613b3a53b7241fd89757240a1637a47c8043b5d2275effce6e2c0a06935
+size 44403
output/Llama-4-Scout-17B-16E-Instruct/xlam_tool_miss.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5198588e8bf024b0a5042a4c1402366e704813646c34013969642c92a3a5fa3d
+size 56995
output/gemini-2.5-pro-preview-03-25/BFCL_v3_irrelevance.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:457c7ef44b68bcea4f099c530e100837f0ac698dbb25573524cde06c095e4c83
+size 32404
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_base_multi_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cb45b5b9e1532732a9826f8c5f5b822c123c5a5a098c6a03dfa3ce1d37154f9
+size 22298
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_base_single_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:347ae541d7e5871bc670c03ac4dcd1d2134ea18621f555b41cf6331b39d1867b
+size 22024
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_composite.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f636717d7283ac3be9de514135f3b49bf3057c5b69d4a724326790f1136e2e87
+size 45845
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49e68f80f42a8414e2314a25ba4d947d9903b194420330cef17aaee825ea2796
+size 37247
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_miss_func.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0056a1a69d7d512c06a0cc3de8c2d2f61a5f4cbba4c31ef3e9706d5fdaf4c5f4
+size 42532
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_miss_param.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02418e817b29e6c2869283b1eea0ebfad68dcbea43967f20b0788e373652a369
+size 44879
output/gemini-2.5-pro-preview-03-25/tau_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38298fd3ea6fc273c121d034cf2531f36595bd0af54d4c6ace888c6e789c6cd6
+size 55651
output/gemini-2.5-pro-preview-03-25/toolace_single_func_call_1.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8908adc7797c92c4fae8f3267ddc479c1e296fbcd52f9781174f91d365879dbe
+size 14515
output/gemini-2.5-pro-preview-03-25/toolace_single_func_call_2.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:078e854eb19b158dd1e14f0f51f28f29147b3c3746ef3fb109d3345d3146f179
+size 10800
output/gemini-2.5-pro-preview-03-25/xlam_multiple_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac786bf7a6b33349a8bc536bebdaa55500d8c38f738a8d2002c5a1697706db91
+size 101337
output/gemini-2.5-pro-preview-03-25/xlam_multiple_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af1effd95afb8fdfc8caf0fd2bb16010a1af449819ed5adc1ef2cb321630b41a
+size 42167
output/gemini-2.5-pro-preview-03-25/xlam_single_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1152d938b0a8658d05ec5c8585933327d39708d6407879fe7281a0c16d4ad1b9
+size 31039
output/gemini-2.5-pro-preview-03-25/xlam_single_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fffdfb48433c13f2831d6be14718f20c8bb161dac0bf66bfdd685971fdba6d05
+size 44404
output/gemini-2.5-pro-preview-03-25/xlam_tool_miss.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61b6bec7323bdcef421384e566369eec25e5d9bf99c9b6f094d7c16c64c3c118
+size 50689
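Each of the parquet files above is tracked with Git LFS, so the commit records only the three-line pointer (spec version, sha256 oid, byte size) rather than the binary contents. A minimal sketch for inspecting one of these per-dataset outputs locally, assuming the LFS objects have been fetched (e.g. with git lfs pull) and that pandas plus pyarrow are installed; the parquet schema is not part of this commit, so the script simply prints whatever columns the file carries:

# Sketch: load one newly added evaluation output after the LFS objects are fetched.
# Assumes pandas + pyarrow; the schema is not shown in this commit, so we only
# report what the file actually contains.
import pandas as pd

path = "output/gemini-2.5-pro-preview-03-25/BFCL_v3_irrelevance.parquet"
df = pd.read_parquet(path)

print(df.shape)               # evaluated samples x columns
print(df.columns.tolist())    # whatever fields the evaluation run stored
print(df.head())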
results.csv
CHANGED
@@ -1,5 +1,6 @@
 Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
 claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
+gemini-2.5-pro-preview-03-25,Private,Normal,Google,1.25,10,0.941,0.93,0.95,0.95,0.97,0.97,0.82,0.95,0.99,0.89,0.92,1,0.93,1,0.84,0.95,1
 gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
 gemini-2.0-flash-lite-001,Private,Normal,Google,0.075,0.3,0.933,0.96,0.91,0.81,0.98,0.98,0.9,0.91,0.92,0.98,0.86,0.99,0.87,0.97,0.96,0.95,0.975
 mistral-small-2503,Open source,Normal,Mistral,0.1,0.3,0.912,0.93,0.89,0.85,0.93,0.86,0.91,0.9,1,0.83,0.81,0.99,0.87,0.99,0.95,0.9,0.975
@@ -21,7 +22,9 @@ claude-3-5-sonnet-20241022,Private,Normal,Anthropic,3,15,0.801,0.83,0.77,0.68,0.
 Llama-3.3-70B-Instruct-Turbo,Open source,Normal,Meta,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
 claude-3-5-haiku-20241022,Private,Normal,Anthropic,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
 mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
+Llama-4-Maverick-17B-128E-Instruct-FP8,Open source,Normal,Meta,0.27,0.85,0.741,0.78,0.70,0.77,0.68,0.73,0.95,0.67,0.93,0.43,0.8,0.56,0.81,0.89,0.97,0.65,0.535
 ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
 Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
 open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
-
+Llama-4-Scout-17B-16E-Instruct,Open source,Normal,Meta,0.18,0.59,0.629,0.69,0.57,0.73,0.51,0.74,0.94,0.51,0.93,0.25,0.71,0.2,0.72,0.81,0.94,0.49,0.33
+Dataset Avg,,,,,,,0.86,0.82,0.82,0.82,0.81,0.90,0.82,0.96,0.68,0.82,0.86,0.82,0.93,0.88,0.77,0.85
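The leaderboard ordering implied by the new rows can be reproduced straight from results.csv. A small sketch, assuming only that the file is read with pandas and ranked on the existing "Model Avg" column (the 0.941 entry for gemini-2.5-pro-preview-03-25 should land just below claude-3-7-sonnet-20250219, with the two Llama 4 rows further down):

# Sketch: rank models in results.csv by the "Model Avg" column.
# Assumes pandas is installed; the aggregate "Dataset Avg" row is excluded
# since it is not a model.
import pandas as pd

df = pd.read_csv("results.csv")
models = df[df["Model"] != "Dataset Avg"]
ranked = models.sort_values("Model Avg", ascending=False)

print(ranked[["Model", "Vendor", "Model Avg"]].head(10).to_string(index=False))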
tabs/leaderboard.py
CHANGED
@@ -120,7 +120,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
 
             <div class="note-box">
                 <p style="margin: 0; font-size: 1em;">
-                    Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for
+                    Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for open source models is either from Fireworks or Together.
 
                 </p>
             </div>
@@ -189,7 +189,7 @@ def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
         gr.HTML(
             """<div class="note-box">
                 <p style="margin: 0; font-size: 1em;">
-                    Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
+                    Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for open source models is either from Fireworks or Together.
                 </p>
             </div>"""
         )
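Both edited notes refer to the blended price used when sorting the leaderboard by cost. The exact formula is not part of this diff; a plausible sketch of a 3-to-1 input/output weighting over the per-million-token columns in results.csv would be:

# Sketch of a 3-to-1 input/output blended price per million tokens.
# The weighting below is an assumption based on the note's wording; the actual
# calculation in data_loader.py / tabs/leaderboard.py may differ.
def blended_cost(input_cost: float, output_cost: float) -> float:
    return (3 * input_cost + 1 * output_cost) / 4

# Example: gemini-2.5-pro-preview-03-25 at $1.25 input / $10 output per million tokens
print(blended_cost(1.25, 10))  # -> 3.4375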