pratikbhavsar committed
Commit 59bf951 · Parent: b189b91

added gemini 2.5 and llama 4

Files changed (48)
  1. data_loader.py +6 -6
  2. output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_irrelevance.parquet +3 -0
  3. output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_base_multi_func_call.parquet +3 -0
  4. output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_base_single_func_call.parquet +3 -0
  5. output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_composite.parquet +3 -0
  6. output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_long_context.parquet +3 -0
  7. output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_miss_func.parquet +3 -0
  8. output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_miss_param.parquet +3 -0
  9. output/Llama-4-Maverick-17B-128E-Instruct-FP8/tau_long_context.parquet +3 -0
  10. output/Llama-4-Maverick-17B-128E-Instruct-FP8/toolace_single_func_call_1.parquet +3 -0
  11. output/Llama-4-Maverick-17B-128E-Instruct-FP8/toolace_single_func_call_2.parquet +3 -0
  12. output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_multiple_tool_multiple_call.parquet +3 -0
  13. output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_multiple_tool_single_call.parquet +3 -0
  14. output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_single_tool_multiple_call.parquet +3 -0
  15. output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_single_tool_single_call.parquet +3 -0
  16. output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_tool_miss.parquet +3 -0
  17. output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_irrelevance.parquet +3 -0
  18. output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_base_multi_func_call.parquet +3 -0
  19. output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_base_single_func_call.parquet +3 -0
  20. output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_composite.parquet +3 -0
  21. output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_long_context.parquet +3 -0
  22. output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_miss_func.parquet +3 -0
  23. output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_miss_param.parquet +3 -0
  24. output/Llama-4-Scout-17B-16E-Instruct/tau_long_context.parquet +3 -0
  25. output/Llama-4-Scout-17B-16E-Instruct/toolace_single_func_call_1.parquet +3 -0
  26. output/Llama-4-Scout-17B-16E-Instruct/toolace_single_func_call_2.parquet +3 -0
  27. output/Llama-4-Scout-17B-16E-Instruct/xlam_multiple_tool_multiple_call.parquet +3 -0
  28. output/Llama-4-Scout-17B-16E-Instruct/xlam_multiple_tool_single_call.parquet +3 -0
  29. output/Llama-4-Scout-17B-16E-Instruct/xlam_single_tool_multiple_call.parquet +3 -0
  30. output/Llama-4-Scout-17B-16E-Instruct/xlam_single_tool_single_call.parquet +3 -0
  31. output/Llama-4-Scout-17B-16E-Instruct/xlam_tool_miss.parquet +3 -0
  32. output/gemini-2.5-pro-preview-03-25/BFCL_v3_irrelevance.parquet +3 -0
  33. output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_base_multi_func_call.parquet +3 -0
  34. output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_base_single_func_call.parquet +3 -0
  35. output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_composite.parquet +3 -0
  36. output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_long_context.parquet +3 -0
  37. output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_miss_func.parquet +3 -0
  38. output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_miss_param.parquet +3 -0
  39. output/gemini-2.5-pro-preview-03-25/tau_long_context.parquet +3 -0
  40. output/gemini-2.5-pro-preview-03-25/toolace_single_func_call_1.parquet +3 -0
  41. output/gemini-2.5-pro-preview-03-25/toolace_single_func_call_2.parquet +3 -0
  42. output/gemini-2.5-pro-preview-03-25/xlam_multiple_tool_multiple_call.parquet +3 -0
  43. output/gemini-2.5-pro-preview-03-25/xlam_multiple_tool_single_call.parquet +3 -0
  44. output/gemini-2.5-pro-preview-03-25/xlam_single_tool_multiple_call.parquet +3 -0
  45. output/gemini-2.5-pro-preview-03-25/xlam_single_tool_single_call.parquet +3 -0
  46. output/gemini-2.5-pro-preview-03-25/xlam_tool_miss.parquet +3 -0
  47. results.csv +4 -1
  48. tabs/leaderboard.py +2 -2
data_loader.py CHANGED
@@ -646,10 +646,10 @@ HEADER_CONTENT = (
 
 CARDS = """ <div class="metrics-grid">
 <div class="metric-card">
- <div class="metric-number metric-blue">25</div>
+ <div class="metric-number metric-blue">28</div>
 <div class="metric-label">Total Models</div>
- <div class="metric-detail primary">19 Private</div>
- <div class="metric-detail primary">6 Open Source</div>
+ <div class="metric-detail primary">20 Private</div>
+ <div class="metric-detail primary">8 Open Source</div>
 </div>
 
 <div class="metric-card">
@@ -1045,7 +1045,7 @@ METHODOLOGY = """
 <tbody>
 <tr>
 <td>Performance Champion</td>
- <td>Claude 3.7 Sonnet(0.953) comes at the top but Gemini-2.0-flash(0.938) & Gemini-2.0-flash-lite(0.933) dominate at a very affordable cost, excelling in both complex tasks and safety features.</td>
+ <td>Claude 3.7 Sonnet comes at the top but Gemini models dominate at a very affordable cost, excelling in both complex tasks and safety features.</td>
 </tr>
 <tr>
 <td>Price-Performance Paradox</td>
@@ -1320,8 +1320,8 @@ evaluate_handler.finish()
 </div>
 <h3 class="feature-title">Updated Periodically</h3>
 <ul class="feature-list">
- <li>19 private models evaluated</li>
- <li>6 open source models included</li>
+ <li>20 private models evaluated</li>
+ <li>8 open source models included</li>
 <li>Monthly model additions</li>
 </ul>
 </div>
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_irrelevance.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02cb26bb44497965c398d84ccaec97e563f3459e89ec2d4677124b6fa7f393b8
+ size 36119
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aecff94a1dce47b8748e3f21d3e1bc9c6c43b9517e72856b5bdbf7a2915f19b0
+ size 24922
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:74423c33ba802e73ceb4a2b4d4d7534e1732342bdde93df6a2422cc1120b19d7
+ size 21012
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_composite.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc56603045d6cc6cd71d610b4c087112a93edb1acd20a75cb650f7fe3e50b999
+ size 42393
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_long_context.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:789add35bce05c67c7eea5fb2b6793eb576f0a849bde71acad842b81c1f001f1
+ size 37585
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_miss_func.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4890dbf795c05382a4b3c98f630001e996adce7b00aaa5af79239971ec95fa5e
+ size 41670
output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_multi_turn_miss_param.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7be236e67c891e045e5ba141ebfcfc698e01dc11bde9031970b47133bd261684
+ size 42734
output/Llama-4-Maverick-17B-128E-Instruct-FP8/tau_long_context.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e5d94d45a51c1059438b93e569b1784a25d063061c2d2f7631deffcdac6eedd
+ size 43679
output/Llama-4-Maverick-17B-128E-Instruct-FP8/toolace_single_func_call_1.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8ccd92df79f1b6e150dfd8beb1a3eab6d0477c9844177f6378f9ecddb3e57b2
+ size 16111
output/Llama-4-Maverick-17B-128E-Instruct-FP8/toolace_single_func_call_2.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6632846c6e5acb8efe3a641157393b4d1243f2258e3de9ea7b734b4ce79f54b7
+ size 12208
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_multiple_tool_multiple_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:53798c43389a89012cc5ab3b8d9bf543f4dfd5ae04682856ed97f0ff209e5fab
+ size 106727
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_multiple_tool_single_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1338ecd75c5987ca5f447cc216b1448df54a5e778052c448bb14daf7d4f216c9
+ size 39563
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_single_tool_multiple_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92a50804042fbbf689542b7e095b12b450767e3b6934a02bd528d248d81f852e
+ size 31406
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_single_tool_single_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8bc4cf2ed6a5e1f996f972df5f4e875621ce1e8c901da39e13f9a901715c648c
+ size 44549
output/Llama-4-Maverick-17B-128E-Instruct-FP8/xlam_tool_miss.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c22ea53f1f29f1fb515148caa2740571e55ad77b237f89849d6337f93bcd6d55
+ size 52992
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_irrelevance.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67f87c6121bb037614cc0a8a5d18e0d242461b2993b3cee9cca32aa129854a02
+ size 46288
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee6c5ed66a9a9bf99cb30cfd1d83c71625bb4cc723e980fb5c38bf46d48bad73
+ size 24185
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ef51513bd4ecf18b6544d76e5a876b1ea6348690fab5c5c65a0b45213816806
+ size 22140
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_composite.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dae25b9e4fa5c6f9ccdee504d658c7305b694341af31e3f4239ec9764f8b5fdc
+ size 42456
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_long_context.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a1bd1b36d18b756ca72404f709be520f344e32b088d132cb1b7ab5b951a46f4
+ size 38709
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_miss_func.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e14a98362bd85f9581c79e4abd7f30ee540a85a1058306d4d8dbdc649087c935
+ size 42909
output/Llama-4-Scout-17B-16E-Instruct/BFCL_v3_multi_turn_miss_param.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d79011429b4d72f14fe28081a2dbc4e372ad83795820a8ab7eece0108c83b319
+ size 43349
output/Llama-4-Scout-17B-16E-Instruct/tau_long_context.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:065fced70b9ab190c70e9a1f0f309754bedc661a58922fb8773302ba52ae4d4f
+ size 48909
output/Llama-4-Scout-17B-16E-Instruct/toolace_single_func_call_1.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9a4493a4396c3053b2504e97cd4218fc63bf789a01f056d622ffa8c94c2b4f8
+ size 16843
output/Llama-4-Scout-17B-16E-Instruct/toolace_single_func_call_2.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c04d3ebe45de11b8f19d421a0a9879cd93ad3c5e8d0b1626b6d45f3bc8f6955
+ size 11905
output/Llama-4-Scout-17B-16E-Instruct/xlam_multiple_tool_multiple_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fcd3243554880561378d8f61133d07e6d2d765b060f5bce07e965b3b05ec1be0
+ size 106855
output/Llama-4-Scout-17B-16E-Instruct/xlam_multiple_tool_single_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c06aa2bbed6058d56ad6b37d4345bab5a6c7f827940751445705ccc2c95235f3
+ size 41590
output/Llama-4-Scout-17B-16E-Instruct/xlam_single_tool_multiple_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb0ef8890ee26fc917fa14527de84de167c4f88f5d49bf6b921bd7543d5206ca
+ size 31509
output/Llama-4-Scout-17B-16E-Instruct/xlam_single_tool_single_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e75f613b3a53b7241fd89757240a1637a47c8043b5d2275effce6e2c0a06935
+ size 44403
output/Llama-4-Scout-17B-16E-Instruct/xlam_tool_miss.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5198588e8bf024b0a5042a4c1402366e704813646c34013969642c92a3a5fa3d
+ size 56995
output/gemini-2.5-pro-preview-03-25/BFCL_v3_irrelevance.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:457c7ef44b68bcea4f099c530e100837f0ac698dbb25573524cde06c095e4c83
+ size 32404
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_base_multi_func_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2cb45b5b9e1532732a9826f8c5f5b822c123c5a5a098c6a03dfa3ce1d37154f9
+ size 22298
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_base_single_func_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:347ae541d7e5871bc670c03ac4dcd1d2134ea18621f555b41cf6331b39d1867b
+ size 22024
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_composite.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f636717d7283ac3be9de514135f3b49bf3057c5b69d4a724326790f1136e2e87
+ size 45845
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_long_context.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49e68f80f42a8414e2314a25ba4d947d9903b194420330cef17aaee825ea2796
+ size 37247
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_miss_func.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0056a1a69d7d512c06a0cc3de8c2d2f61a5f4cbba4c31ef3e9706d5fdaf4c5f4
+ size 42532
output/gemini-2.5-pro-preview-03-25/BFCL_v3_multi_turn_miss_param.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02418e817b29e6c2869283b1eea0ebfad68dcbea43967f20b0788e373652a369
+ size 44879
output/gemini-2.5-pro-preview-03-25/tau_long_context.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:38298fd3ea6fc273c121d034cf2531f36595bd0af54d4c6ace888c6e789c6cd6
+ size 55651
output/gemini-2.5-pro-preview-03-25/toolace_single_func_call_1.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8908adc7797c92c4fae8f3267ddc479c1e296fbcd52f9781174f91d365879dbe
+ size 14515
output/gemini-2.5-pro-preview-03-25/toolace_single_func_call_2.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:078e854eb19b158dd1e14f0f51f28f29147b3c3746ef3fb109d3345d3146f179
+ size 10800
output/gemini-2.5-pro-preview-03-25/xlam_multiple_tool_multiple_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ac786bf7a6b33349a8bc536bebdaa55500d8c38f738a8d2002c5a1697706db91
+ size 101337
output/gemini-2.5-pro-preview-03-25/xlam_multiple_tool_single_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:af1effd95afb8fdfc8caf0fd2bb16010a1af449819ed5adc1ef2cb321630b41a
+ size 42167
output/gemini-2.5-pro-preview-03-25/xlam_single_tool_multiple_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1152d938b0a8658d05ec5c8585933327d39708d6407879fe7281a0c16d4ad1b9
+ size 31039
output/gemini-2.5-pro-preview-03-25/xlam_single_tool_single_call.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fffdfb48433c13f2831d6be14718f20c8bb161dac0bf66bfdd685971fdba6d05
+ size 44404
output/gemini-2.5-pro-preview-03-25/xlam_tool_miss.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61b6bec7323bdcef421384e566369eec25e5d9bf99c9b6f094d7c16c64c3c118
+ size 50689
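
Each of the new parquet files above is committed as a Git LFS pointer (the three-line version/oid/size stub), so the actual evaluation outputs only appear after an LFS pull. A minimal sketch of loading one of them, assuming a local clone with git-lfs installed and pandas plus pyarrow available; the path is one of the files added in this commit:

```python
# Sketch: read one of the newly added evaluation outputs after `git lfs pull`.
# Assumes a local clone of the Space and pandas + pyarrow installed.
import pandas as pd

path = "output/Llama-4-Maverick-17B-128E-Instruct-FP8/BFCL_v3_irrelevance.parquet"
df = pd.read_parquet(path)

# Inspect what the evaluation run recorded for this dataset.
print(df.shape)
print(df.columns.tolist())
print(df.head())
```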
results.csv CHANGED
@@ -1,5 +1,6 @@
 Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
 claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
+ gemini-2.5-pro-preview-03-25,Private,Normal,Google,1.25,10,0.941,0.93,0.95,0.95,0.97,0.97,0.82,0.95,0.99,0.89,0.92,1,0.93,1,0.84,0.95,1
 gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
 gemini-2.0-flash-lite-001,Private,Normal,Google,0.075,0.3,0.933,0.96,0.91,0.81,0.98,0.98,0.9,0.91,0.92,0.98,0.86,0.99,0.87,0.97,0.96,0.95,0.975
 mistral-small-2503,Open source,Normal,Mistral,0.1,0.3,0.912,0.93,0.89,0.85,0.93,0.86,0.91,0.9,1,0.83,0.81,0.99,0.87,0.99,0.95,0.9,0.975
@@ -21,7 +22,9 @@ claude-3-5-sonnet-20241022,Private,Normal,Anthropic,3,15,0.801,0.83,0.77,0.68,0.
 Llama-3.3-70B-Instruct-Turbo,Open source,Normal,Meta,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
 claude-3-5-haiku-20241022,Private,Normal,Anthropic,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
 mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
+ Llama-4-Maverick-17B-128E-Instruct-FP8,Open source,Normal,Meta,0.27,0.85,0.741,0.78,0.70,0.77,0.68,0.73,0.95,0.67,0.93,0.43,0.8,0.56,0.81,0.89,0.97,0.65,0.535
 ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
 Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
 open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
+ Llama-4-Scout-17B-16E-Instruct,Open source,Normal,Meta,0.18,0.59,0.629,0.69,0.57,0.73,0.51,0.74,0.94,0.51,0.93,0.25,0.71,0.2,0.72,0.81,0.94,0.49,0.33
- Dataset Avg,,,,,,,0.86,0.82,0.82,0.82,0.81,0.90,0.82,0.96,0.68,0.82,0.87,0.82,0.93,0.88,0.77,0.85
+ Dataset Avg,,,,,,,0.86,0.82,0.82,0.82,0.81,0.90,0.82,0.96,0.68,0.82,0.86,0.82,0.93,0.88,0.77,0.85
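
results.csv stays a flat table: one row per model, fixed metadata columns (vendor, pricing, model type), then per-dataset scores, with a trailing Dataset Avg row. A small sketch of re-deriving the leaderboard ordering from it, assuming only pandas; the column names come from the header above, and the aggregate row is excluded before ranking:

```python
# Sketch: rank models by overall average score, mirroring the leaderboard tab.
# Column names are taken from the CSV header shown in this diff.
import pandas as pd

results = pd.read_csv("results.csv")

# The last row is an aggregate, not a model, so drop it before ranking.
models = results[results["Model"] != "Dataset Avg"].copy()
ranked = models.sort_values("Model Avg", ascending=False)

print(ranked[["Model", "Model Type", "Model Avg"]].head(10))
```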
tabs/leaderboard.py CHANGED
@@ -120,7 +120,7 @@ def filter_leaderboard(df, model_type, category, sort_by):
 
 <div class="note-box">
 <p style="margin: 0; font-size: 1em;">
- Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for Gemini models shown reflects <a href="https://cloud.google.com/vertex-ai/generative-ai/pricing">Vertex AI</a>. Google AI Studio offers <a href="https://ai.google.dev/gemini-api/docs/pricing">Gemini API Access</a> at a lower cost with an API Key.
+ Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for open source models is either from Fireworks or Together.
 
 </p>
 </div>
@@ -189,7 +189,7 @@ def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
 gr.HTML(
 """<div class="note-box">
 <p style="margin: 0; font-size: 1em;">
- Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation.
+ Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. Pricing for open source models is either from Fireworks or Together.
 </p>
 </div>"""
 )
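
The note added in both places refers to a "3-to-1 input/output ratio calculation" for cost sorting without showing the formula. Assuming this means a blended per-million-token price weighted three parts input to one part output (an interpretation, not the Space's verified implementation), the sort key would look like the sketch below, using the prices added to results.csv in this commit:

```python
def blended_cost(input_cost: float, output_cost: float) -> float:
    """Blended $/M tokens, assuming 3 input tokens for every output token."""
    return (3 * input_cost + 1 * output_cost) / 4

# With the per-million-token prices from results.csv:
print(blended_cost(1.25, 10.0))   # gemini-2.5-pro-preview-03-25 -> 3.4375
print(blended_cost(0.27, 0.85))   # Llama-4-Maverick-17B-128E-Instruct-FP8 -> 0.415
print(blended_cost(0.18, 0.59))   # Llama-4-Scout-17B-16E-Instruct -> 0.2825
```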