Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
601da30
1
Parent(s):
100bb71
added gpt 4.5 and flash lite
Browse files- data_loader.py +5 -5
- get_exp_data.ipynb +2 -4
- output/gemini-2.0-flash-lite-001/BFCL_v3_irrelevance.parquet +3 -0
- output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_base_multi_func_call.parquet +3 -0
- output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_base_single_func_call.parquet +3 -0
- output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_composite.parquet +3 -0
- output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_long_context.parquet +3 -0
- output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_miss_func.parquet +3 -0
- output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_miss_param.parquet +3 -0
- output/gemini-2.0-flash-lite-001/tau_long_context.parquet +3 -0
- output/gemini-2.0-flash-lite-001/toolace_single_func_call_1.parquet +3 -0
- output/gemini-2.0-flash-lite-001/toolace_single_func_call_2.parquet +3 -0
- output/gemini-2.0-flash-lite-001/xlam_multiple_tool_multiple_call.parquet +3 -0
- output/gemini-2.0-flash-lite-001/xlam_multiple_tool_single_call.parquet +3 -0
- output/gemini-2.0-flash-lite-001/xlam_single_tool_multiple_call.parquet +3 -0
- output/gemini-2.0-flash-lite-001/xlam_single_tool_single_call.parquet +3 -0
- output/gemini-2.0-flash-lite-001/xlam_tool_miss.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/BFCL_v3_irrelevance.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_base_multi_func_call.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_base_single_func_call.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_composite.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_long_context.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_miss_func.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_miss_param.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/tau_long_context.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/toolace_single_func_call_1.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/toolace_single_func_call_2.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/xlam_multiple_tool_multiple_call.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/xlam_multiple_tool_single_call.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/xlam_single_tool_multiple_call.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/xlam_single_tool_single_call.parquet +3 -0
- output/gpt-4.5-preview-2025-02-27/xlam_tool_miss.parquet +3 -0
- results.csv +3 -1
data_loader.py
CHANGED
@@ -604,9 +604,9 @@ HEADER_CONTENT = (
|
|
604 |
|
605 |
CARDS = """ <div class="metrics-grid">
|
606 |
<div class="metric-card">
|
607 |
-
<div class="metric-number metric-blue">
|
608 |
<div class="metric-label">Total Models</div>
|
609 |
-
<div class="metric-detail primary">
|
610 |
<div class="metric-detail primary">5 Open Source</div>
|
611 |
</div>
|
612 |
|
@@ -1003,11 +1003,11 @@ METHODOLOGY = """
|
|
1003 |
<tbody>
|
1004 |
<tr>
|
1005 |
<td>Performance Champion</td>
|
1006 |
-
<td>Claude 3.7 Sonnet comes at the top
|
1007 |
</tr>
|
1008 |
<tr>
|
1009 |
<td>Price-Performance Paradox</td>
|
1010 |
-
<td>Top 3 models span 10x price difference yet only
|
1011 |
</tr>
|
1012 |
<tr>
|
1013 |
<td>Open Vs Closed Source</td>
|
@@ -1278,7 +1278,7 @@ evaluate_handler.finish()
|
|
1278 |
</div>
|
1279 |
<h3 class="feature-title">Updated Periodically</h3>
|
1280 |
<ul class="feature-list">
|
1281 |
-
<li>
|
1282 |
<li>5 open source models included</li>
|
1283 |
<li>Monthly model additions</li>
|
1284 |
</ul>
|
|
|
604 |
|
605 |
CARDS = """ <div class="metrics-grid">
|
606 |
<div class="metric-card">
|
607 |
+
<div class="metric-number metric-blue">20</div>
|
608 |
<div class="metric-label">Total Models</div>
|
609 |
+
<div class="metric-detail primary">15 Private</div>
|
610 |
<div class="metric-detail primary">5 Open Source</div>
|
611 |
</div>
|
612 |
|
|
|
1003 |
<tbody>
|
1004 |
<tr>
|
1005 |
<td>Performance Champion</td>
|
1006 |
+
<td>Claude 3.7 Sonnet(0.953) comes at the top but Gemini-2.0-flash(0.938) & Gemini-2.0-flash-lite(0.933) dominate at a very affordable cost, excelling in both complex tasks and safety features.</td>
|
1007 |
</tr>
|
1008 |
<tr>
|
1009 |
<td>Price-Performance Paradox</td>
|
1010 |
+
<td>Top 3 models span 10x price difference yet only 2% performance gap, challenging pricing assumptions</td>
|
1011 |
</tr>
|
1012 |
<tr>
|
1013 |
<td>Open Vs Closed Source</td>
|
|
|
1278 |
</div>
|
1279 |
<h3 class="feature-title">Updated Periodically</h3>
|
1280 |
<ul class="feature-list">
|
1281 |
+
<li>15 private models evaluated</li>
|
1282 |
<li>5 open source models included</li>
|
1283 |
<li>Monthly model additions</li>
|
1284 |
</ul>
|
get_exp_data.ipynb
CHANGED
@@ -22,7 +22,7 @@
|
|
22 |
},
|
23 |
{
|
24 |
"cell_type": "code",
|
25 |
-
"execution_count":
|
26 |
"metadata": {},
|
27 |
"outputs": [],
|
28 |
"source": [
|
@@ -38,8 +38,6 @@
|
|
38 |
" rows = pq.get_rows(\n",
|
39 |
" project_id=PROJECT_ID,\n",
|
40 |
" run_id=run_id,\n",
|
41 |
-
" task_type=None,\n",
|
42 |
-
" config=None,\n",
|
43 |
" starting_token=0,\n",
|
44 |
" limit=1000,\n",
|
45 |
" )\n",
|
@@ -127,7 +125,7 @@
|
|
127 |
" ))\n",
|
128 |
"\n",
|
129 |
"\n",
|
130 |
-
"models = [\"
|
131 |
"# models = load_data()[\"Model\"]\n",
|
132 |
"\n",
|
133 |
"# Process each model sequentially, but datasets in parallel\n",
|
|
|
22 |
},
|
23 |
{
|
24 |
"cell_type": "code",
|
25 |
+
"execution_count": 5,
|
26 |
"metadata": {},
|
27 |
"outputs": [],
|
28 |
"source": [
|
|
|
38 |
" rows = pq.get_rows(\n",
|
39 |
" project_id=PROJECT_ID,\n",
|
40 |
" run_id=run_id,\n",
|
|
|
|
|
41 |
" starting_token=0,\n",
|
42 |
" limit=1000,\n",
|
43 |
" )\n",
|
|
|
125 |
" ))\n",
|
126 |
"\n",
|
127 |
"\n",
|
128 |
+
"models = [\"gpt-4.5-preview-2025-02-27\", \"gemini-2.0-flash-lite-001\"]\n",
|
129 |
"# models = load_data()[\"Model\"]\n",
|
130 |
"\n",
|
131 |
"# Process each model sequentially, but datasets in parallel\n",
|
output/gemini-2.0-flash-lite-001/BFCL_v3_irrelevance.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4af78144ab6253c0b28e82a27ec47194906d6f6e8c7c2b3ea22dbb26baea007
|
3 |
+
size 31688
|
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_base_multi_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2bd548717e0462128ef71d73f4db2f84bcf5432c13654153601f7482eb6693e
|
3 |
+
size 23101
|
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_base_single_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:73a3ef348ec736138ff10ee82975076787e292de5443763e20dfcaf4c5f51e7e
|
3 |
+
size 22091
|
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_composite.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8b6756dffec7df9edcbabdd9572262d39a6435c597d1bedacc2378585ea80647
|
3 |
+
size 40627
|
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:63a9f36658d25493e7b1c46fed8b545553bdd9eef96ae90dd135b7db839d20cf
|
3 |
+
size 38453
|
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_miss_func.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0da60ca0204dfefbd186693c667bf9e97666ba0606ada3f16ff7d136be56856d
|
3 |
+
size 40012
|
output/gemini-2.0-flash-lite-001/BFCL_v3_multi_turn_miss_param.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b8a2189c4cde368a6cdb639c53020d6d7f7898aead9db4b7297c23b94b723ebf
|
3 |
+
size 42217
|
output/gemini-2.0-flash-lite-001/tau_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59807c739c491522a1313dae237716d5dc8950c03dce3ff0dac9938f01ede9f6
|
3 |
+
size 42787
|
output/gemini-2.0-flash-lite-001/toolace_single_func_call_1.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b0a24a558b823692f9bc9a7d9ef2413867ec9592184e01a507872cac650864ad
|
3 |
+
size 15620
|
output/gemini-2.0-flash-lite-001/toolace_single_func_call_2.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4847159a0b653531adaf50b76ea9df3f52c7a1001eab9921504ec43b0698ab71
|
3 |
+
size 12338
|
output/gemini-2.0-flash-lite-001/xlam_multiple_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e1e66d203598dabe245a64e2cfc680351d6eaf8015d427008e96209f6e21882f
|
3 |
+
size 106605
|
output/gemini-2.0-flash-lite-001/xlam_multiple_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e32ef42004873aca15d79c580073d83c8735e0b45a0bb9d0a208d7e6c73c4ddc
|
3 |
+
size 43575
|
output/gemini-2.0-flash-lite-001/xlam_single_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:46c8bc66edbe57ddfd691c3e2d890448c968ac02174695a43416a67ba5a8fa8b
|
3 |
+
size 32770
|
output/gemini-2.0-flash-lite-001/xlam_single_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7571305a5463d251a469b8fb2fa5520025fa28bcae48d1c14f8d330d85de28e7
|
3 |
+
size 47676
|
output/gemini-2.0-flash-lite-001/xlam_tool_miss.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:656d8d4d0781440567acdfbd48de353d3cc709f1c8902fac2b43dbbc2b17ded9
|
3 |
+
size 48849
|
output/gpt-4.5-preview-2025-02-27/BFCL_v3_irrelevance.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:67aa69636b8eb26a5a8f3aee927faf5aacc3ba8cc707a296fc072d006e58cd79
|
3 |
+
size 48953
|
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_base_multi_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c690fcd18516cdd5e87c111159a5247c3e3a515366bfa863560cf80cd8bad3ef
|
3 |
+
size 23801
|
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_base_single_func_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fb2e05d9b936d65b314eeb9bd6075a4045a383faa1f64dddf7aa2977249a3d62
|
3 |
+
size 22711
|
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_composite.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:baa8820f42f293f258baa780e4a5e054e8630c9955280eaf7944f3e9336a15b8
|
3 |
+
size 44267
|
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6944feb423f64be8902471e4d91b229169722a87a45327b563b4ce0ae967e507
|
3 |
+
size 37929
|
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_miss_func.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c64ab23ab939635d646ba39913df7ba067743827e42cb0b253a751f4eba5bb20
|
3 |
+
size 39513
|
output/gpt-4.5-preview-2025-02-27/BFCL_v3_multi_turn_miss_param.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:94aa889540de4dc59c1f163468bd0d707732cff90ae909b3a75461df08fb0a9a
|
3 |
+
size 41747
|
output/gpt-4.5-preview-2025-02-27/tau_long_context.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dfd0d4758fd267b966eaafcb5024c11aa895c50a72fd3770a30cc02edba7b178
|
3 |
+
size 43498
|
output/gpt-4.5-preview-2025-02-27/toolace_single_func_call_1.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45065313e44be9aed1f278544afe3e8587cb1753fe8e52b4dac3f7eaa41d969a
|
3 |
+
size 16054
|
output/gpt-4.5-preview-2025-02-27/toolace_single_func_call_2.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:66f7c5f2442f2a47c04105bb14a4e42b5dd1970a6dededddfad1bafe6a7cd53c
|
3 |
+
size 11713
|
output/gpt-4.5-preview-2025-02-27/xlam_multiple_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd45047dffb8a469385434362adc759ef75acf33beabd195d82530d32c551639
|
3 |
+
size 100718
|
output/gpt-4.5-preview-2025-02-27/xlam_multiple_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ffdf14a08e53074fbdd2ab44924bf2be2cea6f2c391edf4627c8501bb7998b09
|
3 |
+
size 41391
|
output/gpt-4.5-preview-2025-02-27/xlam_single_tool_multiple_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:75aa25f3edc9e75ad9843eee1d68a3d9e6042909293419efea452ee6dac17c37
|
3 |
+
size 30433
|
output/gpt-4.5-preview-2025-02-27/xlam_single_tool_single_call.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ebee96bb89682d03e2a933ae8d4966b01ff2ad7a8bf84608c8a7fd4edad0276
|
3 |
+
size 43927
|
output/gpt-4.5-preview-2025-02-27/xlam_tool_miss.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:298a09d3c28677cabd7bf9c10f0b8247b76a2b1e4e38d221db4af6d2b6fc9141
|
3 |
+
size 54032
|
results.csv
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
|
2 |
claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
|
3 |
gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
|
|
|
4 |
gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
|
|
|
5 |
gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
|
6 |
gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
|
7 |
o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
|
@@ -17,4 +19,4 @@ mistral-small-2409,Private,Normal,Mistral,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,
|
|
17 |
ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
|
18 |
Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
|
19 |
open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
|
20 |
-
Dataset Avg,,,,,,,0.
|
|
|
1 |
Model,Model Type,Model Output Type,Vendor,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
|
2 |
claude-3-7-sonnet-20250219,Private,Reasoning,Anthropic,3,15,0.953,0.96,0.95,0.92,0.96,1,0.95,0.97,1,0.96,0.94,0.97,0.96,0.99,0.82,0.92,0.975
|
3 |
gemini-2.0-flash-001,Private,Normal,Google,0.15,0.6,0.938,0.95,0.93,0.91,0.94,0.9,0.96,0.92,0.95,0.89,0.91,0.98,0.93,0.97,0.98,0.93,0.965
|
4 |
+
gemini-2.0-flash-lite-001,Private,Normal,Google,0.075,0.3,0.933,0.96,0.91,0.81,0.98,0.98,0.9,0.91,0.92,0.98,0.86,0.99,0.87,0.97,0.96,0.95,0.975
|
5 |
gpt-4o-2024-11-20,Private,Normal,OpenAI,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
|
6 |
+
gpt-4.5-preview-2025-02-27,Private,Normal,OpenAI,75,150,0.900,0.93,0.87,0.85,0.91,0.92,0.97,0.92,0.99,0.67,0.85,0.98,0.85,1,0.98,0.8,0.915
|
7 |
gemini-1.5-flash,Private,Normal,Google,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
|
8 |
gemini-1.5-pro,Private,Normal,Google,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
|
9 |
o1-2024-12-17,Private,Reasoning,OpenAI,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
|
|
|
19 |
ministral-8b-2410,Private,Normal,Mistral,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
|
20 |
Meta-Llama-3.1-8B-Instruct-Turbo,Open source,Normal,Meta,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
|
21 |
open-mistral-nemo-2407,Open source,Normal,Mistral,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
|
22 |
+
Dataset Avg,,,,,,,0.84,0.81,0.82,0.81,0.79,0.89,0.82,0.96,0.64,0.82,0.84,0.83,0.93,0.86,0.76,0.82
|