Konstantin Chernyshev committed on
Commit
4790bc5
·
1 Parent(s): ff4f460

chore: add u-math results

Browse files
Files changed (2) hide show
  1. data/u_math_eval_results.json +1246 -9
  2. src/populate.py +11 -3
data/u_math_eval_results.json CHANGED
@@ -1,13 +1,1250 @@
1
  [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  {
3
  "model_name": "gpt-4o-mini-2024-07-18",
4
- "judge_model_name": "gpt-4o-mini-2024-07-18",
5
- "u_math": [0.5123, 0.2345, 0.1234],
6
- "differential_calc": [0.5123, 0.2345, 0.1234],
7
- "integral_calc": [0.43, 0.23, 0.34],
8
- "algebra": [0.98, 0.12, 0.34],
9
- "multivariable_calculus": [0.98, 0.12, 0.34],
10
- "precalculus_review": [0.8412, 0.1234, 0.1234],
11
- "sequences_series": [0.1234, 0.1234, 0.1234]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  }
13
- ]
 
1
  [
2
+ {
3
+ "model_name": "llava-hf/llava-v1.6-mistral-7b-hf",
4
+ "judge_model_name": "gpt-4o-2024-08-06",
5
+ "u_math": [
6
+ 3.2727,
7
+ 3.6667,
8
+ 1.5
9
+ ],
10
+ "algebra": [
11
+ 0.0722,
12
+ 0.0867,
13
+ 0.0
14
+ ],
15
+ "differential_calc": [
16
+ 0.0227,
17
+ 0.0133,
18
+ 0.0429
19
+ ],
20
+ "integral_calc": [
21
+ 0.0,
22
+ 0.0,
23
+ 0.0
24
+ ],
25
+ "multivariable_calculus": [
26
+ 0.0281,
27
+ 0.0333,
28
+ 0.0
29
+ ],
30
+ "precalculus_review": [
31
+ 0.0562,
32
+ 0.06,
33
+ 0.0
34
+ ],
35
+ "sequences_series": [
36
+ 0.026,
37
+ 0.0267,
38
+ 0.0
39
+ ]
40
+ },
41
+ {
42
+ "model_name": "mistralai/Pixtral-12B-2409",
43
+ "judge_model_name": "gpt-4o-2024-08-06",
44
+ "u_math": [
45
+ 15.5455,
46
+ 15.5556,
47
+ 15.5
48
+ ],
49
+ "algebra": [
50
+ 0.4111,
51
+ 0.4467,
52
+ 0.2333
53
+ ],
54
+ "differential_calc": [
55
+ 0.1182,
56
+ 0.0133,
57
+ 0.3429
58
+ ],
59
+ "integral_calc": [
60
+ 0.0048,
61
+ 0.0067,
62
+ 0.0
63
+ ],
64
+ "multivariable_calculus": [
65
+ 0.0281,
66
+ 0.0333,
67
+ 0.0
68
+ ],
69
+ "precalculus_review": [
70
+ 0.3,
71
+ 0.32,
72
+ 0.0
73
+ ],
74
+ "sequences_series": [
75
+ 0.1104,
76
+ 0.1133,
77
+ 0.0
78
+ ]
79
+ },
80
+ {
81
+ "model_name": "meta-llama/Llama-3.2-11B-Vision-Instruct",
82
+ "judge_model_name": "gpt-4o-2024-08-06",
83
+ "u_math": [
84
+ 17.0,
85
+ 18.5556,
86
+ 10.0
87
+ ],
88
+ "algebra": [
89
+ 0.4667,
90
+ 0.54,
91
+ 0.1
92
+ ],
93
+ "differential_calc": [
94
+ 0.0727,
95
+ 0.0133,
96
+ 0.2
97
+ ],
98
+ "integral_calc": [
99
+ 0.0144,
100
+ 0.0133,
101
+ 0.0172
102
+ ],
103
+ "multivariable_calculus": [
104
+ 0.0449,
105
+ 0.0467,
106
+ 0.0357
107
+ ],
108
+ "precalculus_review": [
109
+ 0.4125,
110
+ 0.4333,
111
+ 0.1
112
+ ],
113
+ "sequences_series": [
114
+ 0.0649,
115
+ 0.0667,
116
+ 0.0
117
+ ]
118
+ },
119
+ {
120
+ "model_name": "llava-hf/llava-onevision-qwen2-7b-ov-chat-hf",
121
+ "judge_model_name": "gpt-4o-2024-08-06",
122
+ "u_math": [
123
+ 17.7273,
124
+ 20.6667,
125
+ 4.5
126
+ ],
127
+ "algebra": [
128
+ 0.5167,
129
+ 0.6067,
130
+ 0.0667
131
+ ],
132
+ "differential_calc": [
133
+ 0.0455,
134
+ 0.04,
135
+ 0.0571
136
+ ],
137
+ "integral_calc": [
138
+ 0.0144,
139
+ 0.0133,
140
+ 0.0172
141
+ ],
142
+ "multivariable_calculus": [
143
+ 0.0506,
144
+ 0.0533,
145
+ 0.0357
146
+ ],
147
+ "precalculus_review": [
148
+ 0.4125,
149
+ 0.4333,
150
+ 0.1
151
+ ],
152
+ "sequences_series": [
153
+ 0.0909,
154
+ 0.0933,
155
+ 0.0
156
+ ]
157
+ },
158
+ {
159
+ "model_name": "mistralai/Mathstral-7B-v0.1",
160
+ "judge_model_name": "gpt-4o-2024-08-06",
161
+ "u_math": [
162
+ 18.0,
163
+ 20.6667,
164
+ 6.0
165
+ ],
166
+ "algebra": [
167
+ 0.4389,
168
+ 0.5133,
169
+ 0.0667
170
+ ],
171
+ "differential_calc": [
172
+ 0.0591,
173
+ 0.04,
174
+ 0.1
175
+ ],
176
+ "integral_calc": [
177
+ 0.0144,
178
+ 0.0133,
179
+ 0.0172
180
+ ],
181
+ "multivariable_calculus": [
182
+ 0.073,
183
+ 0.08,
184
+ 0.0357
185
+ ],
186
+ "precalculus_review": [
187
+ 0.4625,
188
+ 0.4867,
189
+ 0.1
190
+ ],
191
+ "sequences_series": [
192
+ 0.1039,
193
+ 0.1067,
194
+ 0.0
195
+ ]
196
+ },
197
+ {
198
+ "model_name": "mistralai/Ministral-8B-Instruct-2410",
199
+ "judge_model_name": "gpt-4o-2024-08-06",
200
+ "u_math": [
201
+ 18.2727,
202
+ 21.4444,
203
+ 4.0
204
+ ],
205
+ "algebra": [
206
+ 0.5222,
207
+ 0.62,
208
+ 0.0333
209
+ ],
210
+ "differential_calc": [
211
+ 0.05,
212
+ 0.0333,
213
+ 0.0857
214
+ ],
215
+ "integral_calc": [
216
+ 0.0096,
217
+ 0.0133,
218
+ 0.0
219
+ ],
220
+ "multivariable_calculus": [
221
+ 0.0562,
222
+ 0.06,
223
+ 0.0357
224
+ ],
225
+ "precalculus_review": [
226
+ 0.4375,
227
+ 0.4667,
228
+ 0.0
229
+ ],
230
+ "sequences_series": [
231
+ 0.0909,
232
+ 0.0933,
233
+ 0.0
234
+ ]
235
+ },
236
+ {
237
+ "model_name": "AI-MO/NuminaMath-7B-CoT",
238
+ "judge_model_name": "gpt-4o-2024-08-06",
239
+ "u_math": [
240
+ 19.1818,
241
+ 22.7778,
242
+ 3.0
243
+ ],
244
+ "algebra": [
245
+ 0.5222,
246
+ 0.6267,
247
+ 0.0
248
+ ],
249
+ "differential_calc": [
250
+ 0.05,
251
+ 0.04,
252
+ 0.0714
253
+ ],
254
+ "integral_calc": [
255
+ 0.0096,
256
+ 0.0133,
257
+ 0.0
258
+ ],
259
+ "multivariable_calculus": [
260
+ 0.0562,
261
+ 0.06,
262
+ 0.0357
263
+ ],
264
+ "precalculus_review": [
265
+ 0.4812,
266
+ 0.5133,
267
+ 0.0
268
+ ],
269
+ "sequences_series": [
270
+ 0.1104,
271
+ 0.1133,
272
+ 0.0
273
+ ]
274
+ },
275
+ {
276
+ "model_name": "Qwen/Qwen2-VL-7B-Instruct",
277
+ "judge_model_name": "gpt-4o-2024-08-06",
278
+ "u_math": [
279
+ 20.3636,
280
+ 21.4444,
281
+ 15.5
282
+ ],
283
+ "algebra": [
284
+ 0.5389,
285
+ 0.6267,
286
+ 0.1
287
+ ],
288
+ "differential_calc": [
289
+ 0.1364,
290
+ 0.0467,
291
+ 0.3286
292
+ ],
293
+ "integral_calc": [
294
+ 0.0192,
295
+ 0.0067,
296
+ 0.0517
297
+ ],
298
+ "multivariable_calculus": [
299
+ 0.0674,
300
+ 0.0667,
301
+ 0.0714
302
+ ],
303
+ "precalculus_review": [
304
+ 0.425,
305
+ 0.4533,
306
+ 0.0
307
+ ],
308
+ "sequences_series": [
309
+ 0.0844,
310
+ 0.0867,
311
+ 0.0
312
+ ]
313
+ },
314
+ {
315
+ "model_name": "meta-llama/Llama-3.1-8B-Instruct",
316
+ "judge_model_name": "gpt-4o-2024-08-06",
317
+ "u_math": [
318
+ 22.2727,
319
+ 26.1111,
320
+ 5.0
321
+ ],
322
+ "algebra": [
323
+ 0.5,
324
+ 0.5933,
325
+ 0.0333
326
+ ],
327
+ "differential_calc": [
328
+ 0.0636,
329
+ 0.0667,
330
+ 0.0571
331
+ ],
332
+ "integral_calc": [
333
+ 0.0769,
334
+ 0.0933,
335
+ 0.0345
336
+ ],
337
+ "multivariable_calculus": [
338
+ 0.1011,
339
+ 0.1133,
340
+ 0.0357
341
+ ],
342
+ "precalculus_review": [
343
+ 0.5188,
344
+ 0.5467,
345
+ 0.1
346
+ ],
347
+ "sequences_series": [
348
+ 0.1558,
349
+ 0.1533,
350
+ 0.25
351
+ ]
352
+ },
353
+ {
354
+ "model_name": "AI-MO/NuminaMath-72B-CoT",
355
+ "judge_model_name": "gpt-4o-2024-08-06",
356
+ "u_math": [
357
+ 25.0,
358
+ 29.6667,
359
+ 4.0
360
+ ],
361
+ "algebra": [
362
+ 0.6278,
363
+ 0.7467,
364
+ 0.0333
365
+ ],
366
+ "differential_calc": [
367
+ 0.0591,
368
+ 0.0667,
369
+ 0.0429
370
+ ],
371
+ "integral_calc": [
372
+ 0.0385,
373
+ 0.04,
374
+ 0.0345
375
+ ],
376
+ "multivariable_calculus": [
377
+ 0.1011,
378
+ 0.1133,
379
+ 0.0357
380
+ ],
381
+ "precalculus_review": [
382
+ 0.5938,
383
+ 0.6267,
384
+ 0.1
385
+ ],
386
+ "sequences_series": [
387
+ 0.1818,
388
+ 0.1867,
389
+ 0.0
390
+ ]
391
+ },
392
+ {
393
+ "model_name": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
394
+ "judge_model_name": "gpt-4o-2024-08-06",
395
+ "u_math": [
396
+ 25.9091,
397
+ 30.2222,
398
+ 6.5
399
+ ],
400
+ "algebra": [
401
+ 0.6611,
402
+ 0.78,
403
+ 0.0667
404
+ ],
405
+ "differential_calc": [
406
+ 0.0591,
407
+ 0.0533,
408
+ 0.0714
409
+ ],
410
+ "integral_calc": [
411
+ 0.0673,
412
+ 0.08,
413
+ 0.0345
414
+ ],
415
+ "multivariable_calculus": [
416
+ 0.118,
417
+ 0.1333,
418
+ 0.0357
419
+ ],
420
+ "precalculus_review": [
421
+ 0.5938,
422
+ 0.6267,
423
+ 0.1
424
+ ],
425
+ "sequences_series": [
426
+ 0.1494,
427
+ 0.14,
428
+ 0.5
429
+ ]
430
+ },
431
+ {
432
+ "model_name": "meta-llama/Llama-3.1-70B-Instruct",
433
+ "judge_model_name": "gpt-4o-2024-08-06",
434
+ "u_math": [
435
+ 28.4545,
436
+ 33.6667,
437
+ 5.0
438
+ ],
439
+ "algebra": [
440
+ 0.6889,
441
+ 0.82,
442
+ 0.0333
443
+ ],
444
+ "differential_calc": [
445
+ 0.0909,
446
+ 0.1067,
447
+ 0.0571
448
+ ],
449
+ "integral_calc": [
450
+ 0.0433,
451
+ 0.04,
452
+ 0.0517
453
+ ],
454
+ "multivariable_calculus": [
455
+ 0.1236,
456
+ 0.14,
457
+ 0.0357
458
+ ],
459
+ "precalculus_review": [
460
+ 0.6,
461
+ 0.64,
462
+ 0.0
463
+ ],
464
+ "sequences_series": [
465
+ 0.2727,
466
+ 0.2733,
467
+ 0.25
468
+ ]
469
+ },
470
+ {
471
+ "model_name": "Qwen/Qwen2-VL-72B-Instruct",
472
+ "judge_model_name": "gpt-4o-2024-08-06",
473
+ "u_math": [
474
+ 31.1818,
475
+ 32.2222,
476
+ 26.5
477
+ ],
478
+ "algebra": [
479
+ 0.7167,
480
+ 0.8067,
481
+ 0.2667
482
+ ],
483
+ "differential_calc": [
484
+ 0.1909,
485
+ 0.0933,
486
+ 0.4
487
+ ],
488
+ "integral_calc": [
489
+ 0.0529,
490
+ 0.02,
491
+ 0.1379
492
+ ],
493
+ "multivariable_calculus": [
494
+ 0.1685,
495
+ 0.1467,
496
+ 0.2857
497
+ ],
498
+ "precalculus_review": [
499
+ 0.6188,
500
+ 0.6533,
501
+ 0.1
502
+ ],
503
+ "sequences_series": [
504
+ 0.2078,
505
+ 0.2133,
506
+ 0.0
507
+ ]
508
+ },
509
+ {
510
+ "model_name": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
511
+ "judge_model_name": "gpt-4o-2024-08-06",
512
+ "u_math": [
513
+ 31.3636,
514
+ 37.4444,
515
+ 4.0
516
+ ],
517
+ "algebra": [
518
+ 0.7,
519
+ 0.84,
520
+ 0.0
521
+ ],
522
+ "differential_calc": [
523
+ 0.1091,
524
+ 0.1467,
525
+ 0.0286
526
+ ],
527
+ "integral_calc": [
528
+ 0.0385,
529
+ 0.04,
530
+ 0.0345
531
+ ],
532
+ "multivariable_calculus": [
533
+ 0.2247,
534
+ 0.2533,
535
+ 0.0714
536
+ ],
537
+ "precalculus_review": [
538
+ 0.6125,
539
+ 0.64,
540
+ 0.2
541
+ ],
542
+ "sequences_series": [
543
+ 0.3182,
544
+ 0.3267,
545
+ 0.0
546
+ ]
547
+ },
548
+ {
549
+ "model_name": "meta-llama/Llama-3.2-90B-Vision-Instruct",
550
+ "judge_model_name": "gpt-4o-2024-08-06",
551
+ "u_math": [
552
+ 32.6364,
553
+ 36.3333,
554
+ 16.0
555
+ ],
556
+ "algebra": [
557
+ 0.7556,
558
+ 0.8533,
559
+ 0.2667
560
+ ],
561
+ "differential_calc": [
562
+ 0.1545,
563
+ 0.1067,
564
+ 0.2571
565
+ ],
566
+ "integral_calc": [
567
+ 0.024,
568
+ 0.0267,
569
+ 0.0172
570
+ ],
571
+ "multivariable_calculus": [
572
+ 0.2022,
573
+ 0.2267,
574
+ 0.0714
575
+ ],
576
+ "precalculus_review": [
577
+ 0.625,
578
+ 0.6533,
579
+ 0.2
580
+ ],
581
+ "sequences_series": [
582
+ 0.3117,
583
+ 0.3133,
584
+ 0.25
585
+ ]
586
+ },
587
+ {
588
+ "model_name": "Qwen/Qwen2.5-7B-Instruct",
589
+ "judge_model_name": "gpt-4o-2024-08-06",
590
+ "u_math": [
591
+ 33.8182,
592
+ 40.0,
593
+ 6.0
594
+ ],
595
+ "algebra": [
596
+ 0.7333,
597
+ 0.86,
598
+ 0.1
599
+ ],
600
+ "differential_calc": [
601
+ 0.0909,
602
+ 0.1267,
603
+ 0.0143
604
+ ],
605
+ "integral_calc": [
606
+ 0.1058,
607
+ 0.1,
608
+ 0.1207
609
+ ],
610
+ "multivariable_calculus": [
611
+ 0.2303,
612
+ 0.2667,
613
+ 0.0357
614
+ ],
615
+ "precalculus_review": [
616
+ 0.7062,
617
+ 0.7533,
618
+ 0.0
619
+ ],
620
+ "sequences_series": [
621
+ 0.2857,
622
+ 0.2933,
623
+ 0.0
624
+ ]
625
+ },
626
+ {
627
+ "model_name": "claude-sonnet-3-5",
628
+ "judge_model_name": "gpt-4o-2024-08-06",
629
+ "u_math": [
630
+ 35.0909,
631
+ 36.1111,
632
+ 30.5
633
+ ],
634
+ "algebra": [
635
+ 0.6889,
636
+ 0.76,
637
+ 0.3333
638
+ ],
639
+ "differential_calc": [
640
+ 0.2136,
641
+ 0.12,
642
+ 0.4143
643
+ ],
644
+ "integral_calc": [
645
+ 0.101,
646
+ 0.0733,
647
+ 0.1724
648
+ ],
649
+ "multivariable_calculus": [
650
+ 0.2247,
651
+ 0.2133,
652
+ 0.2857
653
+ ],
654
+ "precalculus_review": [
655
+ 0.6312,
656
+ 0.6533,
657
+ 0.3
658
+ ],
659
+ "sequences_series": [
660
+ 0.3442,
661
+ 0.3467,
662
+ 0.25
663
+ ]
664
+ },
665
  {
666
  "model_name": "gpt-4o-mini-2024-07-18",
667
+ "judge_model_name": "gpt-4o-2024-08-06",
668
+ "u_math": [
669
+ 37.1818,
670
+ 40.3333,
671
+ 23.0
672
+ ],
673
+ "algebra": [
674
+ 0.7611,
675
+ 0.88,
676
+ 0.1667
677
+ ],
678
+ "differential_calc": [
679
+ 0.2136,
680
+ 0.1667,
681
+ 0.3143
682
+ ],
683
+ "integral_calc": [
684
+ 0.0577,
685
+ 0.04,
686
+ 0.1034
687
+ ],
688
+ "multivariable_calculus": [
689
+ 0.2584,
690
+ 0.24,
691
+ 0.3571
692
+ ],
693
+ "precalculus_review": [
694
+ 0.7375,
695
+ 0.7733,
696
+ 0.2
697
+ ],
698
+ "sequences_series": [
699
+ 0.3182,
700
+ 0.32,
701
+ 0.25
702
+ ]
703
+ },
704
+ {
705
+ "model_name": "meta-llama/Llama-3.3-70B-Instruct",
706
+ "judge_model_name": "gpt-4o-2024-08-06",
707
+ "u_math": [
708
+ 37.2727,
709
+ 43.4444,
710
+ 9.5
711
+ ],
712
+ "algebra": [
713
+ 0.7333,
714
+ 0.8733,
715
+ 0.0333
716
+ ],
717
+ "differential_calc": [
718
+ 0.1773,
719
+ 0.2,
720
+ 0.1286
721
+ ],
722
+ "integral_calc": [
723
+ 0.1154,
724
+ 0.1133,
725
+ 0.1207
726
+ ],
727
+ "multivariable_calculus": [
728
+ 0.3315,
729
+ 0.38,
730
+ 0.0714
731
+ ],
732
+ "precalculus_review": [
733
+ 0.6312,
734
+ 0.6733,
735
+ 0.0
736
+ ],
737
+ "sequences_series": [
738
+ 0.3571,
739
+ 0.3667,
740
+ 0.0
741
+ ]
742
+ },
743
+ {
744
+ "model_name": "Qwen/Qwen2.5-Math-7B-Instruct",
745
+ "judge_model_name": "gpt-4o-2024-08-06",
746
+ "u_math": [
747
+ 38.3636,
748
+ 45.2222,
749
+ 7.5
750
+ ],
751
+ "algebra": [
752
+ 0.7389,
753
+ 0.8733,
754
+ 0.0667
755
+ ],
756
+ "differential_calc": [
757
+ 0.1455,
758
+ 0.1867,
759
+ 0.0571
760
+ ],
761
+ "integral_calc": [
762
+ 0.0865,
763
+ 0.08,
764
+ 0.1034
765
+ ],
766
+ "multivariable_calculus": [
767
+ 0.3202,
768
+ 0.36,
769
+ 0.1071
770
+ ],
771
+ "precalculus_review": [
772
+ 0.7562,
773
+ 0.8067,
774
+ 0.0
775
+ ],
776
+ "sequences_series": [
777
+ 0.3961,
778
+ 0.4067,
779
+ 0.0
780
+ ]
781
+ },
782
+ {
783
+ "model_name": "mistralai/Pixtral-Large-Instruct-2411",
784
+ "judge_model_name": "gpt-4o-2024-08-06",
785
+ "u_math": [
786
+ 39.7273,
787
+ 42.8889,
788
+ 25.5
789
+ ],
790
+ "algebra": [
791
+ 0.7722,
792
+ 0.86,
793
+ 0.3333
794
+ ],
795
+ "differential_calc": [
796
+ 0.2045,
797
+ 0.1533,
798
+ 0.3143
799
+ ],
800
+ "integral_calc": [
801
+ 0.1106,
802
+ 0.0933,
803
+ 0.1552
804
+ ],
805
+ "multivariable_calculus": [
806
+ 0.309,
807
+ 0.32,
808
+ 0.25
809
+ ],
810
+ "precalculus_review": [
811
+ 0.6938,
812
+ 0.7267,
813
+ 0.2
814
+ ],
815
+ "sequences_series": [
816
+ 0.4156,
817
+ 0.42,
818
+ 0.25
819
+ ]
820
+ },
821
+ {
822
+ "model_name": "mistralai/Mistral-Large-Instruct-2411",
823
+ "judge_model_name": "gpt-4o-2024-08-06",
824
+ "u_math": [
825
+ 40.3636,
826
+ 48.1111,
827
+ 5.5
828
+ ],
829
+ "algebra": [
830
+ 0.7333,
831
+ 0.8667,
832
+ 0.0667
833
+ ],
834
+ "differential_calc": [
835
+ 0.1682,
836
+ 0.2333,
837
+ 0.0286
838
+ ],
839
+ "integral_calc": [
840
+ 0.125,
841
+ 0.1533,
842
+ 0.0517
843
+ ],
844
+ "multivariable_calculus": [
845
+ 0.3315,
846
+ 0.3733,
847
+ 0.1071
848
+ ],
849
+ "precalculus_review": [
850
+ 0.7562,
851
+ 0.8067,
852
+ 0.0
853
+ ],
854
+ "sequences_series": [
855
+ 0.4481,
856
+ 0.4533,
857
+ 0.25
858
+ ]
859
+ },
860
+ {
861
+ "model_name": "Qwen/Qwen2.5-72B-Instruct",
862
+ "judge_model_name": "gpt-4o-2024-08-06",
863
+ "u_math": [
864
+ 41.0,
865
+ 48.5556,
866
+ 7.0
867
+ ],
868
+ "algebra": [
869
+ 0.75,
870
+ 0.8867,
871
+ 0.0667
872
+ ],
873
+ "differential_calc": [
874
+ 0.1682,
875
+ 0.2267,
876
+ 0.0429
877
+ ],
878
+ "integral_calc": [
879
+ 0.1058,
880
+ 0.12,
881
+ 0.069
882
+ ],
883
+ "multivariable_calculus": [
884
+ 0.3652,
885
+ 0.4,
886
+ 0.1786
887
+ ],
888
+ "precalculus_review": [
889
+ 0.7812,
890
+ 0.8333,
891
+ 0.0
892
+ ],
893
+ "sequences_series": [
894
+ 0.4351,
895
+ 0.4467,
896
+ 0.0
897
+ ]
898
+ },
899
+ {
900
+ "model_name": "gpt-4o-2024-05-13",
901
+ "judge_model_name": "gpt-4o-2024-08-06",
902
+ "u_math": [
903
+ 43.3636,
904
+ 45.7778,
905
+ 32.5
906
+ ],
907
+ "algebra": [
908
+ 0.7778,
909
+ 0.8733,
910
+ 0.3
911
+ ],
912
+ "differential_calc": [
913
+ 0.2727,
914
+ 0.2067,
915
+ 0.4143
916
+ ],
917
+ "integral_calc": [
918
+ 0.1154,
919
+ 0.08,
920
+ 0.2069
921
+ ],
922
+ "multivariable_calculus": [
923
+ 0.3876,
924
+ 0.3867,
925
+ 0.3929
926
+ ],
927
+ "precalculus_review": [
928
+ 0.775,
929
+ 0.8067,
930
+ 0.3
931
+ ],
932
+ "sequences_series": [
933
+ 0.3896,
934
+ 0.3933,
935
+ 0.25
936
+ ]
937
+ },
938
+ {
939
+ "model_name": "gpt-4o-2024-08-06",
940
+ "judge_model_name": "gpt-4o-2024-08-06",
941
+ "u_math": [
942
+ 43.4545,
943
+ 46.4444,
944
+ 30.0
945
+ ],
946
+ "algebra": [
947
+ 0.8111,
948
+ 0.9133,
949
+ 0.3
950
+ ],
951
+ "differential_calc": [
952
+ 0.2318,
953
+ 0.1867,
954
+ 0.3286
955
+ ],
956
+ "integral_calc": [
957
+ 0.1298,
958
+ 0.1,
959
+ 0.2069
960
+ ],
961
+ "multivariable_calculus": [
962
+ 0.4157,
963
+ 0.4133,
964
+ 0.4286
965
+ ],
966
+ "precalculus_review": [
967
+ 0.7625,
968
+ 0.7933,
969
+ 0.3
970
+ ],
971
+ "sequences_series": [
972
+ 0.3766,
973
+ 0.38,
974
+ 0.25
975
+ ]
976
+ },
977
+ {
978
+ "model_name": "Qwen/Qwen2.5-32B-Instruct",
979
+ "judge_model_name": "gpt-4o-2024-08-06",
980
+ "u_math": [
981
+ 43.8182,
982
+ 51.4444,
983
+ 9.5
984
+ ],
985
+ "algebra": [
986
+ 0.7778,
987
+ 0.9267,
988
+ 0.0333
989
+ ],
990
+ "differential_calc": [
991
+ 0.2273,
992
+ 0.3,
993
+ 0.0714
994
+ ],
995
+ "integral_calc": [
996
+ 0.1202,
997
+ 0.12,
998
+ 0.1207
999
+ ],
1000
+ "multivariable_calculus": [
1001
+ 0.4101,
1002
+ 0.4533,
1003
+ 0.1786
1004
+ ],
1005
+ "precalculus_review": [
1006
+ 0.7875,
1007
+ 0.8333,
1008
+ 0.1
1009
+ ],
1010
+ "sequences_series": [
1011
+ 0.4416,
1012
+ 0.4533,
1013
+ 0.0
1014
+ ]
1015
+ },
1016
+ {
1017
+ "model_name": "Nexusflow/Athene-V2-Chat",
1018
+ "judge_model_name": "gpt-4o-2024-08-06",
1019
+ "u_math": [
1020
+ 46.1818,
1021
+ 54.5556,
1022
+ 8.5
1023
+ ],
1024
+ "algebra": [
1025
+ 0.7444,
1026
+ 0.8867,
1027
+ 0.0333
1028
+ ],
1029
+ "differential_calc": [
1030
+ 0.2455,
1031
+ 0.34,
1032
+ 0.0429
1033
+ ],
1034
+ "integral_calc": [
1035
+ 0.1346,
1036
+ 0.16,
1037
+ 0.069
1038
+ ],
1039
+ "multivariable_calculus": [
1040
+ 0.4607,
1041
+ 0.5067,
1042
+ 0.2143
1043
+ ],
1044
+ "precalculus_review": [
1045
+ 0.8375,
1046
+ 0.8867,
1047
+ 0.1
1048
+ ],
1049
+ "sequences_series": [
1050
+ 0.4935,
1051
+ 0.4933,
1052
+ 0.5
1053
+ ]
1054
+ },
1055
+ {
1056
+ "model_name": "Qwen/Qwen2.5-Math-72B-Instruct",
1057
+ "judge_model_name": "gpt-4o-2024-08-06",
1058
+ "u_math": [
1059
+ 50.1818,
1060
+ 59.0,
1061
+ 10.5
1062
+ ],
1063
+ "algebra": [
1064
+ 0.7833,
1065
+ 0.9267,
1066
+ 0.0667
1067
+ ],
1068
+ "differential_calc": [
1069
+ 0.2636,
1070
+ 0.3533,
1071
+ 0.0714
1072
+ ],
1073
+ "integral_calc": [
1074
+ 0.1971,
1075
+ 0.2067,
1076
+ 0.1724
1077
+ ],
1078
+ "multivariable_calculus": [
1079
+ 0.5,
1080
+ 0.58,
1081
+ 0.0714
1082
+ ],
1083
+ "precalculus_review": [
1084
+ 0.8438,
1085
+ 0.9,
1086
+ 0.0
1087
+ ],
1088
+ "sequences_series": [
1089
+ 0.5714,
1090
+ 0.5733,
1091
+ 0.5
1092
+ ]
1093
+ },
1094
+ {
1095
+ "model_name": "Qwen/QVQ-72B-Preview",
1096
+ "judge_model_name": "gpt-4o-2024-08-06",
1097
+ "u_math": [
1098
+ 50.5455,
1099
+ 59.3333,
1100
+ 11.0
1101
+ ],
1102
+ "algebra": [
1103
+ 0.7833,
1104
+ 0.9267,
1105
+ 0.0667
1106
+ ],
1107
+ "differential_calc": [
1108
+ 0.3182,
1109
+ 0.4467,
1110
+ 0.0429
1111
+ ],
1112
+ "integral_calc": [
1113
+ 0.1731,
1114
+ 0.1933,
1115
+ 0.1207
1116
+ ],
1117
+ "multivariable_calculus": [
1118
+ 0.4888,
1119
+ 0.5333,
1120
+ 0.25
1121
+ ],
1122
+ "precalculus_review": [
1123
+ 0.8688,
1124
+ 0.9133,
1125
+ 0.2
1126
+ ],
1127
+ "sequences_series": [
1128
+ 0.539,
1129
+ 0.5467,
1130
+ 0.25
1131
+ ]
1132
+ },
1133
+ {
1134
+ "model_name": "google/gemini-1.5-flash",
1135
+ "judge_model_name": "gpt-4o-2024-08-06",
1136
+ "u_math": [
1137
+ 51.2727,
1138
+ 53.7778,
1139
+ 40.0
1140
+ ],
1141
+ "algebra": [
1142
+ 0.8444,
1143
+ 0.9133,
1144
+ 0.5
1145
+ ],
1146
+ "differential_calc": [
1147
+ 0.3909,
1148
+ 0.36,
1149
+ 0.4571
1150
+ ],
1151
+ "integral_calc": [
1152
+ 0.1683,
1153
+ 0.14,
1154
+ 0.2414
1155
+ ],
1156
+ "multivariable_calculus": [
1157
+ 0.4494,
1158
+ 0.44,
1159
+ 0.5
1160
+ ],
1161
+ "precalculus_review": [
1162
+ 0.775,
1163
+ 0.8067,
1164
+ 0.3
1165
+ ],
1166
+ "sequences_series": [
1167
+ 0.5649,
1168
+ 0.5667,
1169
+ 0.5
1170
+ ]
1171
+ },
1172
+ {
1173
+ "model_name": "google/gemini-1.5-pro",
1174
+ "judge_model_name": "gpt-4o-2024-08-06",
1175
+ "u_math": [
1176
+ 60.0909,
1177
+ 63.4444,
1178
+ 45.0
1179
+ ],
1180
+ "algebra": [
1181
+ 0.8611,
1182
+ 0.9133,
1183
+ 0.6
1184
+ ],
1185
+ "differential_calc": [
1186
+ 0.4955,
1187
+ 0.5067,
1188
+ 0.4714
1189
+ ],
1190
+ "integral_calc": [
1191
+ 0.2644,
1192
+ 0.2733,
1193
+ 0.2414
1194
+ ],
1195
+ "multivariable_calculus": [
1196
+ 0.6011,
1197
+ 0.6067,
1198
+ 0.5714
1199
+ ],
1200
+ "precalculus_review": [
1201
+ 0.8625,
1202
+ 0.8733,
1203
+ 0.7
1204
+ ],
1205
+ "sequences_series": [
1206
+ 0.6299,
1207
+ 0.6333,
1208
+ 0.5
1209
+ ]
1210
+ },
1211
+ {
1212
+ "model_name": "Qwen/QwQ-32B-Preview",
1213
+ "judge_model_name": "gpt-4o-2024-08-06",
1214
+ "u_math": [
1215
+ 61.4545,
1216
+ 71.7778,
1217
+ 15.0
1218
+ ],
1219
+ "algebra": [
1220
+ 0.7889,
1221
+ 0.94,
1222
+ 0.0333
1223
+ ],
1224
+ "differential_calc": [
1225
+ 0.4364,
1226
+ 0.6067,
1227
+ 0.0714
1228
+ ],
1229
+ "integral_calc": [
1230
+ 0.351,
1231
+ 0.3933,
1232
+ 0.2414
1233
+ ],
1234
+ "multivariable_calculus": [
1235
+ 0.5899,
1236
+ 0.6533,
1237
+ 0.25
1238
+ ],
1239
+ "precalculus_review": [
1240
+ 0.875,
1241
+ 0.9267,
1242
+ 0.1
1243
+ ],
1244
+ "sequences_series": [
1245
+ 0.7792,
1246
+ 0.7867,
1247
+ 0.5
1248
+ ]
1249
  }
1250
+ ]
src/populate.py CHANGED
@@ -30,8 +30,10 @@ def model_size_to_symbol(model_size_in_b_params: int | None) -> str:
30
  return "๐Ÿš—"
31
  elif model_size_in_b_params < 100:
32
  return "๐Ÿšš"
33
- else:
34
  return "๐Ÿš€"
 
 
35
 
36
 
37
  def model_type_to_symbol(model_type: str) -> str:
@@ -47,6 +49,10 @@ def model_type_to_symbol(model_type: str) -> str:
47
  def get_hf_data_by_model_name(model_name: str) -> dict:
48
  """Get model data from Hugging Face API by model name"""
49
  still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
 
 
 
 
50
 
51
  architecture = "Unknown"
52
  if model_config is not None:
@@ -58,9 +64,11 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
58
  if still_on_hub:
59
  info = model_info(repo_id=model_name)
60
  try:
61
- num_params = round(info.safetensors["total"] / 1e9, 3)
62
- except AttributeError as e:
63
  print("SafeTensors not found in", model_name, e)
 
 
64
  pass
65
  print("num_params", model_name, num_params)
66
 
 
30
  return "๐Ÿš—"
31
  elif model_size_in_b_params < 100:
32
  return "๐Ÿšš"
33
+ elif model_size_in_b_params < 1000:
34
  return "๐Ÿš€"
35
+ else:
36
+ return "โ“"
37
 
38
 
39
  def model_type_to_symbol(model_type: str) -> str:
 
49
  def get_hf_data_by_model_name(model_name: str) -> dict:
50
  """Get model data from Hugging Face API by model name"""
51
  still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
52
+ if not still_on_hub and '/' in model_name:
53
+ print(f"Model {model_name} is not on the hub, try unsloth/...")
54
+ model_name = "unsloth/" + model_name.split("/")[-1]
55
+ still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
56
 
57
  architecture = "Unknown"
58
  if model_config is not None:
 
64
  if still_on_hub:
65
  info = model_info(repo_id=model_name)
66
  try:
67
+ num_params = round(info.safetensors["total"] / 1e9, 1)
68
+ except Exception as e:
69
  print("SafeTensors not found in", model_name, e)
70
+ if 'Pixtral-12B' in model_name:
71
+ num_params = 12
72
  pass
73
  print("num_params", model_name, num_params)
74