ranarag committed on
Commit
f3b89ce
·
verified ·
1 Parent(s): df30579

Update README.md

Browse files

added BBH scores.

Files changed (1) hide show
  1. README.md +13 -12
README.md CHANGED
@@ -215,8 +215,8 @@ By implementing this innovative prevention strategy, we can significantly reduce
215
  <th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
216
  <th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
217
  <th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
218
- <!-- <th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard</th> -->
219
- <th style="text-align:center; background-color: #001d6c; color: white;">DROP<sup id="fnref2"><a href="#fn2">2</a></sup></th>
220
  <th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
221
  <th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
222
  <th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
@@ -231,7 +231,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
231
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
232
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
233
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
234
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">54.46</td> -->
235
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.99</td>
236
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
237
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
@@ -246,7 +246,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
246
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
247
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
248
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
249
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.27</td> -->
250
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">23.84</td>
251
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
252
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
@@ -261,7 +261,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
261
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
262
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
263
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
264
- <!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 52.51 </td> -->
265
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 44.33 </td>
266
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
267
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
@@ -277,7 +277,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
277
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
278
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
279
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
280
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.66</td> -->
281
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.23</td>
282
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
283
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
@@ -293,7 +293,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
293
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
294
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
295
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
296
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.71</td> -->
297
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.73</td>
298
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
299
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
@@ -309,7 +309,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
309
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
310
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
311
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
312
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">70.40</td> -->
313
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.06</td>
314
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
315
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
@@ -325,7 +325,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
325
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
326
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
327
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
328
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.04</td> -->
329
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.78</td>
330
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
331
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
@@ -340,7 +340,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
340
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
341
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
342
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
343
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">68.55</td> -->
344
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.57</td>
345
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
346
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
@@ -356,7 +356,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
356
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
357
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
358
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
359
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.77</td> -->
360
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.29</td>
361
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
362
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
@@ -371,7 +371,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
371
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
372
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
373
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
374
- <!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.01 </td> -->
375
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.36 </td>
376
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
377
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
@@ -528,4 +528,5 @@ Granite-3.3-2B-Instruct builds upon Granite-3.3-2B-Base, leveraging both permiss
528
 
529
 
530
  <p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
 
531
  <p><a href="#fnref2" title="Jump back to reference">[2]</a> Modified the implementation to handle some of the issues mentioned <a href="https://huggingface.co/blog/open-llm-leaderboard-drop">here</a></p>
 
215
  <th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
216
  <th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
217
  <th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
218
+ <th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard<sup id="fnref2"><a href="#fn2">2</a></sup></th>
219
+ <th style="text-align:center; background-color: #001d6c; color: white;">DROP<sup id="fnref3"><a href="#fn3">3</a></sup></th>
220
  <th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
221
  <th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
222
  <th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
 
231
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
232
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
233
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
234
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.82</td>
235
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.99</td>
236
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
237
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
 
246
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
247
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
248
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
249
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.39</td>
250
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">23.84</td>
251
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
252
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
 
261
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
262
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
263
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
264
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 63.91 </td>
265
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 44.33 </td>
266
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
267
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
 
277
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
278
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
279
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
280
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">73.43</td>
281
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.23</td>
282
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
283
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
 
293
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
294
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
295
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
296
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.39</td>
297
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.73</td>
298
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
299
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
 
309
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
310
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
311
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
312
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.19</td>
313
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.06</td>
314
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
315
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
 
325
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
326
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
327
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
328
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.38</td>
329
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.78</td>
330
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
331
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
 
340
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
341
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
342
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
343
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.87</td>
344
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.57</td>
345
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
346
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
 
356
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
357
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
358
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
359
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.86</td>
360
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.29</td>
361
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
362
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
 
371
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
372
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
373
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
374
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 69.13 </td>
375
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.36 </td>
376
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
377
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
 
528
 
529
 
530
  <p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
531
+ <p><a href="#fnref2" title="Jump back to reference">[2]</a> Added regex for more efficient answer extraction.</p>
532
  <p><a href="#fnref3" title="Jump back to reference">[3]</a> Modified the implementation to handle some of the issues mentioned <a href="https://huggingface.co/blog/open-llm-leaderboard-drop">here</a></p>