Update README.md
Browse files
added drop scores
README.md
CHANGED
@@ -216,7 +216,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
216 |
<th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
|
217 |
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
|
218 |
<!-- <th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard</th> -->
|
219 |
-
|
220 |
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
|
221 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
|
222 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
|
@@ -232,7 +232,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
232 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
|
233 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
|
234 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">54.46</td> -->
|
235 |
-
|
236 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
|
237 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
|
238 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">75.26</td>
|
@@ -247,7 +247,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
247 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
|
248 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
|
249 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.27</td> -->
|
250 |
-
|
251 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
|
252 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
|
253 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">73.39</td>
|
@@ -262,7 +262,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
262 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
|
263 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
|
264 |
<!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 52.51 </td> -->
|
265 |
-
|
266 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
|
267 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
|
268 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 75.68 </td>
|
@@ -278,7 +278,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
278 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
|
279 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
|
280 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.66</td> -->
|
281 |
-
|
282 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
|
283 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
|
284 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.15</td>
|
@@ -294,7 +294,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
294 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
|
295 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
|
296 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.71</td> -->
|
297 |
-
|
298 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
|
299 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
|
300 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">62.91</td>
|
@@ -310,7 +310,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
310 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
|
311 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
|
312 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">70.40</td> -->
|
313 |
-
|
314 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
|
315 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
|
316 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.91</td>
|
@@ -326,7 +326,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
326 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
|
327 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
|
328 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.04</td> -->
|
329 |
-
|
330 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
|
331 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
|
332 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.43</td>
|
@@ -341,7 +341,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
341 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
|
342 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
|
343 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">68.55</td> -->
|
344 |
-
|
345 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
|
346 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
|
347 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.79</td>
|
@@ -357,7 +357,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
357 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
|
358 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
|
359 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.77</td> -->
|
360 |
-
|
361 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
|
362 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
|
363 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.72</td>
|
@@ -372,7 +372,7 @@ By implementing this innovative prevention strategy, we can significantly reduce
|
|
372 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
|
373 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
|
374 |
<!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.01 </td> -->
|
375 |
-
|
376 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
|
377 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
|
378 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 86.09 </td>
|
@@ -528,3 +528,4 @@ Granite-3.3-2B-Instruct builds upon Granite-3.3-2B-Base, leveraging both permiss
|
|
528 |
|
529 |
|
530 |
<p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
|
|
|
|
216 |
<th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
|
217 |
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
|
218 |
<!-- <th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard</th> -->
|
219 |
+
<th style="text-align:center; background-color: #001d6c; color: white;">DROP<sup id="fnref2"><a href="#fn2">2</a></sup></th>
|
220 |
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
|
221 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
|
222 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
|
|
|
232 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
|
233 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
|
234 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">54.46</td> -->
|
235 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.99</td>
|
236 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
|
237 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
|
238 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">75.26</td>
|
|
|
247 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
|
248 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
|
249 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.27</td> -->
|
250 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">23.84</td>
|
251 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
|
252 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
|
253 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">73.39</td>
|
|
|
262 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
|
263 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
|
264 |
<!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 52.51 </td> -->
|
265 |
+
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 44.33 </td>
|
266 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
|
267 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
|
268 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 75.68 </td>
|
|
|
278 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
|
279 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
|
280 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.66</td> -->
|
281 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.23</td>
|
282 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
|
283 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
|
284 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.15</td>
|
|
|
294 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
|
295 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
|
296 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.71</td> -->
|
297 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.73</td>
|
298 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
|
299 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
|
300 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">62.91</td>
|
|
|
310 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
|
311 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
|
312 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">70.40</td> -->
|
313 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.06</td>
|
314 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
|
315 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
|
316 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.91</td>
|
|
|
326 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
|
327 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
|
328 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.04</td> -->
|
329 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.78</td>
|
330 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
|
331 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
|
332 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.43</td>
|
|
|
341 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
|
342 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
|
343 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">68.55</td> -->
|
344 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.57</td>
|
345 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
|
346 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
|
347 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.79</td>
|
|
|
357 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
|
358 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
|
359 |
<!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.77</td> -->
|
360 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.29</td>
|
361 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
|
362 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
|
363 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.72</td>
|
|
|
372 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
|
373 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
|
374 |
<!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.01 </td> -->
|
375 |
+
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.36 </td>
|
376 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
|
377 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
|
378 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 86.09 </td>
|
|
|
528 |
|
529 |
|
530 |
<p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
|
531 |
+
<p id="fn2"><a href="#fnref2" title="Jump back to reference">[2]</a> DROP was evaluated with a modified implementation that addresses some of the issues described in <a href="https://huggingface.co/blog/open-llm-leaderboard-drop">this Hugging Face blog post on DROP</a></p>
|