Update README.md
Browse files
README.md
CHANGED
@@ -189,7 +189,7 @@ The benchmarks and corresponding scores listed in the table below are taken dire
|
|
189 |
|TruthfulQA|0-shot|44|45.3|41.55|-5.56%|-8.27%|
|
190 |
|Winogrande|5-shot|77.4|79.5|67.09|-13.32%|-15.61%|
|
191 |
|HellaSwag|10-shot|82.1|81.9|69.88|-14.88%|-14.68%|
|
192 |
-
|GPQA|5-shot|25.8|
|
193 |
|TheoremQA|5-shot|22.1|28.9|-|-|-|
|
194 |
|MATH|4-shot|20.5|37.7|40.2|+96.10%|+6.63%|
|
195 |
|MMLU-stem|5-shot|55.3|65.1|52.9|-4.34%|-18.74%|
|
@@ -198,5 +198,5 @@ The benchmarks and corresponding scores listed in the table below are taken dire
|
|
198 |
|HumanEval+|0-shot|29.3|30.5|62.2|+112.29%|+103.93%|
|
199 |
|MBPP|0-shot|53.9|62.2|60.3|+11.87%|-3.05%|
|
200 |
|MBPP+|0-shot|44.4|50.6|50.8|+14.41%|+0.40%|
|
201 |
-
|
|
202 |
-
|||||**Average**|**+18.55%**|**+
|
|
|
189 |
|TruthfulQA|0-shot|44|45.3|41.55|-5.56%|-8.27%|
|
190 |
|Winogrande|5-shot|77.4|79.5|67.09|-13.32%|-15.61%|
|
191 |
|HellaSwag|10-shot|82.1|81.9|69.88|-14.88%|-14.68%|
|
192 |
+
|GPQA|5-shot|25.8|32.8|29.24|+13.33%|-10.85%|
|
193 |
|TheoremQA|5-shot|22.1|28.9|-|-|-|
|
194 |
|MATH|4-shot|20.5|37.7|40.2|+96.10%|+6.63%|
|
195 |
|MMLU-stem|5-shot|55.3|65.1|52.9|-4.34%|-18.74%|
|
|
|
198 |
|HumanEval+|0-shot|29.3|30.5|62.2|+112.29%|+103.93%|
|
199 |
|MBPP|0-shot|53.9|62.2|60.3|+11.87%|-3.05%|
|
200 |
|MBPP+|0-shot|44.4|50.6|50.8|+14.41%|+0.40%|
|
201 |
+
|MultiPL-E|0-shot|22.6|34.9|-|-|-|
|
202 |
+
|||||**Average**|**+18.55%**|**+1.12%**|
|