Update README.md
Browse files
    	
        README.md
    CHANGED
    
    | @@ -29,12 +29,6 @@ It slightly improves upon the performance of the basemodel on the following task | |
| 29 |  | 
| 30 | 
             
            # Eval Results aloobun/d-SmolLM2-360M (WIP)
         | 
| 31 |  | 
| 32 | 
            -
            Todo:
         | 
| 33 | 
            -
             | 
| 34 | 
            -
            ifeval (0-shot, generative)
         | 
| 35 | 
            -
             | 
| 36 | 
            -
            Math-lvl-5 (4-shots, generative, minerva version)
         | 
| 37 | 
            -
             | 
| 38 |  | 
| 39 | 
             
            ## GPQA
         | 
| 40 |  | 
| @@ -100,3 +94,16 @@ Math-lvl-5 (4-shots, generative, minerva version) | |
| 100 | 
             
            |                  |       |none  |     0|inst_level_strict_acc  |↑  |0.2770|±  |   N/A|
         | 
| 101 | 
             
            |                  |       |none  |     0|prompt_level_loose_acc |↑  |0.1497|±  |0.0154|
         | 
| 102 | 
             
            |                  |       |none  |     0|prompt_level_strict_acc|↑  |0.1423|±  |0.0150|
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 29 |  | 
| 30 | 
             
            # Eval Results aloobun/d-SmolLM2-360M (WIP)
         | 
| 31 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 32 |  | 
| 33 | 
             
            ## GPQA
         | 
| 34 |  | 
|  | |
| 94 | 
             
            |                  |       |none  |     0|inst_level_strict_acc  |↑  |0.2770|±  |   N/A|
         | 
| 95 | 
             
            |                  |       |none  |     0|prompt_level_loose_acc |↑  |0.1497|±  |0.0154|
         | 
| 96 | 
             
            |                  |       |none  |     0|prompt_level_strict_acc|↑  |0.1423|±  |0.0150|
         | 
| 97 | 
            +
             | 
| 98 | 
            +
            ## MATH HARD
         | 
| 99 | 
            +
            |                    Tasks                    |Version|Filter|n-shot|  Metric   |   |Value |   |Stderr|
         | 
| 100 | 
            +
            |---------------------------------------------|-------|------|-----:|-----------|---|-----:|---|-----:|
         | 
| 101 | 
            +
            |leaderboard_math_hard                        |    N/A|      |      |           |   |      |   |      |
         | 
| 102 | 
            +
            | - leaderboard_math_algebra_hard             |      2|none  |     4|exact_match|↑  |0.0033|±  |0.0033|
         | 
| 103 | 
            +
            | - leaderboard_math_counting_and_prob_hard   |      2|none  |     4|exact_match|↑  |0.0081|±  |0.0081|
         | 
| 104 | 
            +
            | - leaderboard_math_geometry_hard            |      2|none  |     4|exact_match|↑  |0.0000|±  |0.0000|
         | 
| 105 | 
            +
            | - leaderboard_math_intermediate_algebra_hard|      2|none  |     4|exact_match|↑  |0.0000|±  |0.0000|
         | 
| 106 | 
            +
            | - leaderboard_math_num_theory_hard          |      2|none  |     4|exact_match|↑  |0.0065|±  |0.0065|
         | 
| 107 | 
            +
            | - leaderboard_math_prealgebra_hard          |      2|none  |     4|exact_match|↑  |0.0104|±  |0.0073|
         | 
| 108 | 
            +
            | - leaderboard_math_precalculus_hard         |      2|none  |     4|exact_match|↑  |0.0000|±  |0.0000|
         | 
| 109 | 
            +
             | 
