{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.34,
      "acc_stderr": 0.047609522856952365,
      "acc_norm": 0.34,
      "acc_norm_stderr": 0.047609522856952365
    },
    "hendrycksTest-anatomy": {
      "acc": 0.5037037037037037,
      "acc_stderr": 0.04319223625811331,
      "acc_norm": 0.5037037037037037,
      "acc_norm_stderr": 0.04319223625811331
    },
    "hendrycksTest-astronomy": {
      "acc": 0.4276315789473684,
      "acc_stderr": 0.04026097083296558,
      "acc_norm": 0.4276315789473684,
      "acc_norm_stderr": 0.04026097083296558
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.54,
      "acc_stderr": 0.05009082659620333,
      "acc_norm": 0.54,
      "acc_norm_stderr": 0.05009082659620333
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.4679245283018868,
      "acc_stderr": 0.030709486992556545,
      "acc_norm": 0.4679245283018868,
      "acc_norm_stderr": 0.030709486992556545
    },
    "hendrycksTest-college_biology": {
      "acc": 0.5,
      "acc_stderr": 0.04181210050035455,
      "acc_norm": 0.5,
      "acc_norm_stderr": 0.04181210050035455
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.29,
      "acc_stderr": 0.045604802157206845,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.045604802157206845
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.4,
      "acc_stderr": 0.04923659639173309,
      "acc_norm": 0.4,
      "acc_norm_stderr": 0.04923659639173309
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.31,
      "acc_stderr": 0.04648231987117316,
      "acc_norm": 0.31,
      "acc_norm_stderr": 0.04648231987117316
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.4393063583815029,
      "acc_stderr": 0.03784271932887467,
      "acc_norm": 0.4393063583815029,
      "acc_norm_stderr": 0.03784271932887467
    },
    "hendrycksTest-college_physics": {
      "acc": 0.20588235294117646,
      "acc_stderr": 0.04023382273617747,
      "acc_norm": 0.20588235294117646,
      "acc_norm_stderr": 0.04023382273617747
    },
    "hendrycksTest-computer_security": {
      "acc": 0.61,
      "acc_stderr": 0.04902071300001975,
      "acc_norm": 0.61,
      "acc_norm_stderr": 0.04902071300001975
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.43829787234042555,
      "acc_stderr": 0.03243618636108101,
      "acc_norm": 0.43829787234042555,
      "acc_norm_stderr": 0.03243618636108101
    },
    "hendrycksTest-econometrics": {
      "acc": 0.2894736842105263,
      "acc_stderr": 0.04266339443159393,
      "acc_norm": 0.2894736842105263,
      "acc_norm_stderr": 0.04266339443159393
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.46206896551724136,
      "acc_stderr": 0.041546596717075474,
      "acc_norm": 0.46206896551724136,
      "acc_norm_stderr": 0.041546596717075474
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.2962962962962963,
      "acc_stderr": 0.023517294335963276,
      "acc_norm": 0.2962962962962963,
      "acc_norm_stderr": 0.023517294335963276
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.30952380952380953,
      "acc_stderr": 0.04134913018303316,
      "acc_norm": 0.30952380952380953,
      "acc_norm_stderr": 0.04134913018303316
    },
    "hendrycksTest-global_facts": {
      "acc": 0.38,
      "acc_stderr": 0.048783173121456316,
      "acc_norm": 0.38,
      "acc_norm_stderr": 0.048783173121456316
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.5419354838709678,
      "acc_stderr": 0.028343787250540615,
      "acc_norm": 0.5419354838709678,
      "acc_norm_stderr": 0.028343787250540615
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.3448275862068966,
      "acc_stderr": 0.03344283744280458,
      "acc_norm": 0.3448275862068966,
      "acc_norm_stderr": 0.03344283744280458
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.4,
      "acc_stderr": 0.049236596391733084,
      "acc_norm": 0.4,
      "acc_norm_stderr": 0.049236596391733084
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.6363636363636364,
      "acc_stderr": 0.03756335775187897,
      "acc_norm": 0.6363636363636364,
      "acc_norm_stderr": 0.03756335775187897
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.5353535353535354,
      "acc_stderr": 0.03553436368828061,
      "acc_norm": 0.5353535353535354,
      "acc_norm_stderr": 0.03553436368828061
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.7150259067357513,
      "acc_stderr": 0.032577140777096614,
      "acc_norm": 0.7150259067357513,
      "acc_norm_stderr": 0.032577140777096614
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.4564102564102564,
      "acc_stderr": 0.025254485424799602,
      "acc_norm": 0.4564102564102564,
      "acc_norm_stderr": 0.025254485424799602
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.2740740740740741,
      "acc_stderr": 0.027195934804085622,
      "acc_norm": 0.2740740740740741,
      "acc_norm_stderr": 0.027195934804085622
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.4495798319327731,
      "acc_stderr": 0.03231293497137707,
      "acc_norm": 0.4495798319327731,
      "acc_norm_stderr": 0.03231293497137707
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.2847682119205298,
      "acc_stderr": 0.03684881521389023,
      "acc_norm": 0.2847682119205298,
      "acc_norm_stderr": 0.03684881521389023
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.655045871559633,
      "acc_stderr": 0.020380605405066955,
      "acc_norm": 0.655045871559633,
      "acc_norm_stderr": 0.020380605405066955
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.32407407407407407,
      "acc_stderr": 0.03191923445686185,
      "acc_norm": 0.32407407407407407,
      "acc_norm_stderr": 0.03191923445686185
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.6274509803921569,
      "acc_stderr": 0.03393388584958406,
      "acc_norm": 0.6274509803921569,
      "acc_norm_stderr": 0.03393388584958406
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.6497890295358649,
      "acc_stderr": 0.031052391937584346,
      "acc_norm": 0.6497890295358649,
      "acc_norm_stderr": 0.031052391937584346
    },
    "hendrycksTest-human_aging": {
      "acc": 0.547085201793722,
      "acc_stderr": 0.033408675019233246,
      "acc_norm": 0.547085201793722,
      "acc_norm_stderr": 0.033408675019233246
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.5648854961832062,
      "acc_stderr": 0.04348208051644858,
      "acc_norm": 0.5648854961832062,
      "acc_norm_stderr": 0.04348208051644858
    },
    "hendrycksTest-international_law": {
      "acc": 0.6363636363636364,
      "acc_stderr": 0.043913262867240704,
      "acc_norm": 0.6363636363636364,
      "acc_norm_stderr": 0.043913262867240704
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.5555555555555556,
      "acc_stderr": 0.04803752235190193,
      "acc_norm": 0.5555555555555556,
      "acc_norm_stderr": 0.04803752235190193
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.49693251533742333,
      "acc_stderr": 0.03928297078179663,
      "acc_norm": 0.49693251533742333,
      "acc_norm_stderr": 0.03928297078179663
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.39285714285714285,
      "acc_stderr": 0.04635550135609976,
      "acc_norm": 0.39285714285714285,
      "acc_norm_stderr": 0.04635550135609976
    },
    "hendrycksTest-management": {
      "acc": 0.5728155339805825,
      "acc_stderr": 0.048979577377811674,
      "acc_norm": 0.5728155339805825,
      "acc_norm_stderr": 0.048979577377811674
    },
    "hendrycksTest-marketing": {
      "acc": 0.6965811965811965,
      "acc_stderr": 0.030118210106942638,
      "acc_norm": 0.6965811965811965,
      "acc_norm_stderr": 0.030118210106942638
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.53,
      "acc_stderr": 0.05016135580465919,
      "acc_norm": 0.53,
      "acc_norm_stderr": 0.05016135580465919
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.6602809706257982,
      "acc_stderr": 0.016936394114301635,
      "acc_norm": 0.6602809706257982,
      "acc_norm_stderr": 0.016936394114301635
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.5317919075144508,
      "acc_stderr": 0.026864624366756643,
      "acc_norm": 0.5317919075144508,
      "acc_norm_stderr": 0.026864624366756643
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.24581005586592178,
      "acc_stderr": 0.014400296429225627,
      "acc_norm": 0.24581005586592178,
      "acc_norm_stderr": 0.014400296429225627
    },
    "hendrycksTest-nutrition": {
      "acc": 0.5163398692810458,
      "acc_stderr": 0.02861462475280544,
      "acc_norm": 0.5163398692810458,
      "acc_norm_stderr": 0.02861462475280544
    },
    "hendrycksTest-philosophy": {
      "acc": 0.6012861736334405,
      "acc_stderr": 0.027809322585774496,
      "acc_norm": 0.6012861736334405,
      "acc_norm_stderr": 0.027809322585774496
    },
    "hendrycksTest-prehistory": {
      "acc": 0.5185185185185185,
      "acc_stderr": 0.02780165621232366,
      "acc_norm": 0.5185185185185185,
      "acc_norm_stderr": 0.02780165621232366
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.3723404255319149,
      "acc_stderr": 0.028838921471251458,
      "acc_norm": 0.3723404255319149,
      "acc_norm_stderr": 0.028838921471251458
    },
    "hendrycksTest-professional_law": {
      "acc": 0.3683181225554107,
      "acc_stderr": 0.012319403369564637,
      "acc_norm": 0.3683181225554107,
      "acc_norm_stderr": 0.012319403369564637
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.5330882352941176,
      "acc_stderr": 0.030306257722468317,
      "acc_norm": 0.5330882352941176,
      "acc_norm_stderr": 0.030306257722468317
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.46405228758169936,
      "acc_stderr": 0.020175488765484036,
      "acc_norm": 0.46405228758169936,
      "acc_norm_stderr": 0.020175488765484036
    },
    "hendrycksTest-public_relations": {
      "acc": 0.5545454545454546,
      "acc_stderr": 0.047605488214603246,
      "acc_norm": 0.5545454545454546,
      "acc_norm_stderr": 0.047605488214603246
    },
    "hendrycksTest-security_studies": {
      "acc": 0.4897959183673469,
      "acc_stderr": 0.03200255347893782,
      "acc_norm": 0.4897959183673469,
      "acc_norm_stderr": 0.03200255347893782
    },
    "hendrycksTest-sociology": {
      "acc": 0.6417910447761194,
      "acc_stderr": 0.03390393042268814,
      "acc_norm": 0.6417910447761194,
      "acc_norm_stderr": 0.03390393042268814
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.65,
      "acc_stderr": 0.047937248544110196,
      "acc_norm": 0.65,
      "acc_norm_stderr": 0.047937248544110196
    },
    "hendrycksTest-virology": {
      "acc": 0.39759036144578314,
      "acc_stderr": 0.038099730845402184,
      "acc_norm": 0.39759036144578314,
      "acc_norm_stderr": 0.038099730845402184
    },
    "hendrycksTest-world_religions": {
      "acc": 0.7134502923976608,
      "acc_stderr": 0.03467826685703826,
      "acc_norm": 0.7134502923976608,
      "acc_norm_stderr": 0.03467826685703826
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "hf",
    "model_args": "pretrained=/cache/shubhra/models/llama2_denseft_playtpus_dolphin_LR1e-5_E2_noGC_LRScosine/combined/,trust_remote_code=True,dtype=bfloat16",
    "num_fewshot": 5,
    "batch_size": "16",
    "batch_sizes": [],
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
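The harness output above only reports per-subject scores, so a single headline MMLU number has to be aggregated from them. Below is a minimal Python sketch that macro-averages the "acc" values over the 57 hendrycksTest-* subjects; note this is an unweighted average over subjects, not weighted by question count. The filename "results.json" is a placeholder for wherever this output is saved.

```python
import json
from statistics import mean

# "results.json" is a placeholder path -- point it at this harness output file.
with open("results.json") as f:
    results = json.load(f)["results"]

# Unweighted macro-average over the hendrycksTest-* subjects.
# Each subject entry carries "acc"/"acc_norm" plus their standard errors.
accs = [metrics["acc"] for task, metrics in results.items()
        if task.startswith("hendrycksTest-")]

print(f"subjects: {len(accs)}")
print(f"mean acc: {mean(accs):.4f}")
```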