{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.26,
      "acc_stderr": 0.04408440022768081,
      "acc_norm": 0.26,
      "acc_norm_stderr": 0.04408440022768081
    },
    "hendrycksTest-anatomy": {
      "acc": 0.32592592592592595,
      "acc_stderr": 0.040491220417025055,
      "acc_norm": 0.32592592592592595,
      "acc_norm_stderr": 0.040491220417025055
    },
    "hendrycksTest-astronomy": {
      "acc": 0.2565789473684211,
      "acc_stderr": 0.03554180368025689,
      "acc_norm": 0.2565789473684211,
      "acc_norm_stderr": 0.03554180368025689
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.33,
      "acc_stderr": 0.047258156262526045,
      "acc_norm": 0.33,
      "acc_norm_stderr": 0.047258156262526045
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.2528301886792453,
      "acc_stderr": 0.026749899771241235,
      "acc_norm": 0.2528301886792453,
      "acc_norm_stderr": 0.026749899771241235
    },
    "hendrycksTest-college_biology": {
      "acc": 0.3263888888888889,
      "acc_stderr": 0.03921067198982266,
      "acc_norm": 0.3263888888888889,
      "acc_norm_stderr": 0.03921067198982266
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.23,
      "acc_stderr": 0.04229525846816506,
      "acc_norm": 0.23,
      "acc_norm_stderr": 0.04229525846816506
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.39,
      "acc_stderr": 0.04902071300001974,
      "acc_norm": 0.39,
      "acc_norm_stderr": 0.04902071300001974
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.3,
      "acc_stderr": 0.046056618647183814,
      "acc_norm": 0.3,
      "acc_norm_stderr": 0.046056618647183814
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.2543352601156069,
      "acc_stderr": 0.0332055644308557,
      "acc_norm": 0.2543352601156069,
      "acc_norm_stderr": 0.0332055644308557
    },
    "hendrycksTest-college_physics": {
      "acc": 0.22549019607843138,
      "acc_stderr": 0.041583075330832865,
      "acc_norm": 0.22549019607843138,
      "acc_norm_stderr": 0.041583075330832865
    },
    "hendrycksTest-computer_security": {
      "acc": 0.41,
      "acc_stderr": 0.04943110704237102,
      "acc_norm": 0.41,
      "acc_norm_stderr": 0.04943110704237102
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.28936170212765955,
      "acc_stderr": 0.02964400657700962,
      "acc_norm": 0.28936170212765955,
      "acc_norm_stderr": 0.02964400657700962
    },
    "hendrycksTest-econometrics": {
      "acc": 0.2894736842105263,
      "acc_stderr": 0.042663394431593935,
      "acc_norm": 0.2894736842105263,
      "acc_norm_stderr": 0.042663394431593935
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.2896551724137931,
      "acc_stderr": 0.037800192304380135,
      "acc_norm": 0.2896551724137931,
      "acc_norm_stderr": 0.037800192304380135
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.2777777777777778,
      "acc_stderr": 0.02306818884826111,
      "acc_norm": 0.2777777777777778,
      "acc_norm_stderr": 0.02306818884826111
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.3333333333333333,
      "acc_stderr": 0.042163702135578345,
      "acc_norm": 0.3333333333333333,
      "acc_norm_stderr": 0.042163702135578345
    },
    "hendrycksTest-global_facts": {
      "acc": 0.32,
      "acc_stderr": 0.046882617226215034,
      "acc_norm": 0.32,
      "acc_norm_stderr": 0.046882617226215034
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.26129032258064516,
      "acc_stderr": 0.024993053397764822,
      "acc_norm": 0.26129032258064516,
      "acc_norm_stderr": 0.024993053397764822
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.23645320197044334,
      "acc_stderr": 0.029896114291733552,
      "acc_norm": 0.23645320197044334,
      "acc_norm_stderr": 0.029896114291733552
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.35,
      "acc_stderr": 0.0479372485441102,
      "acc_norm": 0.35,
      "acc_norm_stderr": 0.0479372485441102
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.2787878787878788,
      "acc_stderr": 0.03501438706296781,
      "acc_norm": 0.2787878787878788,
      "acc_norm_stderr": 0.03501438706296781
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.21212121212121213,
      "acc_stderr": 0.029126522834586818,
      "acc_norm": 0.21212121212121213,
      "acc_norm_stderr": 0.029126522834586818
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.24870466321243523,
      "acc_stderr": 0.031195840877700314,
      "acc_norm": 0.24870466321243523,
      "acc_norm_stderr": 0.031195840877700314
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.2358974358974359,
      "acc_stderr": 0.02152596540740873,
      "acc_norm": 0.2358974358974359,
      "acc_norm_stderr": 0.02152596540740873
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.22962962962962963,
      "acc_stderr": 0.02564410863926764,
      "acc_norm": 0.22962962962962963,
      "acc_norm_stderr": 0.02564410863926764
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.2647058823529412,
      "acc_stderr": 0.02865749128507196,
      "acc_norm": 0.2647058823529412,
      "acc_norm_stderr": 0.02865749128507196
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.2847682119205298,
      "acc_stderr": 0.03684881521389023,
      "acc_norm": 0.2847682119205298,
      "acc_norm_stderr": 0.03684881521389023
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.26605504587155965,
      "acc_stderr": 0.01894602232222559,
      "acc_norm": 0.26605504587155965,
      "acc_norm_stderr": 0.01894602232222559
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.2916666666666667,
      "acc_stderr": 0.030998666304560524,
      "acc_norm": 0.2916666666666667,
      "acc_norm_stderr": 0.030998666304560524
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.27450980392156865,
      "acc_stderr": 0.03132179803083291,
      "acc_norm": 0.27450980392156865,
      "acc_norm_stderr": 0.03132179803083291
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.2869198312236287,
      "acc_stderr": 0.029443773022594693,
      "acc_norm": 0.2869198312236287,
      "acc_norm_stderr": 0.029443773022594693
    },
    "hendrycksTest-human_aging": {
      "acc": 0.28699551569506726,
      "acc_stderr": 0.03036037971029195,
      "acc_norm": 0.28699551569506726,
      "acc_norm_stderr": 0.03036037971029195
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.32061068702290074,
      "acc_stderr": 0.040933292298342784,
      "acc_norm": 0.32061068702290074,
      "acc_norm_stderr": 0.040933292298342784
    },
    "hendrycksTest-international_law": {
      "acc": 0.32231404958677684,
      "acc_stderr": 0.04266416363352167,
      "acc_norm": 0.32231404958677684,
      "acc_norm_stderr": 0.04266416363352167
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.3333333333333333,
      "acc_stderr": 0.04557239513497752,
      "acc_norm": 0.3333333333333333,
      "acc_norm_stderr": 0.04557239513497752
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.22699386503067484,
      "acc_stderr": 0.032910995786157686,
      "acc_norm": 0.22699386503067484,
      "acc_norm_stderr": 0.032910995786157686
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.2857142857142857,
      "acc_stderr": 0.04287858751340455,
      "acc_norm": 0.2857142857142857,
      "acc_norm_stderr": 0.04287858751340455
    },
    "hendrycksTest-management": {
      "acc": 0.14563106796116504,
      "acc_stderr": 0.0349260647662379,
      "acc_norm": 0.14563106796116504,
      "acc_norm_stderr": 0.0349260647662379
    },
    "hendrycksTest-marketing": {
      "acc": 0.3076923076923077,
      "acc_stderr": 0.030236389942173106,
      "acc_norm": 0.3076923076923077,
      "acc_norm_stderr": 0.030236389942173106
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.36,
      "acc_stderr": 0.048241815132442176,
      "acc_norm": 0.36,
      "acc_norm_stderr": 0.048241815132442176
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.30140485312899107,
      "acc_stderr": 0.01640909109726878,
      "acc_norm": 0.30140485312899107,
      "acc_norm_stderr": 0.01640909109726878
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.2976878612716763,
      "acc_stderr": 0.024617055388677003,
      "acc_norm": 0.2976878612716763,
      "acc_norm_stderr": 0.024617055388677003
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.25251396648044694,
      "acc_stderr": 0.014530330201468636,
      "acc_norm": 0.25251396648044694,
      "acc_norm_stderr": 0.014530330201468636
    },
    "hendrycksTest-nutrition": {
      "acc": 0.28104575163398693,
      "acc_stderr": 0.025738854797818716,
      "acc_norm": 0.28104575163398693,
      "acc_norm_stderr": 0.025738854797818716
    },
    "hendrycksTest-philosophy": {
      "acc": 0.3440514469453376,
      "acc_stderr": 0.026981478043648047,
      "acc_norm": 0.3440514469453376,
      "acc_norm_stderr": 0.026981478043648047
    },
    "hendrycksTest-prehistory": {
      "acc": 0.31790123456790126,
      "acc_stderr": 0.025910063528240868,
      "acc_norm": 0.31790123456790126,
      "acc_norm_stderr": 0.025910063528240868
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.2907801418439716,
      "acc_stderr": 0.027090664368353178,
      "acc_norm": 0.2907801418439716,
      "acc_norm_stderr": 0.027090664368353178
    },
    "hendrycksTest-professional_law": {
      "acc": 0.26988265971316816,
      "acc_stderr": 0.011337381084250394,
      "acc_norm": 0.26988265971316816,
      "acc_norm_stderr": 0.011337381084250394
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.1801470588235294,
      "acc_stderr": 0.023345163616544855,
      "acc_norm": 0.1801470588235294,
      "acc_norm_stderr": 0.023345163616544855
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.3022875816993464,
      "acc_stderr": 0.018579232711113884,
      "acc_norm": 0.3022875816993464,
      "acc_norm_stderr": 0.018579232711113884
    },
    "hendrycksTest-public_relations": {
      "acc": 0.32727272727272727,
      "acc_stderr": 0.04494290866252089,
      "acc_norm": 0.32727272727272727,
      "acc_norm_stderr": 0.04494290866252089
    },
    "hendrycksTest-security_studies": {
      "acc": 0.2163265306122449,
      "acc_stderr": 0.02635891633490403,
      "acc_norm": 0.2163265306122449,
      "acc_norm_stderr": 0.02635891633490403
    },
    "hendrycksTest-sociology": {
      "acc": 0.24378109452736318,
      "acc_stderr": 0.030360490154014666,
      "acc_norm": 0.24378109452736318,
      "acc_norm_stderr": 0.030360490154014666
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.33,
      "acc_stderr": 0.047258156262526045,
      "acc_norm": 0.33,
      "acc_norm_stderr": 0.047258156262526045
    },
    "hendrycksTest-virology": {
      "acc": 0.28313253012048195,
      "acc_stderr": 0.03507295431370518,
      "acc_norm": 0.28313253012048195,
      "acc_norm_stderr": 0.03507295431370518
    },
    "hendrycksTest-world_religions": {
      "acc": 0.3508771929824561,
      "acc_stderr": 0.036602988340491624,
      "acc_norm": 0.3508771929824561,
      "acc_norm_stderr": 0.036602988340491624
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=workdir_7b/ckpt_355",
    "num_fewshot": 5,
    "batch_size": "8",
    "batch_sizes": [],
    "device": null,
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
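
The dump reports only per-task scores, with no overall MMLU number. A minimal sketch of how the per-task accuracies could be macro-averaged with Python's standard library follows; the filename "results.json" is an assumption for where the JSON above is saved, and the average it prints is the unweighted mean over the 57 subtasks, not weighted by the number of questions in each task.

import json

# Load the harness output shown above (hypothetical path; adjust as needed).
with open("results.json") as f:
    data = json.load(f)

# Collect per-task accuracy and take the unweighted (macro) mean.
accs = [task["acc"] for task in data["results"].values()]
print(f"macro-average acc over {len(accs)} tasks: {sum(accs) / len(accs):.4f}")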