{ "results": { "hendrycksTest-abstract_algebra": { "acc": 0.29, "acc_stderr": 0.04560480215720684, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "hendrycksTest-anatomy": { "acc": 0.2222222222222222, "acc_stderr": 0.035914440841969694, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.035914440841969694 }, "hendrycksTest-astronomy": { "acc": 0.26973684210526316, "acc_stderr": 0.03611780560284898, "acc_norm": 0.26973684210526316, "acc_norm_stderr": 0.03611780560284898 }, "hendrycksTest-business_ethics": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-clinical_knowledge": { "acc": 0.2188679245283019, "acc_stderr": 0.02544786382510863, "acc_norm": 0.2188679245283019, "acc_norm_stderr": 0.02544786382510863 }, "hendrycksTest-college_biology": { "acc": 0.2569444444444444, "acc_stderr": 0.03653946969442099, "acc_norm": 0.2569444444444444, "acc_norm_stderr": 0.03653946969442099 }, "hendrycksTest-college_chemistry": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "hendrycksTest-college_computer_science": { "acc": 0.43, "acc_stderr": 0.04975698519562429, "acc_norm": 0.43, "acc_norm_stderr": 0.04975698519562429 }, "hendrycksTest-college_mathematics": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "hendrycksTest-college_medicine": { "acc": 0.2543352601156069, "acc_stderr": 0.0332055644308557, "acc_norm": 0.2543352601156069, "acc_norm_stderr": 0.0332055644308557 }, "hendrycksTest-college_physics": { "acc": 0.22549019607843138, "acc_stderr": 0.041583075330832865, "acc_norm": 0.22549019607843138, "acc_norm_stderr": 0.041583075330832865 }, "hendrycksTest-computer_security": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-conceptual_physics": { "acc": 0.28936170212765955, "acc_stderr": 0.02964400657700962, "acc_norm": 0.28936170212765955, "acc_norm_stderr": 0.02964400657700962 }, "hendrycksTest-econometrics": { "acc": 0.2631578947368421, "acc_stderr": 0.041424397194893624, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.041424397194893624 }, "hendrycksTest-electrical_engineering": { "acc": 0.2689655172413793, "acc_stderr": 0.03695183311650232, "acc_norm": 0.2689655172413793, "acc_norm_stderr": 0.03695183311650232 }, "hendrycksTest-elementary_mathematics": { "acc": 0.26455026455026454, "acc_stderr": 0.022717467897708624, "acc_norm": 0.26455026455026454, "acc_norm_stderr": 0.022717467897708624 }, "hendrycksTest-formal_logic": { "acc": 0.29365079365079366, "acc_stderr": 0.04073524322147127, "acc_norm": 0.29365079365079366, "acc_norm_stderr": 0.04073524322147127 }, "hendrycksTest-global_facts": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "hendrycksTest-high_school_biology": { "acc": 0.22903225806451613, "acc_stderr": 0.023904914311782648, "acc_norm": 0.22903225806451613, "acc_norm_stderr": 0.023904914311782648 }, "hendrycksTest-high_school_chemistry": { "acc": 0.16748768472906403, "acc_stderr": 0.026273086047535418, "acc_norm": 0.16748768472906403, "acc_norm_stderr": 0.026273086047535418 }, "hendrycksTest-high_school_computer_science": { "acc": 0.29, "acc_stderr": 0.04560480215720683, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720683 }, "hendrycksTest-high_school_european_history": { "acc": 0.2787878787878788, "acc_stderr": 
0.03501438706296781, "acc_norm": 0.2787878787878788, "acc_norm_stderr": 0.03501438706296781 }, "hendrycksTest-high_school_geography": { "acc": 0.17676767676767677, "acc_stderr": 0.027178752639044915, "acc_norm": 0.17676767676767677, "acc_norm_stderr": 0.027178752639044915 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.22797927461139897, "acc_stderr": 0.030276909945178274, "acc_norm": 0.22797927461139897, "acc_norm_stderr": 0.030276909945178274 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2153846153846154, "acc_stderr": 0.020843034557462874, "acc_norm": 0.2153846153846154, "acc_norm_stderr": 0.020843034557462874 }, "hendrycksTest-high_school_mathematics": { "acc": 0.21851851851851853, "acc_stderr": 0.02519575225182379, "acc_norm": 0.21851851851851853, "acc_norm_stderr": 0.02519575225182379 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.2605042016806723, "acc_stderr": 0.028510251512341937, "acc_norm": 0.2605042016806723, "acc_norm_stderr": 0.028510251512341937 }, "hendrycksTest-high_school_physics": { "acc": 0.2781456953642384, "acc_stderr": 0.03658603262763743, "acc_norm": 0.2781456953642384, "acc_norm_stderr": 0.03658603262763743 }, "hendrycksTest-high_school_psychology": { "acc": 0.23119266055045873, "acc_stderr": 0.018075750241633156, "acc_norm": 0.23119266055045873, "acc_norm_stderr": 0.018075750241633156 }, "hendrycksTest-high_school_statistics": { "acc": 0.25925925925925924, "acc_stderr": 0.029886910547626974, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.029886910547626974 }, "hendrycksTest-high_school_us_history": { "acc": 0.28921568627450983, "acc_stderr": 0.03182231867647553, "acc_norm": 0.28921568627450983, "acc_norm_stderr": 0.03182231867647553 }, "hendrycksTest-high_school_world_history": { "acc": 0.3037974683544304, "acc_stderr": 0.029936696387138608, "acc_norm": 0.3037974683544304, "acc_norm_stderr": 0.029936696387138608 }, "hendrycksTest-human_aging": { "acc": 0.3273542600896861, "acc_stderr": 0.031493846709941306, "acc_norm": 0.3273542600896861, "acc_norm_stderr": 0.031493846709941306 }, "hendrycksTest-human_sexuality": { "acc": 0.3435114503816794, "acc_stderr": 0.041649760719448786, "acc_norm": 0.3435114503816794, "acc_norm_stderr": 0.041649760719448786 }, "hendrycksTest-international_law": { "acc": 0.24793388429752067, "acc_stderr": 0.03941897526516302, "acc_norm": 0.24793388429752067, "acc_norm_stderr": 0.03941897526516302 }, "hendrycksTest-jurisprudence": { "acc": 0.28703703703703703, "acc_stderr": 0.04373313040914761, "acc_norm": 0.28703703703703703, "acc_norm_stderr": 0.04373313040914761 }, "hendrycksTest-logical_fallacies": { "acc": 0.2147239263803681, "acc_stderr": 0.03226219377286774, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774 }, "hendrycksTest-machine_learning": { "acc": 0.2857142857142857, "acc_stderr": 0.04287858751340456, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.04287858751340456 }, "hendrycksTest-management": { "acc": 0.18446601941747573, "acc_stderr": 0.03840423627288276, "acc_norm": 0.18446601941747573, "acc_norm_stderr": 0.03840423627288276 }, "hendrycksTest-marketing": { "acc": 0.3162393162393162, "acc_stderr": 0.030463656747340244, "acc_norm": 0.3162393162393162, "acc_norm_stderr": 0.030463656747340244 }, "hendrycksTest-medical_genetics": { "acc": 0.39, "acc_stderr": 0.04902071300001974, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974 }, "hendrycksTest-miscellaneous": { "acc": 0.2541507024265645, "acc_stderr": 0.015569254692045766, "acc_norm": 
0.2541507024265645, "acc_norm_stderr": 0.015569254692045766 }, "hendrycksTest-moral_disputes": { "acc": 0.3236994219653179, "acc_stderr": 0.025190181327608415, "acc_norm": 0.3236994219653179, "acc_norm_stderr": 0.025190181327608415 }, "hendrycksTest-moral_scenarios": { "acc": 0.23798882681564246, "acc_stderr": 0.014242630070574917, "acc_norm": 0.23798882681564246, "acc_norm_stderr": 0.014242630070574917 }, "hendrycksTest-nutrition": { "acc": 0.2647058823529412, "acc_stderr": 0.025261691219729505, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.025261691219729505 }, "hendrycksTest-philosophy": { "acc": 0.26688102893890675, "acc_stderr": 0.025122637608816653, "acc_norm": 0.26688102893890675, "acc_norm_stderr": 0.025122637608816653 }, "hendrycksTest-prehistory": { "acc": 0.2962962962962963, "acc_stderr": 0.025407197798890162, "acc_norm": 0.2962962962962963, "acc_norm_stderr": 0.025407197798890162 }, "hendrycksTest-professional_accounting": { "acc": 0.2730496453900709, "acc_stderr": 0.02657786094330786, "acc_norm": 0.2730496453900709, "acc_norm_stderr": 0.02657786094330786 }, "hendrycksTest-professional_law": { "acc": 0.2940026075619296, "acc_stderr": 0.011636062953698609, "acc_norm": 0.2940026075619296, "acc_norm_stderr": 0.011636062953698609 }, "hendrycksTest-professional_medicine": { "acc": 0.19117647058823528, "acc_stderr": 0.023886881922440345, "acc_norm": 0.19117647058823528, "acc_norm_stderr": 0.023886881922440345 }, "hendrycksTest-professional_psychology": { "acc": 0.29411764705882354, "acc_stderr": 0.0184334276494019, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.0184334276494019 }, "hendrycksTest-public_relations": { "acc": 0.33636363636363636, "acc_stderr": 0.04525393596302506, "acc_norm": 0.33636363636363636, "acc_norm_stderr": 0.04525393596302506 }, "hendrycksTest-security_studies": { "acc": 0.21224489795918366, "acc_stderr": 0.026176967197866767, "acc_norm": 0.21224489795918366, "acc_norm_stderr": 0.026176967197866767 }, "hendrycksTest-sociology": { "acc": 0.25870646766169153, "acc_stderr": 0.030965903123573012, "acc_norm": 0.25870646766169153, "acc_norm_stderr": 0.030965903123573012 }, "hendrycksTest-us_foreign_policy": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-virology": { "acc": 0.27710843373493976, "acc_stderr": 0.034843315926805875, "acc_norm": 0.27710843373493976, "acc_norm_stderr": 0.034843315926805875 }, "hendrycksTest-world_religions": { "acc": 0.38011695906432746, "acc_stderr": 0.037229657413855394, "acc_norm": 0.38011695906432746, "acc_norm_stderr": 0.037229657413855394 } }, "versions": { "hendrycksTest-abstract_algebra": 1, "hendrycksTest-anatomy": 1, "hendrycksTest-astronomy": 1, "hendrycksTest-business_ethics": 1, "hendrycksTest-clinical_knowledge": 1, "hendrycksTest-college_biology": 1, "hendrycksTest-college_chemistry": 1, "hendrycksTest-college_computer_science": 1, "hendrycksTest-college_mathematics": 1, "hendrycksTest-college_medicine": 1, "hendrycksTest-college_physics": 1, "hendrycksTest-computer_security": 1, "hendrycksTest-conceptual_physics": 1, "hendrycksTest-econometrics": 1, "hendrycksTest-electrical_engineering": 1, "hendrycksTest-elementary_mathematics": 1, "hendrycksTest-formal_logic": 1, "hendrycksTest-global_facts": 1, "hendrycksTest-high_school_biology": 1, "hendrycksTest-high_school_chemistry": 1, "hendrycksTest-high_school_computer_science": 1, "hendrycksTest-high_school_european_history": 1, "hendrycksTest-high_school_geography": 1, 
"hendrycksTest-high_school_government_and_politics": 1, "hendrycksTest-high_school_macroeconomics": 1, "hendrycksTest-high_school_mathematics": 1, "hendrycksTest-high_school_microeconomics": 1, "hendrycksTest-high_school_physics": 1, "hendrycksTest-high_school_psychology": 1, "hendrycksTest-high_school_statistics": 1, "hendrycksTest-high_school_us_history": 1, "hendrycksTest-high_school_world_history": 1, "hendrycksTest-human_aging": 1, "hendrycksTest-human_sexuality": 1, "hendrycksTest-international_law": 1, "hendrycksTest-jurisprudence": 1, "hendrycksTest-logical_fallacies": 1, "hendrycksTest-machine_learning": 1, "hendrycksTest-management": 1, "hendrycksTest-marketing": 1, "hendrycksTest-medical_genetics": 1, "hendrycksTest-miscellaneous": 1, "hendrycksTest-moral_disputes": 1, "hendrycksTest-moral_scenarios": 1, "hendrycksTest-nutrition": 1, "hendrycksTest-philosophy": 1, "hendrycksTest-prehistory": 1, "hendrycksTest-professional_accounting": 1, "hendrycksTest-professional_law": 1, "hendrycksTest-professional_medicine": 1, "hendrycksTest-professional_psychology": 1, "hendrycksTest-public_relations": 1, "hendrycksTest-security_studies": 1, "hendrycksTest-sociology": 1, "hendrycksTest-us_foreign_policy": 1, "hendrycksTest-virology": 1, "hendrycksTest-world_religions": 1 }, "config": { "model": "hf-causal", "model_args": "pretrained=workdir_7b/ckpt_350", "num_fewshot": 5, "batch_size": "8", "batch_sizes": [], "device": null, "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }