{ "results": { "hendrycksTest-abstract_algebra": { "acc": 0.28, "acc_stderr": 0.045126085985421296, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421296 }, "hendrycksTest-anatomy": { "acc": 0.3037037037037037, "acc_stderr": 0.03972552884785136, "acc_norm": 0.3037037037037037, "acc_norm_stderr": 0.03972552884785136 }, "hendrycksTest-astronomy": { "acc": 0.27631578947368424, "acc_stderr": 0.03639057569952925, "acc_norm": 0.27631578947368424, "acc_norm_stderr": 0.03639057569952925 }, "hendrycksTest-business_ethics": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "hendrycksTest-clinical_knowledge": { "acc": 0.3660377358490566, "acc_stderr": 0.02964781353936525, "acc_norm": 0.3660377358490566, "acc_norm_stderr": 0.02964781353936525 }, "hendrycksTest-college_biology": { "acc": 0.2986111111111111, "acc_stderr": 0.038270523579507554, "acc_norm": 0.2986111111111111, "acc_norm_stderr": 0.038270523579507554 }, "hendrycksTest-college_chemistry": { "acc": 0.28, "acc_stderr": 0.04512608598542127, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "hendrycksTest-college_computer_science": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "hendrycksTest-college_mathematics": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-college_medicine": { "acc": 0.2947976878612717, "acc_stderr": 0.034765996075164785, "acc_norm": 0.2947976878612717, "acc_norm_stderr": 0.034765996075164785 }, "hendrycksTest-college_physics": { "acc": 0.19607843137254902, "acc_stderr": 0.03950581861179961, "acc_norm": 0.19607843137254902, "acc_norm_stderr": 0.03950581861179961 }, "hendrycksTest-computer_security": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "hendrycksTest-conceptual_physics": { "acc": 0.33191489361702126, "acc_stderr": 0.03078373675774565, "acc_norm": 0.33191489361702126, "acc_norm_stderr": 0.03078373675774565 }, "hendrycksTest-econometrics": { "acc": 0.2807017543859649, "acc_stderr": 0.042270544512322004, "acc_norm": 0.2807017543859649, "acc_norm_stderr": 0.042270544512322004 }, "hendrycksTest-electrical_engineering": { "acc": 0.3310344827586207, "acc_stderr": 0.03921545312467122, "acc_norm": 0.3310344827586207, "acc_norm_stderr": 0.03921545312467122 }, "hendrycksTest-elementary_mathematics": { "acc": 0.2698412698412698, "acc_stderr": 0.022860838309232072, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.022860838309232072 }, "hendrycksTest-formal_logic": { "acc": 0.3333333333333333, "acc_stderr": 0.042163702135578345, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.042163702135578345 }, "hendrycksTest-global_facts": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "hendrycksTest-high_school_biology": { "acc": 0.25161290322580643, "acc_stderr": 0.024685979286239938, "acc_norm": 0.25161290322580643, "acc_norm_stderr": 0.024685979286239938 }, "hendrycksTest-high_school_chemistry": { "acc": 0.270935960591133, "acc_stderr": 0.031270907132976984, "acc_norm": 0.270935960591133, "acc_norm_stderr": 0.031270907132976984 }, "hendrycksTest-high_school_computer_science": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "hendrycksTest-high_school_european_history": { "acc": 0.3151515151515151, "acc_stderr": 0.0362773057502241, "acc_norm": 0.3151515151515151, "acc_norm_stderr": 0.0362773057502241 }, "hendrycksTest-high_school_geography": { "acc": 0.3181818181818182, "acc_stderr": 0.03318477333845331, "acc_norm": 0.3181818181818182, "acc_norm_stderr": 0.03318477333845331 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.24352331606217617, "acc_stderr": 0.030975436386845426, "acc_norm": 0.24352331606217617, "acc_norm_stderr": 0.030975436386845426 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2512820512820513, "acc_stderr": 0.021992016662370554, "acc_norm": 0.2512820512820513, "acc_norm_stderr": 0.021992016662370554 }, "hendrycksTest-high_school_mathematics": { "acc": 0.22962962962962963, "acc_stderr": 0.02564410863926763, "acc_norm": 0.22962962962962963, "acc_norm_stderr": 0.02564410863926763 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.29411764705882354, "acc_stderr": 0.029597329730978103, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.029597329730978103 }, "hendrycksTest-high_school_physics": { "acc": 0.2847682119205298, "acc_stderr": 0.03684881521389023, "acc_norm": 0.2847682119205298, "acc_norm_stderr": 0.03684881521389023 }, "hendrycksTest-high_school_psychology": { "acc": 0.3192660550458716, "acc_stderr": 0.01998782906975001, "acc_norm": 0.3192660550458716, "acc_norm_stderr": 0.01998782906975001 }, "hendrycksTest-high_school_statistics": { "acc": 0.2916666666666667, "acc_stderr": 0.030998666304560524, "acc_norm": 0.2916666666666667, "acc_norm_stderr": 0.030998666304560524 }, "hendrycksTest-high_school_us_history": { "acc": 0.2647058823529412, "acc_stderr": 0.03096451792692341, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.03096451792692341 }, "hendrycksTest-high_school_world_history": { "acc": 0.26582278481012656, "acc_stderr": 0.028756799629658335, "acc_norm": 0.26582278481012656, "acc_norm_stderr": 0.028756799629658335 }, "hendrycksTest-human_aging": { "acc": 0.2556053811659193, "acc_stderr": 0.029275891003969927, "acc_norm": 0.2556053811659193, "acc_norm_stderr": 0.029275891003969927 }, "hendrycksTest-human_sexuality": { "acc": 0.3053435114503817, "acc_stderr": 0.04039314978724561, "acc_norm": 0.3053435114503817, "acc_norm_stderr": 0.04039314978724561 }, "hendrycksTest-international_law": { "acc": 0.38016528925619836, "acc_stderr": 0.04431324501968431, "acc_norm": 0.38016528925619836, "acc_norm_stderr": 0.04431324501968431 }, "hendrycksTest-jurisprudence": { "acc": 0.3333333333333333, "acc_stderr": 0.04557239513497751, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.04557239513497751 }, "hendrycksTest-logical_fallacies": { "acc": 0.2147239263803681, "acc_stderr": 0.03226219377286774, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774 }, "hendrycksTest-machine_learning": { "acc": 0.26785714285714285, "acc_stderr": 0.04203277291467764, "acc_norm": 0.26785714285714285, "acc_norm_stderr": 0.04203277291467764 }, "hendrycksTest-management": { "acc": 0.2621359223300971, "acc_stderr": 0.04354631077260597, "acc_norm": 0.2621359223300971, "acc_norm_stderr": 0.04354631077260597 }, "hendrycksTest-marketing": { "acc": 0.34615384615384615, "acc_stderr": 0.031166957367235897, "acc_norm": 0.34615384615384615, "acc_norm_stderr": 0.031166957367235897 }, "hendrycksTest-medical_genetics": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "hendrycksTest-miscellaneous": { "acc": 0.3269476372924649, "acc_stderr": 0.016774908180131463, "acc_norm": 0.3269476372924649, "acc_norm_stderr": 0.016774908180131463 }, "hendrycksTest-moral_disputes": { "acc": 0.32947976878612717, "acc_stderr": 0.0253052581318797, "acc_norm": 0.32947976878612717, "acc_norm_stderr": 0.0253052581318797 }, "hendrycksTest-moral_scenarios": { "acc": 0.24134078212290502, "acc_stderr": 0.014310999547961455, "acc_norm": 0.24134078212290502, "acc_norm_stderr": 0.014310999547961455 }, "hendrycksTest-nutrition": { "acc": 0.33986928104575165, "acc_stderr": 0.027121956071388856, "acc_norm": 0.33986928104575165, "acc_norm_stderr": 0.027121956071388856 }, "hendrycksTest-philosophy": { "acc": 0.2861736334405145, "acc_stderr": 0.025670259242188936, "acc_norm": 0.2861736334405145, "acc_norm_stderr": 0.025670259242188936 }, "hendrycksTest-prehistory": { "acc": 0.26851851851851855, "acc_stderr": 0.024659685185967273, "acc_norm": 0.26851851851851855, "acc_norm_stderr": 0.024659685185967273 }, "hendrycksTest-professional_accounting": { "acc": 0.30141843971631205, "acc_stderr": 0.02737412888263115, "acc_norm": 0.30141843971631205, "acc_norm_stderr": 0.02737412888263115 }, "hendrycksTest-professional_law": { "acc": 0.2796610169491525, "acc_stderr": 0.011463397393861947, "acc_norm": 0.2796610169491525, "acc_norm_stderr": 0.011463397393861947 }, "hendrycksTest-professional_medicine": { "acc": 0.20588235294117646, "acc_stderr": 0.024562204314142314, "acc_norm": 0.20588235294117646, "acc_norm_stderr": 0.024562204314142314 }, "hendrycksTest-professional_psychology": { "acc": 0.2957516339869281, "acc_stderr": 0.018463154132632806, "acc_norm": 0.2957516339869281, "acc_norm_stderr": 0.018463154132632806 }, "hendrycksTest-public_relations": { "acc": 0.35454545454545455, "acc_stderr": 0.04582004841505416, "acc_norm": 0.35454545454545455, "acc_norm_stderr": 0.04582004841505416 }, "hendrycksTest-security_studies": { "acc": 0.31020408163265306, "acc_stderr": 0.029613459872484378, "acc_norm": 0.31020408163265306, "acc_norm_stderr": 0.029613459872484378 }, "hendrycksTest-sociology": { "acc": 0.31343283582089554, "acc_stderr": 0.032801882053486435, "acc_norm": 0.31343283582089554, "acc_norm_stderr": 0.032801882053486435 }, "hendrycksTest-us_foreign_policy": { "acc": 0.35, "acc_stderr": 0.0479372485441102, "acc_norm": 0.35, "acc_norm_stderr": 0.0479372485441102 }, "hendrycksTest-virology": { "acc": 0.3674698795180723, "acc_stderr": 0.03753267402120574, "acc_norm": 0.3674698795180723, "acc_norm_stderr": 0.03753267402120574 }, "hendrycksTest-world_religions": { "acc": 0.36257309941520466, "acc_stderr": 0.0368713061556206, "acc_norm": 0.36257309941520466, "acc_norm_stderr": 0.0368713061556206 } }, "versions": { "hendrycksTest-abstract_algebra": 1, "hendrycksTest-anatomy": 1, "hendrycksTest-astronomy": 1, "hendrycksTest-business_ethics": 1, "hendrycksTest-clinical_knowledge": 1, "hendrycksTest-college_biology": 1, "hendrycksTest-college_chemistry": 1, "hendrycksTest-college_computer_science": 1, "hendrycksTest-college_mathematics": 1, "hendrycksTest-college_medicine": 1, "hendrycksTest-college_physics": 1, "hendrycksTest-computer_security": 1, "hendrycksTest-conceptual_physics": 1, "hendrycksTest-econometrics": 1, "hendrycksTest-electrical_engineering": 1, "hendrycksTest-elementary_mathematics": 1, "hendrycksTest-formal_logic": 1, "hendrycksTest-global_facts": 1, "hendrycksTest-high_school_biology": 1, "hendrycksTest-high_school_chemistry": 1, "hendrycksTest-high_school_computer_science": 1, "hendrycksTest-high_school_european_history": 1, "hendrycksTest-high_school_geography": 1, "hendrycksTest-high_school_government_and_politics": 1, "hendrycksTest-high_school_macroeconomics": 1, "hendrycksTest-high_school_mathematics": 1, "hendrycksTest-high_school_microeconomics": 1, "hendrycksTest-high_school_physics": 1, "hendrycksTest-high_school_psychology": 1, "hendrycksTest-high_school_statistics": 1, "hendrycksTest-high_school_us_history": 1, "hendrycksTest-high_school_world_history": 1, "hendrycksTest-human_aging": 1, "hendrycksTest-human_sexuality": 1, "hendrycksTest-international_law": 1, "hendrycksTest-jurisprudence": 1, "hendrycksTest-logical_fallacies": 1, "hendrycksTest-machine_learning": 1, "hendrycksTest-management": 1, "hendrycksTest-marketing": 1, "hendrycksTest-medical_genetics": 1, "hendrycksTest-miscellaneous": 1, "hendrycksTest-moral_disputes": 1, "hendrycksTest-moral_scenarios": 1, "hendrycksTest-nutrition": 1, "hendrycksTest-philosophy": 1, "hendrycksTest-prehistory": 1, "hendrycksTest-professional_accounting": 1, "hendrycksTest-professional_law": 1, "hendrycksTest-professional_medicine": 1, "hendrycksTest-professional_psychology": 1, "hendrycksTest-public_relations": 1, "hendrycksTest-security_studies": 1, "hendrycksTest-sociology": 1, "hendrycksTest-us_foreign_policy": 1, "hendrycksTest-virology": 1, "hendrycksTest-world_religions": 1 }, "config": { "model": "hf-causal", "model_args": "pretrained=workdir_7b/ckpt_352", "num_fewshot": 5, "batch_size": "8", "batch_sizes": [], "device": null, "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }