{ "results": { "hendrycksTest-abstract_algebra": { "acc": 0.29, "acc_stderr": 0.04560480215720684, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "hendrycksTest-anatomy": { "acc": 0.3333333333333333, "acc_stderr": 0.04072314811876837, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.04072314811876837 }, "hendrycksTest-astronomy": { "acc": 0.2894736842105263, "acc_stderr": 0.036906779861372814, "acc_norm": 0.2894736842105263, "acc_norm_stderr": 0.036906779861372814 }, "hendrycksTest-business_ethics": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "hendrycksTest-clinical_knowledge": { "acc": 0.3169811320754717, "acc_stderr": 0.028637235639800925, "acc_norm": 0.3169811320754717, "acc_norm_stderr": 0.028637235639800925 }, "hendrycksTest-college_biology": { "acc": 0.2986111111111111, "acc_stderr": 0.03827052357950756, "acc_norm": 0.2986111111111111, "acc_norm_stderr": 0.03827052357950756 }, "hendrycksTest-college_chemistry": { "acc": 0.22, "acc_stderr": 0.0416333199893227, "acc_norm": 0.22, "acc_norm_stderr": 0.0416333199893227 }, "hendrycksTest-college_computer_science": { "acc": 0.42, "acc_stderr": 0.04960449637488583, "acc_norm": 0.42, "acc_norm_stderr": 0.04960449637488583 }, "hendrycksTest-college_mathematics": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "hendrycksTest-college_medicine": { "acc": 0.2774566473988439, "acc_stderr": 0.03414014007044036, "acc_norm": 0.2774566473988439, "acc_norm_stderr": 0.03414014007044036 }, "hendrycksTest-college_physics": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237656, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237656 }, "hendrycksTest-computer_security": { "acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956911 }, "hendrycksTest-conceptual_physics": { "acc": 0.3191489361702128, "acc_stderr": 0.03047297336338004, "acc_norm": 0.3191489361702128, "acc_norm_stderr": 0.03047297336338004 }, "hendrycksTest-econometrics": { "acc": 0.2894736842105263, "acc_stderr": 0.042663394431593935, "acc_norm": 0.2894736842105263, "acc_norm_stderr": 0.042663394431593935 }, "hendrycksTest-electrical_engineering": { "acc": 0.27586206896551724, "acc_stderr": 0.037245636197746325, "acc_norm": 0.27586206896551724, "acc_norm_stderr": 0.037245636197746325 }, "hendrycksTest-elementary_mathematics": { "acc": 0.2698412698412698, "acc_stderr": 0.022860838309232072, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.022860838309232072 }, "hendrycksTest-formal_logic": { "acc": 0.31746031746031744, "acc_stderr": 0.04163453031302859, "acc_norm": 0.31746031746031744, "acc_norm_stderr": 0.04163453031302859 }, "hendrycksTest-global_facts": { "acc": 0.35, "acc_stderr": 0.04793724854411018, "acc_norm": 0.35, "acc_norm_stderr": 0.04793724854411018 }, "hendrycksTest-high_school_biology": { "acc": 0.25483870967741934, "acc_stderr": 0.024790118459332208, "acc_norm": 0.25483870967741934, "acc_norm_stderr": 0.024790118459332208 }, "hendrycksTest-high_school_chemistry": { "acc": 0.2512315270935961, "acc_stderr": 0.030516530732694433, "acc_norm": 0.2512315270935961, "acc_norm_stderr": 0.030516530732694433 }, "hendrycksTest-high_school_computer_science": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-high_school_european_history": { "acc": 0.2787878787878788, "acc_stderr": 0.03501438706296781, 
"acc_norm": 0.2787878787878788, "acc_norm_stderr": 0.03501438706296781 }, "hendrycksTest-high_school_geography": { "acc": 0.24242424242424243, "acc_stderr": 0.030532892233932026, "acc_norm": 0.24242424242424243, "acc_norm_stderr": 0.030532892233932026 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.22797927461139897, "acc_stderr": 0.03027690994517825, "acc_norm": 0.22797927461139897, "acc_norm_stderr": 0.03027690994517825 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2230769230769231, "acc_stderr": 0.021107730127243995, "acc_norm": 0.2230769230769231, "acc_norm_stderr": 0.021107730127243995 }, "hendrycksTest-high_school_mathematics": { "acc": 0.24814814814814815, "acc_stderr": 0.0263357394040558, "acc_norm": 0.24814814814814815, "acc_norm_stderr": 0.0263357394040558 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.23949579831932774, "acc_stderr": 0.02772206549336128, "acc_norm": 0.23949579831932774, "acc_norm_stderr": 0.02772206549336128 }, "hendrycksTest-high_school_physics": { "acc": 0.2847682119205298, "acc_stderr": 0.03684881521389023, "acc_norm": 0.2847682119205298, "acc_norm_stderr": 0.03684881521389023 }, "hendrycksTest-high_school_psychology": { "acc": 0.28256880733944956, "acc_stderr": 0.01930424349770715, "acc_norm": 0.28256880733944956, "acc_norm_stderr": 0.01930424349770715 }, "hendrycksTest-high_school_statistics": { "acc": 0.26851851851851855, "acc_stderr": 0.0302252261600124, "acc_norm": 0.26851851851851855, "acc_norm_stderr": 0.0302252261600124 }, "hendrycksTest-high_school_us_history": { "acc": 0.22058823529411764, "acc_stderr": 0.02910225438967409, "acc_norm": 0.22058823529411764, "acc_norm_stderr": 0.02910225438967409 }, "hendrycksTest-high_school_world_history": { "acc": 0.2869198312236287, "acc_stderr": 0.029443773022594693, "acc_norm": 0.2869198312236287, "acc_norm_stderr": 0.029443773022594693 }, "hendrycksTest-human_aging": { "acc": 0.28699551569506726, "acc_stderr": 0.030360379710291954, "acc_norm": 0.28699551569506726, "acc_norm_stderr": 0.030360379710291954 }, "hendrycksTest-human_sexuality": { "acc": 0.2748091603053435, "acc_stderr": 0.03915345408847835, "acc_norm": 0.2748091603053435, "acc_norm_stderr": 0.03915345408847835 }, "hendrycksTest-international_law": { "acc": 0.3884297520661157, "acc_stderr": 0.04449270350068382, "acc_norm": 0.3884297520661157, "acc_norm_stderr": 0.04449270350068382 }, "hendrycksTest-jurisprudence": { "acc": 0.3425925925925926, "acc_stderr": 0.045879047413018105, "acc_norm": 0.3425925925925926, "acc_norm_stderr": 0.045879047413018105 }, "hendrycksTest-logical_fallacies": { "acc": 0.22699386503067484, "acc_stderr": 0.032910995786157686, "acc_norm": 0.22699386503067484, "acc_norm_stderr": 0.032910995786157686 }, "hendrycksTest-machine_learning": { "acc": 0.26785714285714285, "acc_stderr": 0.04203277291467763, "acc_norm": 0.26785714285714285, "acc_norm_stderr": 0.04203277291467763 }, "hendrycksTest-management": { "acc": 0.2621359223300971, "acc_stderr": 0.04354631077260597, "acc_norm": 0.2621359223300971, "acc_norm_stderr": 0.04354631077260597 }, "hendrycksTest-marketing": { "acc": 0.34615384615384615, "acc_stderr": 0.0311669573672359, "acc_norm": 0.34615384615384615, "acc_norm_stderr": 0.0311669573672359 }, "hendrycksTest-medical_genetics": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "hendrycksTest-miscellaneous": { "acc": 0.34227330779054915, "acc_stderr": 0.016967031766413614, "acc_norm": 0.34227330779054915, "acc_norm_stderr": 
0.016967031766413614 }, "hendrycksTest-moral_disputes": { "acc": 0.30057803468208094, "acc_stderr": 0.02468531686725781, "acc_norm": 0.30057803468208094, "acc_norm_stderr": 0.02468531686725781 }, "hendrycksTest-moral_scenarios": { "acc": 0.2435754189944134, "acc_stderr": 0.014355911964767864, "acc_norm": 0.2435754189944134, "acc_norm_stderr": 0.014355911964767864 }, "hendrycksTest-nutrition": { "acc": 0.30392156862745096, "acc_stderr": 0.02633661346904664, "acc_norm": 0.30392156862745096, "acc_norm_stderr": 0.02633661346904664 }, "hendrycksTest-philosophy": { "acc": 0.36012861736334406, "acc_stderr": 0.027264297599804012, "acc_norm": 0.36012861736334406, "acc_norm_stderr": 0.027264297599804012 }, "hendrycksTest-prehistory": { "acc": 0.3117283950617284, "acc_stderr": 0.02577311116963045, "acc_norm": 0.3117283950617284, "acc_norm_stderr": 0.02577311116963045 }, "hendrycksTest-professional_accounting": { "acc": 0.2695035460992908, "acc_stderr": 0.026469036818590638, "acc_norm": 0.2695035460992908, "acc_norm_stderr": 0.026469036818590638 }, "hendrycksTest-professional_law": { "acc": 0.27640156453715775, "acc_stderr": 0.011422153194553582, "acc_norm": 0.27640156453715775, "acc_norm_stderr": 0.011422153194553582 }, "hendrycksTest-professional_medicine": { "acc": 0.20588235294117646, "acc_stderr": 0.024562204314142314, "acc_norm": 0.20588235294117646, "acc_norm_stderr": 0.024562204314142314 }, "hendrycksTest-professional_psychology": { "acc": 0.29411764705882354, "acc_stderr": 0.018433427649401903, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.018433427649401903 }, "hendrycksTest-public_relations": { "acc": 0.37272727272727274, "acc_stderr": 0.04631381319425463, "acc_norm": 0.37272727272727274, "acc_norm_stderr": 0.04631381319425463 }, "hendrycksTest-security_studies": { "acc": 0.2530612244897959, "acc_stderr": 0.027833023871399683, "acc_norm": 0.2530612244897959, "acc_norm_stderr": 0.027833023871399683 }, "hendrycksTest-sociology": { "acc": 0.26865671641791045, "acc_stderr": 0.03134328358208955, "acc_norm": 0.26865671641791045, "acc_norm_stderr": 0.03134328358208955 }, "hendrycksTest-us_foreign_policy": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "hendrycksTest-virology": { "acc": 0.3614457831325301, "acc_stderr": 0.037400593820293204, "acc_norm": 0.3614457831325301, "acc_norm_stderr": 0.037400593820293204 }, "hendrycksTest-world_religions": { "acc": 0.3567251461988304, "acc_stderr": 0.03674013002860954, "acc_norm": 0.3567251461988304, "acc_norm_stderr": 0.03674013002860954 } }, "versions": { "hendrycksTest-abstract_algebra": 1, "hendrycksTest-anatomy": 1, "hendrycksTest-astronomy": 1, "hendrycksTest-business_ethics": 1, "hendrycksTest-clinical_knowledge": 1, "hendrycksTest-college_biology": 1, "hendrycksTest-college_chemistry": 1, "hendrycksTest-college_computer_science": 1, "hendrycksTest-college_mathematics": 1, "hendrycksTest-college_medicine": 1, "hendrycksTest-college_physics": 1, "hendrycksTest-computer_security": 1, "hendrycksTest-conceptual_physics": 1, "hendrycksTest-econometrics": 1, "hendrycksTest-electrical_engineering": 1, "hendrycksTest-elementary_mathematics": 1, "hendrycksTest-formal_logic": 1, "hendrycksTest-global_facts": 1, "hendrycksTest-high_school_biology": 1, "hendrycksTest-high_school_chemistry": 1, "hendrycksTest-high_school_computer_science": 1, "hendrycksTest-high_school_european_history": 1, "hendrycksTest-high_school_geography": 1, "hendrycksTest-high_school_government_and_politics": 1, 
"hendrycksTest-high_school_macroeconomics": 1, "hendrycksTest-high_school_mathematics": 1, "hendrycksTest-high_school_microeconomics": 1, "hendrycksTest-high_school_physics": 1, "hendrycksTest-high_school_psychology": 1, "hendrycksTest-high_school_statistics": 1, "hendrycksTest-high_school_us_history": 1, "hendrycksTest-high_school_world_history": 1, "hendrycksTest-human_aging": 1, "hendrycksTest-human_sexuality": 1, "hendrycksTest-international_law": 1, "hendrycksTest-jurisprudence": 1, "hendrycksTest-logical_fallacies": 1, "hendrycksTest-machine_learning": 1, "hendrycksTest-management": 1, "hendrycksTest-marketing": 1, "hendrycksTest-medical_genetics": 1, "hendrycksTest-miscellaneous": 1, "hendrycksTest-moral_disputes": 1, "hendrycksTest-moral_scenarios": 1, "hendrycksTest-nutrition": 1, "hendrycksTest-philosophy": 1, "hendrycksTest-prehistory": 1, "hendrycksTest-professional_accounting": 1, "hendrycksTest-professional_law": 1, "hendrycksTest-professional_medicine": 1, "hendrycksTest-professional_psychology": 1, "hendrycksTest-public_relations": 1, "hendrycksTest-security_studies": 1, "hendrycksTest-sociology": 1, "hendrycksTest-us_foreign_policy": 1, "hendrycksTest-virology": 1, "hendrycksTest-world_religions": 1 }, "config": { "model": "hf-causal", "model_args": "pretrained=workdir_7b/ckpt_354", "num_fewshot": 5, "batch_size": "8", "batch_sizes": [], "device": null, "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }