{
"results": {
"hendrycksTest-abstract_algebra": {
"acc": 0.28,
"acc_stderr": 0.045126085985421296,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421296
},
"hendrycksTest-anatomy": {
"acc": 0.3037037037037037,
"acc_stderr": 0.03972552884785136,
"acc_norm": 0.3037037037037037,
"acc_norm_stderr": 0.03972552884785136
},
"hendrycksTest-astronomy": {
"acc": 0.27631578947368424,
"acc_stderr": 0.03639057569952925,
"acc_norm": 0.27631578947368424,
"acc_norm_stderr": 0.03639057569952925
},
"hendrycksTest-business_ethics": {
"acc": 0.35,
"acc_stderr": 0.047937248544110196,
"acc_norm": 0.35,
"acc_norm_stderr": 0.047937248544110196
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.3660377358490566,
"acc_stderr": 0.02964781353936525,
"acc_norm": 0.3660377358490566,
"acc_norm_stderr": 0.02964781353936525
},
"hendrycksTest-college_biology": {
"acc": 0.2986111111111111,
"acc_stderr": 0.038270523579507554,
"acc_norm": 0.2986111111111111,
"acc_norm_stderr": 0.038270523579507554
},
"hendrycksTest-college_chemistry": {
"acc": 0.28,
"acc_stderr": 0.04512608598542127,
"acc_norm": 0.28,
"acc_norm_stderr": 0.04512608598542127
},
"hendrycksTest-college_computer_science": {
"acc": 0.39,
"acc_stderr": 0.04902071300001975,
"acc_norm": 0.39,
"acc_norm_stderr": 0.04902071300001975
},
"hendrycksTest-college_mathematics": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.31,
"acc_norm_stderr": 0.04648231987117316
},
"hendrycksTest-college_medicine": {
"acc": 0.2947976878612717,
"acc_stderr": 0.034765996075164785,
"acc_norm": 0.2947976878612717,
"acc_norm_stderr": 0.034765996075164785
},
"hendrycksTest-college_physics": {
"acc": 0.19607843137254902,
"acc_stderr": 0.03950581861179961,
"acc_norm": 0.19607843137254902,
"acc_norm_stderr": 0.03950581861179961
},
"hendrycksTest-computer_security": {
"acc": 0.41,
"acc_stderr": 0.049431107042371025,
"acc_norm": 0.41,
"acc_norm_stderr": 0.049431107042371025
},
"hendrycksTest-conceptual_physics": {
"acc": 0.33191489361702126,
"acc_stderr": 0.03078373675774565,
"acc_norm": 0.33191489361702126,
"acc_norm_stderr": 0.03078373675774565
},
"hendrycksTest-econometrics": {
"acc": 0.2807017543859649,
"acc_stderr": 0.042270544512322004,
"acc_norm": 0.2807017543859649,
"acc_norm_stderr": 0.042270544512322004
},
"hendrycksTest-electrical_engineering": {
"acc": 0.3310344827586207,
"acc_stderr": 0.03921545312467122,
"acc_norm": 0.3310344827586207,
"acc_norm_stderr": 0.03921545312467122
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2698412698412698,
"acc_stderr": 0.022860838309232072,
"acc_norm": 0.2698412698412698,
"acc_norm_stderr": 0.022860838309232072
},
"hendrycksTest-formal_logic": {
"acc": 0.3333333333333333,
"acc_stderr": 0.042163702135578345,
"acc_norm": 0.3333333333333333,
"acc_norm_stderr": 0.042163702135578345
},
"hendrycksTest-global_facts": {
"acc": 0.34,
"acc_stderr": 0.04760952285695235,
"acc_norm": 0.34,
"acc_norm_stderr": 0.04760952285695235
},
"hendrycksTest-high_school_biology": {
"acc": 0.25161290322580643,
"acc_stderr": 0.024685979286239938,
"acc_norm": 0.25161290322580643,
"acc_norm_stderr": 0.024685979286239938
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.270935960591133,
"acc_stderr": 0.031270907132976984,
"acc_norm": 0.270935960591133,
"acc_norm_stderr": 0.031270907132976984
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.29,
"acc_stderr": 0.045604802157206845,
"acc_norm": 0.29,
"acc_norm_stderr": 0.045604802157206845
},
"hendrycksTest-high_school_european_history": {
"acc": 0.3151515151515151,
"acc_stderr": 0.0362773057502241,
"acc_norm": 0.3151515151515151,
"acc_norm_stderr": 0.0362773057502241
},
"hendrycksTest-high_school_geography": {
"acc": 0.3181818181818182,
"acc_stderr": 0.03318477333845331,
"acc_norm": 0.3181818181818182,
"acc_norm_stderr": 0.03318477333845331
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.24352331606217617,
"acc_stderr": 0.030975436386845426,
"acc_norm": 0.24352331606217617,
"acc_norm_stderr": 0.030975436386845426
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.2512820512820513,
"acc_stderr": 0.021992016662370554,
"acc_norm": 0.2512820512820513,
"acc_norm_stderr": 0.021992016662370554
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.22962962962962963,
"acc_stderr": 0.02564410863926763,
"acc_norm": 0.22962962962962963,
"acc_norm_stderr": 0.02564410863926763
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.29411764705882354,
"acc_stderr": 0.029597329730978103,
"acc_norm": 0.29411764705882354,
"acc_norm_stderr": 0.029597329730978103
},
"hendrycksTest-high_school_physics": {
"acc": 0.2847682119205298,
"acc_stderr": 0.03684881521389023,
"acc_norm": 0.2847682119205298,
"acc_norm_stderr": 0.03684881521389023
},
"hendrycksTest-high_school_psychology": {
"acc": 0.3192660550458716,
"acc_stderr": 0.01998782906975001,
"acc_norm": 0.3192660550458716,
"acc_norm_stderr": 0.01998782906975001
},
"hendrycksTest-high_school_statistics": {
"acc": 0.2916666666666667,
"acc_stderr": 0.030998666304560524,
"acc_norm": 0.2916666666666667,
"acc_norm_stderr": 0.030998666304560524
},
"hendrycksTest-high_school_us_history": {
"acc": 0.2647058823529412,
"acc_stderr": 0.03096451792692341,
"acc_norm": 0.2647058823529412,
"acc_norm_stderr": 0.03096451792692341
},
"hendrycksTest-high_school_world_history": {
"acc": 0.26582278481012656,
"acc_stderr": 0.028756799629658335,
"acc_norm": 0.26582278481012656,
"acc_norm_stderr": 0.028756799629658335
},
"hendrycksTest-human_aging": {
"acc": 0.2556053811659193,
"acc_stderr": 0.029275891003969927,
"acc_norm": 0.2556053811659193,
"acc_norm_stderr": 0.029275891003969927
},
"hendrycksTest-human_sexuality": {
"acc": 0.3053435114503817,
"acc_stderr": 0.04039314978724561,
"acc_norm": 0.3053435114503817,
"acc_norm_stderr": 0.04039314978724561
},
"hendrycksTest-international_law": {
"acc": 0.38016528925619836,
"acc_stderr": 0.04431324501968431,
"acc_norm": 0.38016528925619836,
"acc_norm_stderr": 0.04431324501968431
},
"hendrycksTest-jurisprudence": {
"acc": 0.3333333333333333,
"acc_stderr": 0.04557239513497751,
"acc_norm": 0.3333333333333333,
"acc_norm_stderr": 0.04557239513497751
},
"hendrycksTest-logical_fallacies": {
"acc": 0.2147239263803681,
"acc_stderr": 0.03226219377286774,
"acc_norm": 0.2147239263803681,
"acc_norm_stderr": 0.03226219377286774
},
"hendrycksTest-machine_learning": {
"acc": 0.26785714285714285,
"acc_stderr": 0.04203277291467764,
"acc_norm": 0.26785714285714285,
"acc_norm_stderr": 0.04203277291467764
},
"hendrycksTest-management": {
"acc": 0.2621359223300971,
"acc_stderr": 0.04354631077260597,
"acc_norm": 0.2621359223300971,
"acc_norm_stderr": 0.04354631077260597
},
"hendrycksTest-marketing": {
"acc": 0.34615384615384615,
"acc_stderr": 0.031166957367235897,
"acc_norm": 0.34615384615384615,
"acc_norm_stderr": 0.031166957367235897
},
"hendrycksTest-medical_genetics": {
"acc": 0.3,
"acc_stderr": 0.046056618647183814,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-miscellaneous": {
"acc": 0.3269476372924649,
"acc_stderr": 0.016774908180131463,
"acc_norm": 0.3269476372924649,
"acc_norm_stderr": 0.016774908180131463
},
"hendrycksTest-moral_disputes": {
"acc": 0.32947976878612717,
"acc_stderr": 0.0253052581318797,
"acc_norm": 0.32947976878612717,
"acc_norm_stderr": 0.0253052581318797
},
"hendrycksTest-moral_scenarios": {
"acc": 0.24134078212290502,
"acc_stderr": 0.014310999547961455,
"acc_norm": 0.24134078212290502,
"acc_norm_stderr": 0.014310999547961455
},
"hendrycksTest-nutrition": {
"acc": 0.33986928104575165,
"acc_stderr": 0.027121956071388856,
"acc_norm": 0.33986928104575165,
"acc_norm_stderr": 0.027121956071388856
},
"hendrycksTest-philosophy": {
"acc": 0.2861736334405145,
"acc_stderr": 0.025670259242188936,
"acc_norm": 0.2861736334405145,
"acc_norm_stderr": 0.025670259242188936
},
"hendrycksTest-prehistory": {
"acc": 0.26851851851851855,
"acc_stderr": 0.024659685185967273,
"acc_norm": 0.26851851851851855,
"acc_norm_stderr": 0.024659685185967273
},
"hendrycksTest-professional_accounting": {
"acc": 0.30141843971631205,
"acc_stderr": 0.02737412888263115,
"acc_norm": 0.30141843971631205,
"acc_norm_stderr": 0.02737412888263115
},
"hendrycksTest-professional_law": {
"acc": 0.2796610169491525,
"acc_stderr": 0.011463397393861947,
"acc_norm": 0.2796610169491525,
"acc_norm_stderr": 0.011463397393861947
},
"hendrycksTest-professional_medicine": {
"acc": 0.20588235294117646,
"acc_stderr": 0.024562204314142314,
"acc_norm": 0.20588235294117646,
"acc_norm_stderr": 0.024562204314142314
},
"hendrycksTest-professional_psychology": {
"acc": 0.2957516339869281,
"acc_stderr": 0.018463154132632806,
"acc_norm": 0.2957516339869281,
"acc_norm_stderr": 0.018463154132632806
},
"hendrycksTest-public_relations": {
"acc": 0.35454545454545455,
"acc_stderr": 0.04582004841505416,
"acc_norm": 0.35454545454545455,
"acc_norm_stderr": 0.04582004841505416
},
"hendrycksTest-security_studies": {
"acc": 0.31020408163265306,
"acc_stderr": 0.029613459872484378,
"acc_norm": 0.31020408163265306,
"acc_norm_stderr": 0.029613459872484378
},
"hendrycksTest-sociology": {
"acc": 0.31343283582089554,
"acc_stderr": 0.032801882053486435,
"acc_norm": 0.31343283582089554,
"acc_norm_stderr": 0.032801882053486435
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.35,
"acc_stderr": 0.0479372485441102,
"acc_norm": 0.35,
"acc_norm_stderr": 0.0479372485441102
},
"hendrycksTest-virology": {
"acc": 0.3674698795180723,
"acc_stderr": 0.03753267402120574,
"acc_norm": 0.3674698795180723,
"acc_norm_stderr": 0.03753267402120574
},
"hendrycksTest-world_religions": {
"acc": 0.36257309941520466,
"acc_stderr": 0.0368713061556206,
"acc_norm": 0.36257309941520466,
"acc_norm_stderr": 0.0368713061556206
}
},
"versions": {
"hendrycksTest-abstract_algebra": 1,
"hendrycksTest-anatomy": 1,
"hendrycksTest-astronomy": 1,
"hendrycksTest-business_ethics": 1,
"hendrycksTest-clinical_knowledge": 1,
"hendrycksTest-college_biology": 1,
"hendrycksTest-college_chemistry": 1,
"hendrycksTest-college_computer_science": 1,
"hendrycksTest-college_mathematics": 1,
"hendrycksTest-college_medicine": 1,
"hendrycksTest-college_physics": 1,
"hendrycksTest-computer_security": 1,
"hendrycksTest-conceptual_physics": 1,
"hendrycksTest-econometrics": 1,
"hendrycksTest-electrical_engineering": 1,
"hendrycksTest-elementary_mathematics": 1,
"hendrycksTest-formal_logic": 1,
"hendrycksTest-global_facts": 1,
"hendrycksTest-high_school_biology": 1,
"hendrycksTest-high_school_chemistry": 1,
"hendrycksTest-high_school_computer_science": 1,
"hendrycksTest-high_school_european_history": 1,
"hendrycksTest-high_school_geography": 1,
"hendrycksTest-high_school_government_and_politics": 1,
"hendrycksTest-high_school_macroeconomics": 1,
"hendrycksTest-high_school_mathematics": 1,
"hendrycksTest-high_school_microeconomics": 1,
"hendrycksTest-high_school_physics": 1,
"hendrycksTest-high_school_psychology": 1,
"hendrycksTest-high_school_statistics": 1,
"hendrycksTest-high_school_us_history": 1,
"hendrycksTest-high_school_world_history": 1,
"hendrycksTest-human_aging": 1,
"hendrycksTest-human_sexuality": 1,
"hendrycksTest-international_law": 1,
"hendrycksTest-jurisprudence": 1,
"hendrycksTest-logical_fallacies": 1,
"hendrycksTest-machine_learning": 1,
"hendrycksTest-management": 1,
"hendrycksTest-marketing": 1,
"hendrycksTest-medical_genetics": 1,
"hendrycksTest-miscellaneous": 1,
"hendrycksTest-moral_disputes": 1,
"hendrycksTest-moral_scenarios": 1,
"hendrycksTest-nutrition": 1,
"hendrycksTest-philosophy": 1,
"hendrycksTest-prehistory": 1,
"hendrycksTest-professional_accounting": 1,
"hendrycksTest-professional_law": 1,
"hendrycksTest-professional_medicine": 1,
"hendrycksTest-professional_psychology": 1,
"hendrycksTest-public_relations": 1,
"hendrycksTest-security_studies": 1,
"hendrycksTest-sociology": 1,
"hendrycksTest-us_foreign_policy": 1,
"hendrycksTest-virology": 1,
"hendrycksTest-world_religions": 1
},
"config": {
"model": "hf-causal",
"model_args": "pretrained=workdir_7b/ckpt_352",
"num_fewshot": 5,
"batch_size": "8",
"batch_sizes": [],
"device": null,
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
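
These per-task entries follow the EleutherAI lm-evaluation-harness output schema: each "hendrycksTest-*" task reports acc and acc_norm together with bootstrap standard errors, and the "config" block records the run settings (hf-causal model, checkpoint workdir_7b/ckpt_352, 5-shot). Below is a minimal Python sketch, not part of the original file, for loading a results file like this one and computing the unweighted macro-average accuracy; the filename eval_mmlu.json is an assumption about where the JSON above is saved locally.

import json
from statistics import mean

# Path is an assumption; point it at your local copy of the results file.
with open("eval_mmlu.json") as f:
    data = json.load(f)

# Every entry under "results" carries acc, acc_norm, and their stderrs.
accs = [task["acc"] for task in data["results"].values()]
norm_accs = [task["acc_norm"] for task in data["results"].values()]

print(f"Tasks evaluated: {len(accs)}")
print(f"Unweighted macro-average acc:      {mean(accs):.4f}")
print(f"Unweighted macro-average acc_norm: {mean(norm_accs):.4f}")

Note that this is a simple per-task (macro) average; it weights every subject equally rather than by question count, which is how headline MMLU numbers are often reported for harness outputs of this shape.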