Voyage / results_mmlu.json
gagan3012's picture
Upload folder using huggingface_hub
bbfe8db verified
{
"results": {
"mmlu": {
"acc,none": 0.5959977211223473,
"acc_stderr,none": 0.003940274143686019,
"alias": "mmlu"
},
"mmlu_humanities": {
"alias": " - humanities",
"acc,none": 0.5598299681190223,
"acc_stderr,none": 0.006857746834862335
},
"mmlu_formal_logic": {
"alias": " - formal_logic",
"acc,none": 0.40476190476190477,
"acc_stderr,none": 0.043902592653775614
},
"mmlu_high_school_european_history": {
"alias": " - high_school_european_history",
"acc,none": 0.7272727272727273,
"acc_stderr,none": 0.0347769116216366
},
"mmlu_high_school_us_history": {
"alias": " - high_school_us_history",
"acc,none": 0.7794117647058824,
"acc_stderr,none": 0.029102254389674093
},
"mmlu_high_school_world_history": {
"alias": " - high_school_world_history",
"acc,none": 0.7679324894514767,
"acc_stderr,none": 0.02747974455080852
},
"mmlu_international_law": {
"alias": " - international_law",
"acc,none": 0.7933884297520661,
"acc_stderr,none": 0.03695980128098824
},
"mmlu_jurisprudence": {
"alias": " - jurisprudence",
"acc,none": 0.7314814814814815,
"acc_stderr,none": 0.042844679680521934
},
"mmlu_logical_fallacies": {
"alias": " - logical_fallacies",
"acc,none": 0.7177914110429447,
"acc_stderr,none": 0.03536117886664743
},
"mmlu_moral_disputes": {
"alias": " - moral_disputes",
"acc,none": 0.684971098265896,
"acc_stderr,none": 0.0250093137900697
},
"mmlu_moral_scenarios": {
"alias": " - moral_scenarios",
"acc,none": 0.3888268156424581,
"acc_stderr,none": 0.01630389953079613
},
"mmlu_philosophy": {
"alias": " - philosophy",
"acc,none": 0.6913183279742765,
"acc_stderr,none": 0.026236965881153256
},
"mmlu_prehistory": {
"alias": " - prehistory",
"acc,none": 0.6851851851851852,
"acc_stderr,none": 0.025842248700902175
},
"mmlu_professional_law": {
"alias": " - professional_law",
"acc,none": 0.4335071707953064,
"acc_stderr,none": 0.012656810383983964
},
"mmlu_world_religions": {
"alias": " - world_religions",
"acc,none": 0.8362573099415205,
"acc_stderr,none": 0.028380919596145866
},
"mmlu_other": {
"alias": " - other",
"acc,none": 0.657547473447055,
"acc_stderr,none": 0.008223099667026796
},
"mmlu_business_ethics": {
"alias": " - business_ethics",
"acc,none": 0.56,
"acc_stderr,none": 0.04988876515698589
},
"mmlu_clinical_knowledge": {
"alias": " - clinical_knowledge",
"acc,none": 0.660377358490566,
"acc_stderr,none": 0.02914690474779833
},
"mmlu_college_medicine": {
"alias": " - college_medicine",
"acc,none": 0.5895953757225434,
"acc_stderr,none": 0.03750757044895536
},
"mmlu_global_facts": {
"alias": " - global_facts",
"acc,none": 0.37,
"acc_stderr,none": 0.048523658709391
},
"mmlu_human_aging": {
"alias": " - human_aging",
"acc,none": 0.6098654708520179,
"acc_stderr,none": 0.03273766725459156
},
"mmlu_management": {
"alias": " - management",
"acc,none": 0.7378640776699029,
"acc_stderr,none": 0.04354631077260595
},
"mmlu_marketing": {
"alias": " - marketing",
"acc,none": 0.8589743589743589,
"acc_stderr,none": 0.022801382534597518
},
"mmlu_medical_genetics": {
"alias": " - medical_genetics",
"acc,none": 0.67,
"acc_stderr,none": 0.04725815626252607
},
"mmlu_miscellaneous": {
"alias": " - miscellaneous",
"acc,none": 0.7752234993614304,
"acc_stderr,none": 0.014927447101937157
},
"mmlu_nutrition": {
"alias": " - nutrition",
"acc,none": 0.6928104575163399,
"acc_stderr,none": 0.026415601914389002
},
"mmlu_professional_accounting": {
"alias": " - professional_accounting",
"acc,none": 0.4432624113475177,
"acc_stderr,none": 0.029634838473766006
},
"mmlu_professional_medicine": {
"alias": " - professional_medicine",
"acc,none": 0.6102941176470589,
"acc_stderr,none": 0.02962466358115969
},
"mmlu_virology": {
"alias": " - virology",
"acc,none": 0.5,
"acc_stderr,none": 0.03892494720807614
},
"mmlu_social_sciences": {
"alias": " - social_sciences",
"acc,none": 0.693532661683458,
"acc_stderr,none": 0.008120210287270358
},
"mmlu_econometrics": {
"alias": " - econometrics",
"acc,none": 0.40350877192982454,
"acc_stderr,none": 0.046151869625837026
},
"mmlu_high_school_geography": {
"alias": " - high_school_geography",
"acc,none": 0.7626262626262627,
"acc_stderr,none": 0.030313710538198892
},
"mmlu_high_school_government_and_politics": {
"alias": " - high_school_government_and_politics",
"acc,none": 0.8393782383419689,
"acc_stderr,none": 0.02649905770139744
},
"mmlu_high_school_macroeconomics": {
"alias": " - high_school_macroeconomics",
"acc,none": 0.5666666666666667,
"acc_stderr,none": 0.025124653525885124
},
"mmlu_high_school_microeconomics": {
"alias": " - high_school_microeconomics",
"acc,none": 0.6470588235294118,
"acc_stderr,none": 0.031041941304059278
},
"mmlu_high_school_psychology": {
"alias": " - high_school_psychology",
"acc,none": 0.7926605504587156,
"acc_stderr,none": 0.017381415563608667
},
"mmlu_human_sexuality": {
"alias": " - human_sexuality",
"acc,none": 0.7480916030534351,
"acc_stderr,none": 0.03807387116306086
},
"mmlu_professional_psychology": {
"alias": " - professional_psychology",
"acc,none": 0.6290849673202614,
"acc_stderr,none": 0.01954210156485412
},
"mmlu_public_relations": {
"alias": " - public_relations",
"acc,none": 0.7272727272727273,
"acc_stderr,none": 0.04265792110940588
},
"mmlu_security_studies": {
"alias": " - security_studies",
"acc,none": 0.710204081632653,
"acc_stderr,none": 0.02904308868330433
},
"mmlu_sociology": {
"alias": " - sociology",
"acc,none": 0.736318407960199,
"acc_stderr,none": 0.031157150869355568
},
"mmlu_us_foreign_policy": {
"alias": " - us_foreign_policy",
"acc,none": 0.83,
"acc_stderr,none": 0.0377525168068637
},
"mmlu_stem": {
"alias": " - stem",
"acc,none": 0.4941325721535046,
"acc_stderr,none": 0.008646286993166217
},
"mmlu_abstract_algebra": {
"alias": " - abstract_algebra",
"acc,none": 0.3,
"acc_stderr,none": 0.046056618647183814
},
"mmlu_anatomy": {
"alias": " - anatomy",
"acc,none": 0.5777777777777777,
"acc_stderr,none": 0.04266763404099582
},
"mmlu_astronomy": {
"alias": " - astronomy",
"acc,none": 0.6381578947368421,
"acc_stderr,none": 0.03910525752849724
},
"mmlu_college_biology": {
"alias": " - college_biology",
"acc,none": 0.6805555555555556,
"acc_stderr,none": 0.03899073687357335
},
"mmlu_college_chemistry": {
"alias": " - college_chemistry",
"acc,none": 0.41,
"acc_stderr,none": 0.049431107042371025
},
"mmlu_college_computer_science": {
"alias": " - college_computer_science",
"acc,none": 0.51,
"acc_stderr,none": 0.05024183937956912
},
"mmlu_college_mathematics": {
"alias": " - college_mathematics",
"acc,none": 0.42,
"acc_stderr,none": 0.04960449637488584
},
"mmlu_college_physics": {
"alias": " - college_physics",
"acc,none": 0.38235294117647056,
"acc_stderr,none": 0.04835503696107224
},
"mmlu_computer_security": {
"alias": " - computer_security",
"acc,none": 0.67,
"acc_stderr,none": 0.04725815626252609
},
"mmlu_conceptual_physics": {
"alias": " - conceptual_physics",
"acc,none": 0.5404255319148936,
"acc_stderr,none": 0.03257901482099835
},
"mmlu_electrical_engineering": {
"alias": " - electrical_engineering",
"acc,none": 0.5862068965517241,
"acc_stderr,none": 0.04104269211806232
},
"mmlu_elementary_mathematics": {
"alias": " - elementary_mathematics",
"acc,none": 0.3915343915343915,
"acc_stderr,none": 0.025138091388851112
},
"mmlu_high_school_biology": {
"alias": " - high_school_biology",
"acc,none": 0.6709677419354839,
"acc_stderr,none": 0.026729499068349954
},
"mmlu_high_school_chemistry": {
"alias": " - high_school_chemistry",
"acc,none": 0.5221674876847291,
"acc_stderr,none": 0.035145285621750094
},
"mmlu_high_school_computer_science": {
"alias": " - high_school_computer_science",
"acc,none": 0.57,
"acc_stderr,none": 0.049756985195624284
},
"mmlu_high_school_mathematics": {
"alias": " - high_school_mathematics",
"acc,none": 0.2962962962962963,
"acc_stderr,none": 0.027840811495871927
},
"mmlu_high_school_physics": {
"alias": " - high_school_physics",
"acc,none": 0.33774834437086093,
"acc_stderr,none": 0.03861557546255169
},
"mmlu_high_school_statistics": {
"alias": " - high_school_statistics",
"acc,none": 0.4861111111111111,
"acc_stderr,none": 0.03408655867977748
},
"mmlu_machine_learning": {
"alias": " - machine_learning",
"acc,none": 0.42857142857142855,
"acc_stderr,none": 0.04697113923010213
}
},
"groups": {
"mmlu": {
"acc,none": 0.5959977211223473,
"acc_stderr,none": 0.003940274143686019,
"alias": "mmlu"
},
"mmlu_humanities": {
"alias": " - humanities",
"acc,none": 0.5598299681190223,
"acc_stderr,none": 0.006857746834862335
},
"mmlu_other": {
"alias": " - other",
"acc,none": 0.657547473447055,
"acc_stderr,none": 0.008223099667026796
},
"mmlu_social_sciences": {
"alias": " - social_sciences",
"acc,none": 0.693532661683458,
"acc_stderr,none": 0.008120210287270358
},
"mmlu_stem": {
"alias": " - stem",
"acc,none": 0.4941325721535046,
"acc_stderr,none": 0.008646286993166217
}
},
"configs": {
"mmlu_abstract_algebra": {
"task": "mmlu_abstract_algebra",
"task_alias": "abstract_algebra",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "abstract_algebra",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_anatomy": {
"task": "mmlu_anatomy",
"task_alias": "anatomy",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "anatomy",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_astronomy": {
"task": "mmlu_astronomy",
"task_alias": "astronomy",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "astronomy",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_business_ethics": {
"task": "mmlu_business_ethics",
"task_alias": "business_ethics",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "business_ethics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_clinical_knowledge": {
"task": "mmlu_clinical_knowledge",
"task_alias": "clinical_knowledge",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "clinical_knowledge",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_biology": {
"task": "mmlu_college_biology",
"task_alias": "college_biology",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "college_biology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college biology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_chemistry": {
"task": "mmlu_college_chemistry",
"task_alias": "college_chemistry",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "college_chemistry",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_computer_science": {
"task": "mmlu_college_computer_science",
"task_alias": "college_computer_science",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "college_computer_science",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_mathematics": {
"task": "mmlu_college_mathematics",
"task_alias": "college_mathematics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "college_mathematics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_medicine": {
"task": "mmlu_college_medicine",
"task_alias": "college_medicine",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "college_medicine",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_college_physics": {
"task": "mmlu_college_physics",
"task_alias": "college_physics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "college_physics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about college physics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_computer_security": {
"task": "mmlu_computer_security",
"task_alias": "computer_security",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "computer_security",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about computer security.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_conceptual_physics": {
"task": "mmlu_conceptual_physics",
"task_alias": "conceptual_physics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "conceptual_physics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_econometrics": {
"task": "mmlu_econometrics",
"task_alias": "econometrics",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "econometrics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_electrical_engineering": {
"task": "mmlu_electrical_engineering",
"task_alias": "electrical_engineering",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "electrical_engineering",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_elementary_mathematics": {
"task": "mmlu_elementary_mathematics",
"task_alias": "elementary_mathematics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "elementary_mathematics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_formal_logic": {
"task": "mmlu_formal_logic",
"task_alias": "formal_logic",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "formal_logic",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_global_facts": {
"task": "mmlu_global_facts",
"task_alias": "global_facts",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "global_facts",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about global facts.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_biology": {
"task": "mmlu_high_school_biology",
"task_alias": "high_school_biology",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_biology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_chemistry": {
"task": "mmlu_high_school_chemistry",
"task_alias": "high_school_chemistry",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_chemistry",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_computer_science": {
"task": "mmlu_high_school_computer_science",
"task_alias": "high_school_computer_science",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_computer_science",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_european_history": {
"task": "mmlu_high_school_european_history",
"task_alias": "high_school_european_history",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_european_history",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_geography": {
"task": "mmlu_high_school_geography",
"task_alias": "high_school_geography",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_geography",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_government_and_politics": {
"task": "mmlu_high_school_government_and_politics",
"task_alias": "high_school_government_and_politics",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_government_and_politics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_macroeconomics": {
"task": "mmlu_high_school_macroeconomics",
"task_alias": "high_school_macroeconomics",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_macroeconomics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_mathematics": {
"task": "mmlu_high_school_mathematics",
"task_alias": "high_school_mathematics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_mathematics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_microeconomics": {
"task": "mmlu_high_school_microeconomics",
"task_alias": "high_school_microeconomics",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_microeconomics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_physics": {
"task": "mmlu_high_school_physics",
"task_alias": "high_school_physics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_physics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_psychology": {
"task": "mmlu_high_school_psychology",
"task_alias": "high_school_psychology",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_psychology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_statistics": {
"task": "mmlu_high_school_statistics",
"task_alias": "high_school_statistics",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_statistics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_us_history": {
"task": "mmlu_high_school_us_history",
"task_alias": "high_school_us_history",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_us_history",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_high_school_world_history": {
"task": "mmlu_high_school_world_history",
"task_alias": "high_school_world_history",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "high_school_world_history",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_human_aging": {
"task": "mmlu_human_aging",
"task_alias": "human_aging",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "human_aging",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about human aging.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_human_sexuality": {
"task": "mmlu_human_sexuality",
"task_alias": "human_sexuality",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "human_sexuality",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_international_law": {
"task": "mmlu_international_law",
"task_alias": "international_law",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "international_law",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about international law.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_jurisprudence": {
"task": "mmlu_jurisprudence",
"task_alias": "jurisprudence",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "jurisprudence",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_logical_fallacies": {
"task": "mmlu_logical_fallacies",
"task_alias": "logical_fallacies",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "logical_fallacies",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_machine_learning": {
"task": "mmlu_machine_learning",
"task_alias": "machine_learning",
"group": "mmlu_stem",
"group_alias": "stem",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "machine_learning",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_management": {
"task": "mmlu_management",
"task_alias": "management",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "management",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about management.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_marketing": {
"task": "mmlu_marketing",
"task_alias": "marketing",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "marketing",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about marketing.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_medical_genetics": {
"task": "mmlu_medical_genetics",
"task_alias": "medical_genetics",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "medical_genetics",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_miscellaneous": {
"task": "mmlu_miscellaneous",
"task_alias": "miscellaneous",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "miscellaneous",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_moral_disputes": {
"task": "mmlu_moral_disputes",
"task_alias": "moral_disputes",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "moral_disputes",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_moral_scenarios": {
"task": "mmlu_moral_scenarios",
"task_alias": "moral_scenarios",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "moral_scenarios",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_nutrition": {
"task": "mmlu_nutrition",
"task_alias": "nutrition",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "nutrition",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_philosophy": {
"task": "mmlu_philosophy",
"task_alias": "philosophy",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "philosophy",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_prehistory": {
"task": "mmlu_prehistory",
"task_alias": "prehistory",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "prehistory",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_professional_accounting": {
"task": "mmlu_professional_accounting",
"task_alias": "professional_accounting",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "professional_accounting",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_professional_law": {
"task": "mmlu_professional_law",
"task_alias": "professional_law",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "professional_law",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about professional law.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_professional_medicine": {
"task": "mmlu_professional_medicine",
"task_alias": "professional_medicine",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "professional_medicine",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_professional_psychology": {
"task": "mmlu_professional_psychology",
"task_alias": "professional_psychology",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "professional_psychology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_public_relations": {
"task": "mmlu_public_relations",
"task_alias": "public_relations",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "public_relations",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about public relations.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_security_studies": {
"task": "mmlu_security_studies",
"task_alias": "security_studies",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "security_studies",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about security studies.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_sociology": {
"task": "mmlu_sociology",
"task_alias": "sociology",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "sociology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about sociology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_us_foreign_policy": {
"task": "mmlu_us_foreign_policy",
"task_alias": "us_foreign_policy",
"group": "mmlu_social_sciences",
"group_alias": "social_sciences",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "us_foreign_policy",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_virology": {
"task": "mmlu_virology",
"task_alias": "virology",
"group": "mmlu_other",
"group_alias": "other",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "virology",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about virology.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
},
"mmlu_world_religions": {
"task": "mmlu_world_religions",
"task_alias": "world_religions",
"group": "mmlu_humanities",
"group_alias": "humanities",
"dataset_path": "/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/eval/mmlu_no_train",
"dataset_name": "world_religions",
"test_split": "test",
"fewshot_split": "dev",
"doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
"doc_to_target": "answer",
"doc_to_choice": [
"A",
"B",
"C",
"D"
],
"description": "The following are multiple choice questions (with answers) about world religions.\n\n",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"fewshot_config": {
"sampler": "first_n"
},
"num_fewshot": 5,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 0.0
}
}
},
"versions": {
"mmlu": "N/A",
"mmlu_abstract_algebra": 0.0,
"mmlu_anatomy": 0.0,
"mmlu_astronomy": 0.0,
"mmlu_business_ethics": 0.0,
"mmlu_clinical_knowledge": 0.0,
"mmlu_college_biology": 0.0,
"mmlu_college_chemistry": 0.0,
"mmlu_college_computer_science": 0.0,
"mmlu_college_mathematics": 0.0,
"mmlu_college_medicine": 0.0,
"mmlu_college_physics": 0.0,
"mmlu_computer_security": 0.0,
"mmlu_conceptual_physics": 0.0,
"mmlu_econometrics": 0.0,
"mmlu_electrical_engineering": 0.0,
"mmlu_elementary_mathematics": 0.0,
"mmlu_formal_logic": 0.0,
"mmlu_global_facts": 0.0,
"mmlu_high_school_biology": 0.0,
"mmlu_high_school_chemistry": 0.0,
"mmlu_high_school_computer_science": 0.0,
"mmlu_high_school_european_history": 0.0,
"mmlu_high_school_geography": 0.0,
"mmlu_high_school_government_and_politics": 0.0,
"mmlu_high_school_macroeconomics": 0.0,
"mmlu_high_school_mathematics": 0.0,
"mmlu_high_school_microeconomics": 0.0,
"mmlu_high_school_physics": 0.0,
"mmlu_high_school_psychology": 0.0,
"mmlu_high_school_statistics": 0.0,
"mmlu_high_school_us_history": 0.0,
"mmlu_high_school_world_history": 0.0,
"mmlu_human_aging": 0.0,
"mmlu_human_sexuality": 0.0,
"mmlu_humanities": "N/A",
"mmlu_international_law": 0.0,
"mmlu_jurisprudence": 0.0,
"mmlu_logical_fallacies": 0.0,
"mmlu_machine_learning": 0.0,
"mmlu_management": 0.0,
"mmlu_marketing": 0.0,
"mmlu_medical_genetics": 0.0,
"mmlu_miscellaneous": 0.0,
"mmlu_moral_disputes": 0.0,
"mmlu_moral_scenarios": 0.0,
"mmlu_nutrition": 0.0,
"mmlu_other": "N/A",
"mmlu_philosophy": 0.0,
"mmlu_prehistory": 0.0,
"mmlu_professional_accounting": 0.0,
"mmlu_professional_law": 0.0,
"mmlu_professional_medicine": 0.0,
"mmlu_professional_psychology": 0.0,
"mmlu_public_relations": 0.0,
"mmlu_security_studies": 0.0,
"mmlu_social_sciences": "N/A",
"mmlu_sociology": 0.0,
"mmlu_stem": "N/A",
"mmlu_us_foreign_policy": 0.0,
"mmlu_virology": 0.0,
"mmlu_world_religions": 0.0
},
"n-shot": {
"mmlu": 0,
"mmlu_abstract_algebra": 5,
"mmlu_anatomy": 5,
"mmlu_astronomy": 5,
"mmlu_business_ethics": 5,
"mmlu_clinical_knowledge": 5,
"mmlu_college_biology": 5,
"mmlu_college_chemistry": 5,
"mmlu_college_computer_science": 5,
"mmlu_college_mathematics": 5,
"mmlu_college_medicine": 5,
"mmlu_college_physics": 5,
"mmlu_computer_security": 5,
"mmlu_conceptual_physics": 5,
"mmlu_econometrics": 5,
"mmlu_electrical_engineering": 5,
"mmlu_elementary_mathematics": 5,
"mmlu_formal_logic": 5,
"mmlu_global_facts": 5,
"mmlu_high_school_biology": 5,
"mmlu_high_school_chemistry": 5,
"mmlu_high_school_computer_science": 5,
"mmlu_high_school_european_history": 5,
"mmlu_high_school_geography": 5,
"mmlu_high_school_government_and_politics": 5,
"mmlu_high_school_macroeconomics": 5,
"mmlu_high_school_mathematics": 5,
"mmlu_high_school_microeconomics": 5,
"mmlu_high_school_physics": 5,
"mmlu_high_school_psychology": 5,
"mmlu_high_school_statistics": 5,
"mmlu_high_school_us_history": 5,
"mmlu_high_school_world_history": 5,
"mmlu_human_aging": 5,
"mmlu_human_sexuality": 5,
"mmlu_humanities": 5,
"mmlu_international_law": 5,
"mmlu_jurisprudence": 5,
"mmlu_logical_fallacies": 5,
"mmlu_machine_learning": 5,
"mmlu_management": 5,
"mmlu_marketing": 5,
"mmlu_medical_genetics": 5,
"mmlu_miscellaneous": 5,
"mmlu_moral_disputes": 5,
"mmlu_moral_scenarios": 5,
"mmlu_nutrition": 5,
"mmlu_other": 5,
"mmlu_philosophy": 5,
"mmlu_prehistory": 5,
"mmlu_professional_accounting": 5,
"mmlu_professional_law": 5,
"mmlu_professional_medicine": 5,
"mmlu_professional_psychology": 5,
"mmlu_public_relations": 5,
"mmlu_security_studies": 5,
"mmlu_social_sciences": 5,
"mmlu_sociology": 5,
"mmlu_stem": 5,
"mmlu_us_foreign_policy": 5,
"mmlu_virology": 5,
"mmlu_world_religions": 5
},
"config": {
"model": "vllm",
"model_args": "pretrained=/lustre07/scratch/gagan30/arocr/meta-llama/self_rewarding_models/Voyage-dpo-1,tensor_parallel_size=1,dtype=auto,gpu_memory_utilization=0.9,data_parallel_size=1,max_model_len=4096",
"batch_size": "auto:128",
"batch_sizes": [],
"device": "cuda",
"use_cache": "/lustre07/scratch/gagan30/arocr/cache/",
"limit": null,
"bootstrap_iters": 100000,
"gen_kwargs": null
},
"git_hash": null
}