diff --git a/opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..3917660c570756510515a5c407c3181dfb481115 --- /dev/null +++ b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .MMLUArabic_gen_326684 import MMLUArabic_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen_326684.py b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen_326684.py new file mode 100644 index 0000000000000000000000000000000000000000..fe1475e777484d0d9814cafb29114794348fe722 --- /dev/null +++ b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen_326684.py @@ -0,0 +1,59 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUArabicDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +# None of the MMLUArabic dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic + +MMLUArabic_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] +MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 
'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم'] + +MMLUArabic_datasets = [] +for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar): + _system = f"فيما يلي أسئلة الاختيار من متعدد (مع الإجابات) حول {' '.join(_name_ar.split('_'))}" + _hint = '\n{input}' + MMLUArabic_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=_system), + '', + ], + round=[ + dict( + role='HUMAN', + prompt=_hint.format(input='سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}') + ), + dict(role='BOT', prompt='إجابة: {target}') + ]), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + MMLUArabic_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + MMLUArabic_datasets.append( + dict( + abbr=f'acegpt_MMLUArabic_{_name}', + type=MMLUArabicDataset, + path='./data/MMLUArabic/', + name=_name, + reader_cfg=MMLUArabic_reader_cfg, + infer_cfg=MMLUArabic_infer_cfg, + eval_cfg=MMLUArabic_eval_cfg, + )) + +del _name, _hint diff --git a/opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl.py b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0fc134107c9318c7467f7bd42a866b32fc96e5 --- /dev/null +++ b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .MMLUArabic_ppl_d2333a import MMLUArabic_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl_d2333a.py b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl_d2333a.py new file mode 100644 index 0000000000000000000000000000000000000000..6485c7f421142c2269dc14b2f4e2f6ed62270812 --- /dev/null +++ b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl_d2333a.py @@ -0,0 +1,51 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUArabicDataset + +# None of the MMLUArabic dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +MMLUArabic_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 
'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] +MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم'] + +MMLUArabic_datasets = [] +for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar): + # _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n' + _hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n" + # question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}' + question_overall = 'سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. 
{D}' + MMLUArabic_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template={opt: f'{question_overall}\nإجابة: {opt}\n' for opt in ['A', 'B', 'C', 'D']}, + ), + prompt_template=dict( + type=PromptTemplate, + template={opt: f'{_hint}{question_overall}\nإجابة: {opt}' for opt in ['A', 'B', 'C', 'D']}, + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=PPLInferencer), + ) + + MMLUArabic_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) + + MMLUArabic_datasets.append( + dict( + abbr=f'acegpt_MMLUArabic_{_name}', + type=MMLUArabicDataset, + path='./data/MMLUArabic/', + name=_name, + reader_cfg=MMLUArabic_reader_cfg, + infer_cfg=MMLUArabic_infer_cfg, + eval_cfg=MMLUArabic_eval_cfg, + )) + +del _name, _hint diff --git a/opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen.py b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..29dda9c0a77209b95fc9edbf4e2f0b438950786a --- /dev/null +++ b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .MMLUArabic_zero_shot_gen_3523e0 import MMLUArabic_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen_3523e0.py b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen_3523e0.py new file mode 100644 index 0000000000000000000000000000000000000000..efaf6ede10764bbab9de413119ef9393cd80b281 --- /dev/null +++ b/opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen_3523e0.py @@ -0,0 +1,53 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUArabicDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +# None of the MMLUArabic dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic + +MMLUArabic_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 
'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions'] +MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم'] + +MMLUArabic_datasets = [] +for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar): + _hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n" + '{input}\n' + "من فضلك اختر إجابة واحدة من بين 'A، B، C، D' دون شرح." + MMLUArabic_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=_hint.format(input='سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}') + ), + ]), + ice_token='', + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + MMLUArabic_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + + MMLUArabic_datasets.append( + dict( + abbr=f'acegpt_MMLUArabic_{_name}', + type=MMLUArabicDataset, + path='./data/MMLUArabic/', + name=_name, + reader_cfg=MMLUArabic_reader_cfg, + infer_cfg=MMLUArabic_infer_cfg, + eval_cfg=MMLUArabic_eval_cfg, + )) + +del _name, _hint diff --git a/opencompass/configs/datasets/MMLUArabic/README.md b/opencompass/configs/datasets/MMLUArabic/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ff16517f51295b69ace850b8c6b7b4edd68701a6 --- /dev/null +++ b/opencompass/configs/datasets/MMLUArabic/README.md @@ -0,0 +1,26 @@ +# MMLUArabic +## Dataset Description +MMLUArabic is a benchmark for the assessment of knowledge in Arabic and covers a wide range of topics and aspects, consisting of multiple-choice questions in various branches of knowledge. 
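The configs added alongside this README (`MMLUArabic_gen_326684.py`, `MMLUArabic_ppl_d2333a.py`, `MMLUArabic_zero_shot_gen_3523e0.py`) read the downloaded files from `./data/MMLUArabic/` and render each row into an Arabic multiple-choice prompt. The sketch below shows roughly what the 5-shot prompt built by the gen config looks like for one subject; the per-subject file name and the question/A/B/C/D/answer column order are assumptions about the downloaded data, not something defined in this PR.

```python
import csv

def format_example(row, include_answer=True):
    # Assumed column order: question, A, B, C, D, answer (MMLU-style CSV, no header row)
    question, a, b, c, d, answer = row[:6]
    text = f'سؤال: {question}\nA. {a}\nB. {b}\nC. {c}\nD. {d}\nإجابة:'
    if include_answer:
        text += f' {answer}'
    return text

# Assumed layout: one CSV per subject under dev/ and test/
with open('data/MMLUArabic/dev/abstract_algebra_dev.csv', encoding='utf-8') as f:
    dev_rows = list(csv.reader(f))

system = 'فيما يلي أسئلة الاختيار من متعدد (مع الإجابات) حول جبر تجريدي'
# FixKRetriever(fix_id_list=[0, 1, 2, 3, 4]) always takes the first five dev rows as in-context examples.
five_shot_prompt = system + '\n' + '\n'.join(format_example(r) for r in dev_rows[:5])
print(five_shot_prompt)
```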
+ + +## How to Use +Download file from [link](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic) + +```python +val_ds = load_dataset("MMLUArabic", header=None)['validation'] +test_ds = load_dataset("MMLUArabic", header=None)['test'] +# input, option_a, option_b, option_c, option_d, target +print(next(iter(val_ds))) +``` + +## Citation +``` +@misc{huang2023acegpt, + title={AceGPT, Localizing Large Language Models in Arabic}, + author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu}, + year={2023}, + eprint={2309.12053}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/opencompass/configs/datasets/QuALITY/QuALITY_gen.py b/opencompass/configs/datasets/QuALITY/QuALITY_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..8a9168ede2d349f526e3a3c4614c0385a1bf1d10 --- /dev/null +++ b/opencompass/configs/datasets/QuALITY/QuALITY_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .QuALITY_gen_c407cb import QuALITY_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/QuALITY/QuALITY_gen_c407cb.py b/opencompass/configs/datasets/QuALITY/QuALITY_gen_c407cb.py new file mode 100644 index 0000000000000000000000000000000000000000..9c841cc9424a243d323806aca8bcc41e0a59ec0a --- /dev/null +++ b/opencompass/configs/datasets/QuALITY/QuALITY_gen_c407cb.py @@ -0,0 +1,38 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import QuALITYDataset, QuALITYEvaluator +from opencompass.utils.text_postprocessors import first_option_postprocess + +QuALITY_reader_cfg = dict( + input_columns=['article', 'question', 'A', 'B', 'C', 'D'], + output_column='gold_label', +) + +QuALITY_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + 'Read the article, and answer the question.\n\nArticle:\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. 
{D}' + ), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +QuALITY_eval_cfg = dict( + evaluator=dict(type=QuALITYEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), + pred_role='BOT') + +QuALITY_datasets = [ + dict( + abbr='QuALITY', + type=QuALITYDataset, + path='./data/QuALITY/QuALITY.v1.0.1.htmlstripped.dev', + reader_cfg=QuALITY_reader_cfg, + infer_cfg=QuALITY_infer_cfg, + eval_cfg=QuALITY_eval_cfg), +] diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py new file mode 100644 index 0000000000000000000000000000000000000000..3ab8320bf4f94392fb67983180b61d61f432a25b --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py @@ -0,0 +1,55 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BoolQDatasetV2 +from opencompass.utils.text_postprocessors import ( + first_option_postprocess, +) + +QUERY_TEMPLATE = """ +Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering. + +Passage: {passage} + +Question: {question} + +A. Yes +B. NO + +""".strip() + +BoolQ_reader_cfg = dict( + input_columns=['question', 'passage'], + output_column='label', +) + +BoolQ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +BoolQ_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=first_option_postprocess, options='AB'), +) + +BoolQ_datasets = [ + dict( + abbr='BoolQ', + type=BoolQDatasetV2, + path='opencompass/boolq', + reader_cfg=BoolQ_reader_cfg, + infer_cfg=BoolQ_infer_cfg, + eval_cfg=BoolQ_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py new file mode 100644 index 0000000000000000000000000000000000000000..188b774a826fe562198e69460dc8e62acf963ae5 --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py @@ -0,0 +1,47 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BoolQDatasetV2 +from opencompass.utils.text_postprocessors import first_capital_postprocess + +BoolQ_reader_cfg = dict( + input_columns=['question', 'passage'], + output_column='label', +) + +BoolQ_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt='{passage}\nQuestion: {question}\nA. Yes\nB. 
No\nAnswer:', + ), + dict(role='BOT', prompt='{label}'), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]), + inferencer=dict(type=GenInferencer, max_out_len=50), +) + +BoolQ_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=first_capital_postprocess), +) + +BoolQ_datasets = [ + dict( + abbr='BoolQ', + type=BoolQDatasetV2, + path='opencompass/boolq', + reader_cfg=BoolQ_reader_cfg, + infer_cfg=BoolQ_infer_cfg, + eval_cfg=BoolQ_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_ppl.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_ppl.py new file mode 100644 index 0000000000000000000000000000000000000000..665289428af0aa3418d9682608b3a73b455544c6 --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_ppl.py @@ -0,0 +1,47 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BoolQDatasetV2 +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator + +BoolQ_reader_cfg = dict( + input_columns=['question', 'passage'], + output_column='label', +) + +BoolQ_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template={ + 'B': dict( + round=[ + dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'), + dict(role='BOT', prompt='No'), + ] + ), + 'A': dict( + round=[ + dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'), + dict(role='BOT', prompt='Yes'), + ] + ), + }, + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]), + inferencer=dict(type=PPLInferencer, max_out_len=50), +) + +BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +BoolQ_datasets = [ + dict( + abbr='BoolQ', + type=BoolQDatasetV2, + path='opencompass/boolq', + reader_cfg=BoolQ_reader_cfg, + infer_cfg=BoolQ_infer_cfg, + eval_cfg=BoolQ_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..83af4c0e9f37ee3244bc1d5156c3446f18844e67 --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py new file mode 100644 index 0000000000000000000000000000000000000000..78f368bda4e857e7d6b05f1628fe61772594908f --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py @@ -0,0 +1,41 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BoolQDatasetV2 +from opencompass.utils.text_postprocessors import first_capital_postprocess + +BoolQ_reader_cfg = dict( + input_columns=['question', 'passage'], + output_column='label', +) + +BoolQ_infer_cfg = dict( + 
prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:'), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +BoolQ_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=first_capital_postprocess), +) + +BoolQ_datasets = [ + dict( + abbr='BoolQ', + type=BoolQDatasetV2, + path='opencompass/boolq', + reader_cfg=BoolQ_reader_cfg, + infer_cfg=BoolQ_infer_cfg, + eval_cfg=BoolQ_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py new file mode 100644 index 0000000000000000000000000000000000000000..7f119adc917591bc165b03cc6e65583e5d77c783 --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .SuperGLUE_BoolQ_ppl_314b96 import BoolQ_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py new file mode 100644 index 0000000000000000000000000000000000000000..e24ea1e67a5dc1880c24706e81740a2797a1bbe8 --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BoolQDatasetV2 + +BoolQ_reader_cfg = dict( + input_columns=['question', 'passage'], + output_column='label', +) + +BoolQ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 'A': + dict(round=[ + dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'), + dict(role='BOT', prompt='Yes'), + ]), + 'B': + dict(round=[ + dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'), + dict(role='BOT', prompt='No'), + ]), + }, + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), +) + +BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +BoolQ_datasets = [ + dict( + abbr='BoolQ', + type=BoolQDatasetV2, + path='opencompass/boolq', + reader_cfg=BoolQ_reader_cfg, + infer_cfg=BoolQ_infer_cfg, + eval_cfg=BoolQ_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py new file mode 100644 index 0000000000000000000000000000000000000000..9e9c2ff04d10637d4e8ad49980b17b0fcd18faf1 --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BoolQDatasetV3 + +BoolQ_reader_cfg = dict( + input_columns=['question', 'passage'], + output_column='label', + test_split='train') + +BoolQ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 'false': + dict(round=[ + dict(role='HUMAN', prompt='Passage: {passage}\nQuestion: {question}?'), + 
dict(role='BOT', prompt='Answer: No'), + ]), + 'true': + dict(round=[ + dict(role='HUMAN', prompt='Passage: {passage}\nQuestion: {question}?'), + dict(role='BOT', prompt='Answer: Yes'), + ]), + }, + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), +) + +BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +BoolQ_datasets = [ + dict( + abbr='BoolQ', + type=BoolQDatasetV3, + path='opencompass/boolq', + reader_cfg=BoolQ_reader_cfg, + infer_cfg=BoolQ_infer_cfg, + eval_cfg=BoolQ_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py new file mode 100644 index 0000000000000000000000000000000000000000..0d610ba5940782bfcac374e7a62078f5ac66486b --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BoolQDataset + +BoolQ_reader_cfg = dict( + input_columns=['question', 'passage'], + output_column='answer', + test_split='train') + +BoolQ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 0: + dict(round=[ + dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'), + dict(role='BOT', prompt='No'), + ]), + 1: + dict(round=[ + dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'), + dict(role='BOT', prompt='Yes'), + ]), + }, + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), +) + +BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +BoolQ_datasets = [ + dict( + type=BoolQDataset, + abbr='BoolQ', + path='json', + data_files='opencompass/boolq', + split='train', + reader_cfg=BoolQ_reader_cfg, + infer_cfg=BoolQ_infer_cfg, + eval_cfg=BoolQ_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py new file mode 100644 index 0000000000000000000000000000000000000000..9af5893b5ab4b09f42aeac4f88bfe9eecd52d844 --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BoolQDataset + +BoolQ_reader_cfg = dict( + input_columns=['question', 'passage'], + output_column='answer', + test_split='train') + +BoolQ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 0: + dict(round=[ + dict(role='HUMAN', prompt='{passage}\nQuestion: {question}'), + dict(role='BOT', prompt='No.'), + ]), + 1: + dict(round=[ + dict(role='HUMAN', prompt='{passage}\nQuestion: {question}'), + dict(role='BOT', prompt='Yes.'), + ]), + }, + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), +) + +BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +BoolQ_datasets = [ + dict( + type=BoolQDataset, + abbr='BoolQ', + path='json', + data_files='opencompass/boolq', + split='train', + reader_cfg=BoolQ_reader_cfg, + infer_cfg=BoolQ_infer_cfg, + 
eval_cfg=BoolQ_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py new file mode 100644 index 0000000000000000000000000000000000000000..47dfb6cbbd4230a39fcec543371ad6e4b776de4f --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BoolQDataset + +BoolQ_reader_cfg = dict( + input_columns=['question', 'passage'], + output_column='answer', + test_split='train') + +BoolQ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 0: 'Passage:{passage}。\nQuestion:{question}。\nAnswer: No.', + 1: 'Passage:{passage}。\nQuestion:{question}。\nAnswer: Yes.', + }), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer)) + +BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +BoolQ_datasets = [ + dict( + type=BoolQDataset, + abbr='BoolQ', + path='json', + data_files='opencompass/boolq', + split='train', + reader_cfg=BoolQ_reader_cfg, + infer_cfg=BoolQ_infer_cfg, + eval_cfg=BoolQ_eval_cfg) +] diff --git a/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..4cc65e021760b1893658b1b339dd5015bc26967e --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .SuperGLUE_CB_gen_854c6c import CB_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py new file mode 100644 index 0000000000000000000000000000000000000000..65d3752d6ec7fa579b61a1881cdb596b19c38d49 --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import CBDatasetV2 +from opencompass.utils.text_postprocessors import first_option_postprocess + +CB_reader_cfg = dict( + input_columns=['premise', 'hypothesis'], + output_column='label', +) + +CB_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt= + '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?\nA. Contradiction\nB. Entailment\nC. 
Neutral\nAnswer:' + ), + ], ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +CB_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=first_option_postprocess, options='ABC'), +) + +CB_datasets = [ + dict( + abbr='CB', + type=CBDatasetV2, + path='./data/SuperGLUE/CB/val.jsonl', + reader_cfg=CB_reader_cfg, + infer_cfg=CB_infer_cfg, + eval_cfg=CB_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl.py b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl.py new file mode 100644 index 0000000000000000000000000000000000000000..9527f3bca35116c2b0e08a226ec52c874364bb2c --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .SuperGLUE_CB_ppl_0143fe import CB_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_0143fe.py b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_0143fe.py new file mode 100644 index 0000000000000000000000000000000000000000..8b04bd2ce6b6faf47d0974b0f8c3ff32c9473e33 --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_0143fe.py @@ -0,0 +1,62 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import HFDataset + +CB_reader_cfg = dict( + input_columns=['premise', 'hypothesis'], + output_column='label', +) + +CB_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 'contradiction': + dict(round=[ + dict( + role='HUMAN', + prompt= + '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?' + ), + dict(role='BOT', prompt='Contradiction'), + ]), + 'entailment': + dict(round=[ + dict( + role='HUMAN', + prompt= + '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?' + ), + dict(role='BOT', prompt='Entailment'), + ]), + 'neutral': + dict(round=[ + dict( + role='HUMAN', + prompt= + '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?' 
+ ), + dict(role='BOT', prompt='Neutral'), + ]), + }, + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), +) + +CB_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) + +CB_datasets = [ + dict( + type=HFDataset, + abbr='CB', + path='json', + split='train', + data_files='./data/SuperGLUE/CB/val.jsonl', + reader_cfg=CB_reader_cfg, + infer_cfg=CB_infer_cfg, + eval_cfg=CB_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_11c175.py b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_11c175.py new file mode 100644 index 0000000000000000000000000000000000000000..9ee3007d29adc7f0b02c57f5f919d08aa0f8f29d --- /dev/null +++ b/opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_11c175.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import HFDataset + +CB_reader_cfg = dict( + input_columns=['premise', 'hypothesis'], output_column='label') + +CB_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 'contradiction': '{premise}?contradiction, {hypothesis}', + 'entailment': '{premise}?entailment, {hypothesis}', + 'neutral': '{premise}?neutral, {hypothesis}' + }), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer)) + +CB_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) + +CB_datasets = [ + dict( + type=HFDataset, + abbr='CB', + path='json', + split='train', + data_files='./data/SuperGLUE/CB/val.jsonl', + reader_cfg=CB_reader_cfg, + infer_cfg=CB_infer_cfg, + eval_cfg=CB_eval_cfg) +] diff --git a/opencompass/configs/datasets/apps/README.md b/opencompass/configs/datasets/apps/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e8d2eb9444b619670cd0c59055e177341c107095 --- /dev/null +++ b/opencompass/configs/datasets/apps/README.md @@ -0,0 +1,43 @@ +# APPS +## Dataset Description +APPS is a benchmark for code generation with 10000 problems. It can be used to evaluate the ability of language models to generate code from natural language specifications. + +## Dataset Structure +```python +DatasetDict({ + train: Dataset({ + features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'], + num_rows: 5000 + }) + test: Dataset({ + features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'], + num_rows: 5000 + }) +}) +``` +We also offer an apps_mini subset, which includes 1500 questions divided proportionally of introductory, interview, and competition categories, with a ratio of 1:1:1(500 questions each). + +## How to Use +You can also filter the dataset based on difficulty level: introductory, interview and competition. Just pass a list of difficulty levels to the filter. 
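As a rough sketch of how the proportional `apps_mini` subset mentioned above could be assembled with this filter (the exact sampling used to build `./data/apps_mini` is not specified here, so the 500-per-level selection below is only an assumption):

```python
from datasets import load_dataset

# Assumption: roughly 500 problems per difficulty level, drawn from the test split.
mini_rows = []
for level in ['introductory', 'interview', 'competition']:
    ds = load_dataset('codeparrot/apps', split='test', difficulties=[level])
    mini_rows.extend(ds.select(range(500)))

print(len(mini_rows))  # 1500
```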
For example, if you want the most challenging questions, you need to select the competition level: +```python +ds = load_dataset("codeparrot/apps", split="train", difficulties=["competition"]) +print(next(iter(ds))["question"]) +``` +## Evaluation results + + +| dataset | metric | CodeLlama-7b-Python | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-sft-hf | +|-----------------------|----------|-------------|-------------|-------------|-------------| +| apps_mini | pass@1 | 1.3 | 0.7 | 7.1 | 9.3 | + +Please refer to Table 3 of [code llama](https://scontent-nrt1-2.xx.fbcdn.net/v/t39.2365-6/369856151_1754812304950972_1159666448927483931_n.pdf?_nc_cat=107&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=TxT1PKkNBZoAX8zMHbm&_nc_ht=scontent-nrt1-2.xx&oh=00_AfDmmQAPzqX1-QOKIDUV5lGKzaZqt0CZUVtxFjHtnh6ycQ&oe=65F5AF8F) for original results if needed. + +## Citation +``` +@article{hendrycksapps2021, + title={Measuring Coding Challenge Competence With APPS}, + author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt}, + journal={NeurIPS}, + year={2021} +} +``` diff --git a/opencompass/configs/datasets/apps/apps_gen.py b/opencompass/configs/datasets/apps/apps_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..810e27247ee39d2029c7dcc3aa70b39251c74cc9 --- /dev/null +++ b/opencompass/configs/datasets/apps/apps_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .apps_gen_c7893a import APPS_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/apps/apps_gen_c7893a.py b/opencompass/configs/datasets/apps/apps_gen_c7893a.py new file mode 100644 index 0000000000000000000000000000000000000000..4ca616c51d694e905d1e7f92cb42aa782d6cf1d4 --- /dev/null +++ b/opencompass/configs/datasets/apps/apps_gen_c7893a.py @@ -0,0 +1,28 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import APPSDataset, APPSEvaluator + +APPS_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test') + +APPS_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. 
\nQUESTION:\n{question} {starter}\nANSWER:\n'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +APPS_eval_cfg = dict(evaluator=dict(type=APPSEvaluator), pred_role='BOT') + +APPS_datasets = [ + dict( + type=APPSDataset, + abbr='apps', + path='codeparrot/apps', + num_repeats=1, + reader_cfg=APPS_reader_cfg, + infer_cfg=APPS_infer_cfg, + eval_cfg=APPS_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/apps/apps_mini_gen.py b/opencompass/configs/datasets/apps/apps_mini_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..3c537e4750a3c6a9a81413b2dcbe3e0f34bbfaaf --- /dev/null +++ b/opencompass/configs/datasets/apps/apps_mini_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .apps_mini_gen_c7893a import APPS_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/apps/apps_mini_gen_c7893a.py b/opencompass/configs/datasets/apps/apps_mini_gen_c7893a.py new file mode 100644 index 0000000000000000000000000000000000000000..71418d516d3f66ab8550a036700684c6cfe16d39 --- /dev/null +++ b/opencompass/configs/datasets/apps/apps_mini_gen_c7893a.py @@ -0,0 +1,28 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import APPS_miniDataset, APPSEvaluator + +APPS_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test') + +APPS_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. 
\nQUESTION:\n{question} {starter}\nANSWER:\n'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +APPS_eval_cfg = dict(evaluator=dict(type=APPSEvaluator), pred_role='BOT') + +APPS_mini_datasets = [ + dict( + type=APPS_miniDataset, + abbr='apps_mini', + path='./data/apps_mini', + num_repeats=1, + reader_cfg=APPS_reader_cfg, + infer_cfg=APPS_infer_cfg, + eval_cfg=APPS_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/apps/deprecated_apps_gen_5b4254.py b/opencompass/configs/datasets/apps/deprecated_apps_gen_5b4254.py new file mode 100644 index 0000000000000000000000000000000000000000..7e6efd0406fcf05a4213f9ac50f19580d658bbb5 --- /dev/null +++ b/opencompass/configs/datasets/apps/deprecated_apps_gen_5b4254.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess + +apps_reader_cfg = dict( + input_columns=['question'], output_column='problem_id', train_split='test') + +# TODO: allow empty output-column +apps_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Write a python program:\n{question}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +apps_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +apps_datasets = [ + dict( + type=HFDataset, + path='codeparrot/apps', + reader_cfg=apps_reader_cfg, + infer_cfg=apps_infer_cfg, + eval_cfg=apps_eval_cfg) +] diff --git a/opencompass/configs/datasets/apps/deprecated_apps_gen_7fbb95.py b/opencompass/configs/datasets/apps/deprecated_apps_gen_7fbb95.py new file mode 100644 index 0000000000000000000000000000000000000000..87043e8619138c53aee6be2a9dca3eeab6b5b1d3 --- /dev/null +++ b/opencompass/configs/datasets/apps/deprecated_apps_gen_7fbb95.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess + +apps_reader_cfg = dict( + input_columns=['question'], output_column='problem_id', train_split='test') + +# TODO: allow empty output-column +apps_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='Write a python program:'), + ], + round=[ + dict(role='HUMAN', prompt='{question}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +apps_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess), +) + +apps_datasets = [ + dict( + type=HFDataset, + path='codeparrot/apps', + reader_cfg=apps_reader_cfg, + infer_cfg=apps_infer_cfg, + eval_cfg=apps_eval_cfg) +] diff --git a/opencompass/configs/datasets/apps/deprecated_apps_gen_b4dee3.py b/opencompass/configs/datasets/apps/deprecated_apps_gen_b4dee3.py new file mode 100644 index 0000000000000000000000000000000000000000..1f8b67736580a0c845448f98e797c3670d4340d6 --- /dev/null +++ 
b/opencompass/configs/datasets/apps/deprecated_apps_gen_b4dee3.py @@ -0,0 +1,30 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess + +apps_reader_cfg = dict( + input_columns=['question'], output_column='problem_id', train_split='test') + +# TODO: allow empty output-column +apps_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{question}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +apps_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + k=[1, 10, 100], + pred_postprocessor=dict(type=humaneval_postprocess), +) + +apps_datasets = [ + dict( + type=HFDataset, + path='codeparrot/apps', + reader_cfg=apps_reader_cfg, + infer_cfg=apps_infer_cfg, + eval_cfg=apps_eval_cfg) +] diff --git a/opencompass/configs/datasets/gsm8k_contamination/gsm8k_contamination_ppl_ecdd22.py b/opencompass/configs/datasets/gsm8k_contamination/gsm8k_contamination_ppl_ecdd22.py new file mode 100644 index 0000000000000000000000000000000000000000..1d0c6468be4c78df56f81e38d3510f8a860d7194 --- /dev/null +++ b/opencompass/configs/datasets/gsm8k_contamination/gsm8k_contamination_ppl_ecdd22.py @@ -0,0 +1,57 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLOnlyInferencer +from opencompass.openicl.icl_evaluator import AveragePPLEvaluator +from opencompass.datasets import GSM8KDataset, GSM8KReferenceSkywork + +gsm8k_datasets = [] + +gsm8k_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='{question} {answer}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLOnlyInferencer), +) + +gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator)) + +for split in ['train', 'test']: + gsm8k_reader_cfg = dict( + input_columns=['question', 'answer'], + output_column=None, + train_split=split, + test_split=split, + ) + gsm8k_datasets.append( + dict( + abbr=f'gsm8k-{split}-ppl', + type=GSM8KDataset, + path='./data/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg) + ) + + +gsm8k_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='{text}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLOnlyInferencer), +) + +gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator)) + +gsm8k_reader_cfg = dict( + input_columns=['text'], + output_column=None, +) + +gsm8k_datasets.append( + dict( + abbr=f'gsm8k-ref-ppl', + type=GSM8KReferenceSkywork, + path='./data/gsm8k-extra/mock_gsm8k_test.jsonl', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg + ) +) diff --git a/opencompass/configs/datasets/lcsts/lcsts_gen.py b/opencompass/configs/datasets/lcsts/lcsts_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..228f574e7ee7d621dc903f2a18bc0e2edca949ff --- /dev/null +++ b/opencompass/configs/datasets/lcsts/lcsts_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .lcsts_gen_8ee1fe import lcsts_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/lcsts/lcsts_gen_8ee1fe.py b/opencompass/configs/datasets/lcsts/lcsts_gen_8ee1fe.py new file mode 100644 index 
0000000000000000000000000000000000000000..fb02f42f01d6013356bd2e4c83c965e3fc562d39 --- /dev/null +++ b/opencompass/configs/datasets/lcsts/lcsts_gen_8ee1fe.py @@ -0,0 +1,32 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator +from opencompass.datasets import LCSTSDataset, lcsts_postprocess + +lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst') + +lcsts_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='阅读以下文章,并给出简短的摘要:{content}\n摘要如下:'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcsts_eval_cfg = dict( + evaluator=dict(type=JiebaRougeEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=lcsts_postprocess), +) + +lcsts_datasets = [ + dict( + type=LCSTSDataset, + abbr='lcsts', + path='opencompass/LCSTS', + reader_cfg=lcsts_reader_cfg, + infer_cfg=lcsts_infer_cfg, + eval_cfg=lcsts_eval_cfg) +] diff --git a/opencompass/configs/datasets/lcsts/lcsts_gen_9b0b89.py b/opencompass/configs/datasets/lcsts/lcsts_gen_9b0b89.py new file mode 100644 index 0000000000000000000000000000000000000000..5171ca25d8d29aa17558495e9aa80c08cfefb7d8 --- /dev/null +++ b/opencompass/configs/datasets/lcsts/lcsts_gen_9b0b89.py @@ -0,0 +1,28 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator +from opencompass.datasets import LCSTSDataset, lcsts_postprocess + +lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst') + +lcsts_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, template='阅读文章:{content}\n根据上文,给出简短的单个摘要:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcsts_eval_cfg = dict( + evaluator=dict(type=JiebaRougeEvaluator), + pred_postprocessor=dict(type=lcsts_postprocess), +) + +lcsts_datasets = [ + dict( + type=LCSTSDataset, + abbr='lcsts', + path='opencompass/LCSTS', + reader_cfg=lcsts_reader_cfg, + infer_cfg=lcsts_infer_cfg, + eval_cfg=lcsts_eval_cfg) +] diff --git a/opencompass/configs/datasets/leval/leval.py b/opencompass/configs/datasets/leval/leval.py new file mode 100644 index 0000000000000000000000000000000000000000..889b15065513d008b14dec563c7753e35337971b --- /dev/null +++ b/opencompass/configs/datasets/leval/leval.py @@ -0,0 +1,23 @@ +from mmengine.config import read_base + +with read_base(): + from .levalnaturalquestion.leval_naturalquestion_gen import LEval_nq_datasets + from .levalnarrativeqa.leval_narrativeqa_gen import LEval_narrativeqa_datasets + from .levalmultidocqa.leval_multidocqa_gen import LEval_multidocqa_datasets + from .levalcoursera.leval_coursera_gen import LEval_coursera_datasets + from .levaltpo.leval_tpo_gen import LEval_tpo_datasets + from .levalquality.leval_quality_gen import LEval_quality_datasets + from .levalgsm100.leval_gsm100_gen import LEval_gsm100_datasets + from .levaltopicretrieval.leval_topic_retrieval_gen import LEval_tr_datasets + from .levalfinancialqa.leval_financialqa_gen import LEval_financialqa_datasets + from .levalgovreportsumm.leval_gov_report_summ_gen import LEval_govreport_summ_datasets + from .levallegalcontractqa.leval_legalcontractqa_gen import 
LEval_legalqa_datasets + from .levalmeetingsumm.leval_meetingsumm_gen import LEval_meetingsumm_datasets + from .levalnewssumm.leval_newssumm_gen import LEval_newssumm_datasets + from .levalpaperassistant.leval_paper_assistant_gen import LEval_ps_summ_datasets + from .levalpatentsumm.leval_patent_summ_gen import LEval_patent_summ_datasets + from .levaltvshowsumm.leval_tvshow_summ_gen import LEval_tvshow_summ_datasets + from .levalscientificqa.leval_scientificqa_gen import LEval_scientificqa_datasets + from .levalreviewsumm.leval_review_summ_gen import LEval_review_summ_datasets + +leval_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py b/opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py new file mode 100644 index 0000000000000000000000000000000000000000..8dd3e41e9f16d5665a8f1c666c93eade9c0c34e0 --- /dev/null +++ b/opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py @@ -0,0 +1,90 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import ( + MATHDataset, MATHAgentEvaluator, math_postprocess_v2 +) +# use pal format but not perform well +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # # ################################### NEW SHOT ################################### + dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify + +def solution(): + x = symbols('x') + expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2) + simplified_expr = simplify(expr) + + x3_coefficient = simplified_expr.as_coefficients_dict()[x**3] + result = x3_coefficient + return result"""), + dict(role='SYSTEM', prompt='Response:26'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'), + dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math + +def solution(): + radius = 6 + + # Surface area of the hemisphere + hemisphere_area = 2 * math.pi * radius**2 + + # Area of the circular base + base_area = math.pi * radius**2 + + # Total surface area + total_surface_area = hemisphere_area + base_area + + # Formatting the result in LaTeX + result = r'{}\pi'.format(total_surface_area / math.pi) + return result"""), + dict(role='SYSTEM', prompt='Response:108.0\\pi'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'), + dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? 
Express your answer as a dollar value to the nearest cent.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution(): + # Probabilities of each outcome + prime_prob = 1 / 6 + composite_prob = 1 / 3 + otherwise_prob = 1 / 6 + + # Expected value of each outcome + prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob) + composite_expected_value = 0 * composite_prob + otherwise_expected_value = -3 * otherwise_prob + + # Total expected value + total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value + + # Dollar value to the nearest cent + result = "{:.2f}".format(total_expected_value) + return result"""), + dict(role='SYSTEM', prompt='Response:1.17'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'), + dict(role='HUMAN', prompt='{problem}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer), +) + +math_eval_cfg = dict( + evaluator=dict( + type=MATHAgentEvaluator, + version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + abbr='math-agent', + type=MATHDataset, + path='./data/math/math.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/math/math_0shot_gen_393424.py b/opencompass/configs/datasets/math/math_0shot_gen_393424.py new file mode 100644 index 0000000000000000000000000000000000000000..d2fef53cf734a8f2289638bfea98ff151a405227 --- /dev/null +++ b/opencompass/configs/datasets/math/math_0shot_gen_393424.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py b/opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py new file mode 100644 index 0000000000000000000000000000000000000000..1e8696798b45fab4c38806ed1a54ca63f9523e46 --- /dev/null +++ b/opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py @@ -0,0 +1,30 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +with read_base(): + from .math_4shot_example_from_google_research import prompt + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + 
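+    # Few-shot prompt for base models: the 4 worked examples imported above from
+    # math_4shot_example_from_google_research are prepended to each problem, and
+    # generation is stopped once the model starts a new 'Problem' block (see
+    # stopping_criteria on the inferencer below).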
prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048, stopping_criteria=['Problem', '问题:'])) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py b/opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py new file mode 100644 index 0000000000000000000000000000000000000000..95dd620ef1e06fef3eabb0ff26d14378d83797be --- /dev/null +++ b/opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py @@ -0,0 +1,30 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +with read_base(): + from .math_4shot_example_from_google_research import prompt + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=['Problem'])) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_4shot_example_from_google_research.py b/opencompass/configs/datasets/math/math_4shot_example_from_google_research.py new file mode 100644 index 0000000000000000000000000000000000000000..80feee446404f4543da2be0b862a4baaf5422010 --- /dev/null +++ b/opencompass/configs/datasets/math/math_4shot_example_from_google_research.py @@ -0,0 +1,40 @@ +# Solving Quantitative Reasoning Problems with Language Models + +prompt = ''' +Problem: +Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$. + +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. + +Problem: +If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$ + +Solution: +We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? + +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. 
If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: +\\begin{align*} +30n&=480\\ +\\Rightarrow\\qquad n&=480/30=\\boxed{16} +\\end{align*} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations +\\begin{align*} +6x-4y&=a,\\ +6y-9x &=b. +\\end{align*} +has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero. + +Solution: +If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$ +Final Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct. +'''.strip() diff --git a/opencompass/configs/datasets/math/math_agent_gen_861b4f.py b/opencompass/configs/datasets/math/math_agent_gen_861b4f.py new file mode 100644 index 0000000000000000000000000000000000000000..ad1a7272db810144db0bf177810117fa249a8991 --- /dev/null +++ b/opencompass/configs/datasets/math/math_agent_gen_861b4f.py @@ -0,0 +1,90 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import ( + MATHDataset, MATHAgentEvaluator, math_postprocess +) + +# use pal format but not perform well +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # # ################################### NEW SHOT ################################### + dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify + +def solution(): + x = symbols('x') + expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2) + simplified_expr = simplify(expr) + + x3_coefficient = simplified_expr.as_coefficients_dict()[x**3] + result = x3_coefficient + return result"""), + dict(role='SYSTEM', prompt='Response:26'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'), + dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math + +def solution(): + radius = 6 + + # Surface area of the hemisphere + hemisphere_area = 2 * math.pi * radius**2 + + # Area of the circular base + base_area = math.pi * radius**2 + + # Total surface area + total_surface_area = hemisphere_area + base_area + + # Formatting the result in LaTeX + result = r'{}\pi'.format(total_surface_area / math.pi) + return result"""), + dict(role='SYSTEM', prompt='Response:108.0\\pi'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'), + dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. 
What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution(): + # Probabilities of each outcome + prime_prob = 1 / 6 + composite_prob = 1 / 3 + otherwise_prob = 1 / 6 + + # Expected value of each outcome + prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob) + composite_expected_value = 0 * composite_prob + otherwise_expected_value = -3 * otherwise_prob + + # Total expected value + total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value + + # Dollar value to the nearest cent + result = "{:.2f}".format(total_expected_value) + return result"""), + dict(role='SYSTEM', prompt='Response:1.17'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'), + dict(role='HUMAN', prompt='{problem}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer), +) + +math_eval_cfg = dict( + evaluator=dict(type=MATHAgentEvaluator), + pred_postprocessor=dict(type=math_postprocess), +) + +math_datasets = [ + dict( + abbr='math-agent', + type=MATHDataset, + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/math/math_agent_gen_af2293.py b/opencompass/configs/datasets/math/math_agent_gen_af2293.py new file mode 100644 index 0000000000000000000000000000000000000000..51b3500b70c0fd40d76de61dedb25aefa2c125b9 --- /dev/null +++ b/opencompass/configs/datasets/math/math_agent_gen_af2293.py @@ -0,0 +1,103 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import AgentInferencer +from opencompass.datasets import ( + MATHDataset, MATHAgentEvaluator, math_postprocess +) + +# use pal format but not perform well +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + # # ################################### NEW SHOT ################################### + dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify + +def solution(): + x = symbols('x') + expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2) + simplified_expr = simplify(expr) + + x3_coefficient = simplified_expr.as_coefficients_dict()[x**3] + result = x3_coefficient + return result"""), + dict(role='SYSTEM', prompt='Response:26'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'), + dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? 
Express your answer in terms of $\pi$.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math + +def solution(): + radius = 6 + + # Surface area of the hemisphere + hemisphere_area = 2 * math.pi * radius**2 + + # Area of the circular base + base_area = math.pi * radius**2 + + # Total surface area + total_surface_area = hemisphere_area + base_area + + # Formatting the result in LaTeX + result = r'{}\pi'.format(total_surface_area / math.pi) + return result"""), + dict(role='SYSTEM', prompt='Response:108.0\\pi'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'), + dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution(): + # Probabilities of each outcome + prime_prob = 1 / 6 + composite_prob = 1 / 3 + otherwise_prob = 1 / 6 + + # Expected value of each outcome + prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob) + composite_expected_value = 0 * composite_prob + otherwise_expected_value = -3 * otherwise_prob + + # Total expected value + total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value + + # Dollar value to the nearest cent + result = "{:.2f}".format(total_expected_value) + return result"""), + dict(role='SYSTEM', prompt='Response:1.17'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'), + dict(role='HUMAN', prompt='Given $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$'), + dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import numpy as np + +def solution() + a = np.array([-7, 0, 1]) + b = np.array([4, 2, -1]) + + result = a - 3 * b + + result = r'\\begin{{pmatrix}} {} \ {} \ {} \end{{pmatrix}}'.format(result[0], result[1], result[2]) + return result"""), + dict(role='SYSTEM', prompt='Response:\\begin{pmatrix} -19 \\ -6 \\ 4 \\end{pmatrix}'), + dict(role='BOT', prompt='FinalAnswer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \\end{pmatrix}$. 
I hope it is correct.'), + dict(role='HUMAN', prompt='{problem}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=AgentInferencer), +) + +math_eval_cfg = dict( + evaluator=dict(type=MATHAgentEvaluator), + pred_postprocessor=dict(type=math_postprocess), +) + +math_datasets = [ + dict( + abbr='math-agent', + type=MATHDataset, + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py b/opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py new file mode 100644 index 0000000000000000000000000000000000000000..6ca42154b318f6196f6aff62d4d894ace29eea69 --- /dev/null +++ b/opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py @@ -0,0 +1,38 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'), + dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'), + dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'), + dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'), + dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. 
I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_gen_1ed9c2.py b/opencompass/configs/datasets/math/math_gen_1ed9c2.py new file mode 100644 index 0000000000000000000000000000000000000000..a168d511875612bc62d9fae0d61dac18b65e17f0 --- /dev/null +++ b/opencompass/configs/datasets/math/math_gen_1ed9c2.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.\nSolution:'), + dict(role='BOT', prompt='Combine like terms to simplify the expression. The coefficient of $x^3$ is calculated as $$(-3+2\cdot(2+1))+(-5)\cdot(-4))$ = 26$. Thus, the coefficient of $x^3$ is $\\boxed{26}$.\nFinal Answer: The final answer is $26$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nThe surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.\nSolution:'), + dict(role='BOT', prompt='The surface area of a hemisphere (not including the base) is half that of a sphere, so it is $2\pi r^2$. The area of the base is $\pi r^2$. Therefore, for a hemisphere with radius 6 cm, the total surface area is $2\pi (6)^2 + \pi (6)^2 = 108\pi$ square cm.\nFinal Answer: The final answer is $108\pi$ square cm. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nMonica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.\nSolution:'), + dict(role='BOT', prompt='The prime numbers rolled could be 2, 3, or 5, and each has a 1/6 chance of being rolled. The composite number 4 or 6 has a 2/6 chance of being rolled, but it results in $0 win. The remaining non-prime and non-composite number is 1 , and it results in a loss of $3, with a 1/6 chance. So, the expected winnings are $(2+3+5)(1/6)+0(2/6)+(-3)(1/6) = \$1.17$.\nFinal Answer: The final answer is $\$1.17$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nGiven $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$\nSolution:'), + dict(role='BOT', prompt='We find $3 \mathbf{b}$ first, which is $\\begin{pmatrix} 12 \\ 6 \\ -3 \end{pmatrix}$. 
Then we subtract this vector from $\mathbf{a}$. So, $\mathbf{a} - 3 \mathbf{b} = \\begin{pmatrix} -7 - 12 \\ 0 - 6 \\ 1 - (-3) \end{pmatrix} = \\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}.$\nFinal Answer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_gen_559593.py b/opencompass/configs/datasets/math/math_gen_559593.py new file mode 100644 index 0000000000000000000000000000000000000000..18da00285450af578088c3e7ac2b0fd1b8b46de8 --- /dev/null +++ b/opencompass/configs/datasets/math/math_gen_559593.py @@ -0,0 +1,53 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='''Problem: +Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}} +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. + +Problem: +If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$ +Solution: +We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero. +Solution: +If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$ +Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct. 
+ +Problem: +{problem} +Solution: +{solution}'''), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_gen_5e8458.py b/opencompass/configs/datasets/math/math_gen_5e8458.py new file mode 100644 index 0000000000000000000000000000000000000000..ed9c3e5fbe1d85d8bb39bbdb47aa4264db2bdf0e --- /dev/null +++ b/opencompass/configs/datasets/math/math_gen_5e8458.py @@ -0,0 +1,53 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='''Problem: +Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}} +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. + +Problem: +If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$ +Solution: +We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero. +Solution: +If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$ +Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct. 
+ +Problem: +{problem}Solution: +{solution}'''), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=dict( + input_columns=['problem'], + output_column='solution', + ), + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_gen_78ced2.py b/opencompass/configs/datasets/math/math_gen_78ced2.py new file mode 100644 index 0000000000000000000000000000000000000000..9088b9758468efc396a82b8739656ad83c996805 --- /dev/null +++ b/opencompass/configs/datasets/math/math_gen_78ced2.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +QUERY_TEMPLATE = """ +Solve the following math problem step by step. The last line of your response should be of the form ANSWER: $ANSWER (without quotes) where $ANSWER is the answer to the problem. + +{problem} + +Remember to put your answer on its own line after "ANSWER:", and you do not need to use a \\boxed command. +""".strip() + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + + template=dict(round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_intern_evaluator_gen_265cce.py b/opencompass/configs/datasets/math/math_intern_evaluator_gen_265cce.py new file mode 100644 index 0000000000000000000000000000000000000000..1a6cbf379ea6f43e87992e76289364183a4602a8 --- /dev/null +++ b/opencompass/configs/datasets/math/math_intern_evaluator_gen_265cce.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHInternEvaluator, math_intern_postprocess + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'), + dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. 
I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'), + dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'), + dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'), + dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHInternEvaluator), pred_postprocessor=dict(type=math_intern_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_llm_judge.py b/opencompass/configs/datasets/math/math_llm_judge.py new file mode 100644 index 0000000000000000000000000000000000000000..6a81bea27fc835f9474440f3c188144ccd3a0d42 --- /dev/null +++ b/opencompass/configs/datasets/math/math_llm_judge.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess + +QUERY_TEMPLATE = """ +Solve the following math problem step by step. The last line of your response should be of the form ANSWER: $ANSWER (without quotes) where $ANSWER is the answer to the problem. +{problem} +Remember to put your answer on its own line after "ANSWER:", and you do not need to use a \\boxed command. 
+""".strip() + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + + template=dict(round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_cot_gen_08c1de.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_cot_gen_08c1de.py new file mode 100644 index 0000000000000000000000000000000000000000..341e13cf57195860d229764488ec815c2514a98b --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_cot_gen_08c1de.py @@ -0,0 +1,64 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUProDataset +from opencompass.utils.text_postprocessors import match_answer_pattern + +with read_base(): + from .mmlu_pro_categories import categories + + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. + +Question:\n +{question} + +Options:\n +{options_str} + +""".strip() + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', + prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict( + type=match_answer_pattern, + answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])') + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_categories.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_categories.py new file mode 100644 index 0000000000000000000000000000000000000000..eff38983e0a06b5a806239fd644e85ed19f141a0 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_categories.py @@ -0,0 +1,16 @@ +categories = [ + 'math', + 'physics', + 'chemistry', + 'law', + 'engineering', + 'other', + 'economics', + 'health', + 'psychology', + 'business', + 'biology', + 'philosophy', + 'computer science', + 'history', +] diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py new file mode 100644 index 
0000000000000000000000000000000000000000..dc12fd1d9d18e4b5ea744671e9a495285d1a20f2 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_few_shot_gen_bfaf90.py @@ -0,0 +1,47 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator + +with read_base(): + from .mmlu_pro_categories import categories + +mmlu_pro_datasets = [] + +for category in categories: + hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.' + question_and_options = 'Question:\n{question}\nOptions:\n{options_str}' + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer_string', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=f'{question_and_options}\nAnswer: {{answer}}'), + prompt_template=dict( + type=PromptTemplate, + template=f'{hint}\n{question_and_options}\nAnswer: ', + ice_token='' + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer, max_out_len=100) + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=MMLUProBaseEvaluator) + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py new file mode 100644 index 0000000000000000000000000000000000000000..42c30131ff8fcac716063df504fc6866adba0488 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen_cdbebf.py @@ -0,0 +1,59 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUProDataset +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mmlu_pro_categories import categories + + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + + mmlu_pro_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Question:\n{question}\nOptions:\n{options_str}'), + dict(role='BOT', prompt="Answer: Let's think step by step. 
{cot_content}") + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict(role='HUMAN', prompt='Question:\n{question}\nOptions:\n{options_str}'), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCDEFGHIJKLMNOP'), + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) diff --git a/opencompass/configs/datasets/ruler/README.md b/opencompass/configs/datasets/ruler/README.md new file mode 100644 index 0000000000000000000000000000000000000000..76c42e486291a93d9ffd19d1dbbe3e91cf565fe9 --- /dev/null +++ b/opencompass/configs/datasets/ruler/README.md @@ -0,0 +1,14 @@ +# Ruler +OpenCompass now supports [RULER](https://arxiv.org/pdf/2404.06654), a new benchmark for evaluating long-context language models. RULER covers retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations. + +OpenCompass provides two evaluation demos, which differ in the tokenizer used to build the long contexts. + +To use one fixed tokenizer for all models (typically the GPT-4 tokenizer), follow the demo (configs/eval_ruler_fix_tokenizer.py), where most of the settings are already defined. + + +To evaluate with each model's own tokenizer, the settings depend on the model being evaluated, so create a new evaluation script following the example (configs/eval_ruler.py) and adjust the context window sizes or add models to match your setup. 
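+For example, the following is a minimal sketch (not one of the files added here) of how the per-length configs in this directory could be combined into a custom selection of context lengths, mirroring the pattern used by ruler_combined_gen.py; it assumes the snippet lives in a config alongside these files:
+
+```python
+from mmengine.config import read_base
+
+with read_base():
+    from .ruler_4k_gen import ruler_datasets as ruler_4k_ds
+    from .ruler_32k_gen import ruler_datasets as ruler_32k_ds
+
+# Concatenate the selected length variants into a single dataset list.
+ruler_selected_datasets = sum((v for k, v in locals().items() if k.endswith('_ds')), [])
+```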
+ +```bash +python run.py configs/eval_ruler_fix_tokenizer.py # For evaluation with GPT-4 tokenizer +python run.py configs/eval_ruler.py # For evaluation with model's tokenizer +``` diff --git a/opencompass/configs/datasets/ruler/ruler_128k_gen.py b/opencompass/configs/datasets/ruler/ruler_128k_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..4f302ad250cf3747efdc3b971c86596af60ab077 --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_128k_gen.py @@ -0,0 +1,28 @@ +from mmengine.config import read_base + +with read_base(): + from .ruler_cwe_gen import cwe_datasets as cwe # CWE + from .ruler_fwe_gen import fwe_datasets as fwe # FWE + from .ruler_niah_gen import niah_datasets as niah # Niah + from .ruler_qa_gen import qa_datasets as qa # QA + from .ruler_vt_gen import vt_datasets as vt # VT + + +import_ds = sum((cwe, fwe, niah, qa, vt), []) + +# Evaluation config +NUM_SAMPLES = 100 # Change to the number of samples you need +# Change the context lengths to be tested +max_seq_lens = [1024 * 128] +abbr_suffixs = ['128k'] + +ruler_datasets = [] + +# Different seq length +for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs): + for dataset in import_ds: + tmp_dataset = dataset.deepcopy() + tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix + tmp_dataset['num_samples'] = NUM_SAMPLES + tmp_dataset['max_seq_length'] = max_seq_len + ruler_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/ruler/ruler_16k_gen.py b/opencompass/configs/datasets/ruler/ruler_16k_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..faab3ccab053d5b75113fb787d38b23fe31f06d6 --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_16k_gen.py @@ -0,0 +1,29 @@ + +from mmengine.config import read_base + +with read_base(): + from .ruler_cwe_gen import cwe_datasets as cwe # CWE + from .ruler_fwe_gen import fwe_datasets as fwe # FWE + from .ruler_niah_gen import niah_datasets as niah # Niah + from .ruler_qa_gen import qa_datasets as qa # QA + from .ruler_vt_gen import vt_datasets as vt # VT + + +import_ds = sum((cwe, fwe, niah, qa, vt), []) + +# Evaluation config +NUM_SAMPLES = 100 # Change to the number of samples you need +# Change the context lengths to be tested +max_seq_lens = [1024 * 16] +abbr_suffixs = ['16k'] + +ruler_datasets = [] + +# Different seq length +for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs): + for dataset in import_ds: + tmp_dataset = dataset.deepcopy() + tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix + tmp_dataset['num_samples'] = NUM_SAMPLES + tmp_dataset['max_seq_length'] = max_seq_len + ruler_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/ruler/ruler_1m_gen.py b/opencompass/configs/datasets/ruler/ruler_1m_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..0b20375c45c27dc22d8d3d054bdb03a14ba4361b --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_1m_gen.py @@ -0,0 +1,29 @@ + +from mmengine.config import read_base + +with read_base(): + from .ruler_cwe_gen import cwe_datasets as cwe # CWE + from .ruler_fwe_gen import fwe_datasets as fwe # FWE + from .ruler_niah_gen import niah_datasets as niah # Niah + from .ruler_qa_gen import qa_datasets as qa # QA + from .ruler_vt_gen import vt_datasets as vt # VT + + +import_ds = sum((cwe, fwe, niah, qa, vt), []) + +# Evaluation config +NUM_SAMPLES = 100 # Change to the number of samples you need +# Change the context lengths to be tested +max_seq_lens = [1024 * 1024] 
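+# 1024 * 1024 = 1,048,576 tokens, i.e. roughly a 1M-token context window.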
+abbr_suffixs = ['1m'] + +ruler_datasets = [] + +# Different seq length +for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs): + for dataset in import_ds: + tmp_dataset = dataset.deepcopy() + tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix + tmp_dataset['num_samples'] = NUM_SAMPLES + tmp_dataset['max_seq_length'] = max_seq_len + ruler_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/ruler/ruler_32k_gen.py b/opencompass/configs/datasets/ruler/ruler_32k_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..ab02cb4a2f5e71c674f6932fef79824255090547 --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_32k_gen.py @@ -0,0 +1,29 @@ + +from mmengine.config import read_base + +with read_base(): + from .ruler_cwe_gen import cwe_datasets as cwe # CWE + from .ruler_fwe_gen import fwe_datasets as fwe # FWE + from .ruler_niah_gen import niah_datasets as niah # Niah + from .ruler_qa_gen import qa_datasets as qa # QA + from .ruler_vt_gen import vt_datasets as vt # VT + + +import_ds = sum((cwe, fwe, niah, qa, vt), []) + +# Evaluation config +NUM_SAMPLES = 100 # Change to the number of samples you need +# Change the context lengths to be tested +max_seq_lens = [1024 * 32] +abbr_suffixs = ['32k'] + +ruler_datasets = [] + +# Different seq length +for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs): + for dataset in import_ds: + tmp_dataset = dataset.deepcopy() + tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix + tmp_dataset['num_samples'] = NUM_SAMPLES + tmp_dataset['max_seq_length'] = max_seq_len + ruler_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/ruler/ruler_4k_gen.py b/opencompass/configs/datasets/ruler/ruler_4k_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..f003150725699001f063eacca38a727d84d3425b --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_4k_gen.py @@ -0,0 +1,28 @@ +from mmengine.config import read_base + +with read_base(): + from .ruler_cwe_gen import cwe_datasets as cwe # CWE + from .ruler_fwe_gen import fwe_datasets as fwe # FWE + from .ruler_niah_gen import niah_datasets as niah # Niah + from .ruler_qa_gen import qa_datasets as qa # QA + from .ruler_vt_gen import vt_datasets as vt # VT + + +import_ds = sum((cwe, fwe, niah, qa, vt), []) + +# Evaluation config +NUM_SAMPLES = 100 # Change to the number of samples you need +# Change the context lengths to be tested +max_seq_lens = [1024 * 4] +abbr_suffixs = ['4k'] + +ruler_datasets = [] + +# Different seq length +for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs): + for dataset in import_ds: + tmp_dataset = dataset.deepcopy() + tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix + tmp_dataset['num_samples'] = NUM_SAMPLES + tmp_dataset['max_seq_length'] = max_seq_len + ruler_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/ruler/ruler_64k_gen.py b/opencompass/configs/datasets/ruler/ruler_64k_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..709260d6c69a23a90b995bbd4235be85aa62536e --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_64k_gen.py @@ -0,0 +1,28 @@ +from mmengine.config import read_base + +with read_base(): + from .ruler_cwe_gen import cwe_datasets as cwe # CWE + from .ruler_fwe_gen import fwe_datasets as fwe # FWE + from .ruler_niah_gen import niah_datasets as niah # Niah + from .ruler_qa_gen import qa_datasets as qa # QA + from .ruler_vt_gen import vt_datasets as vt # VT + + +import_ds = 
sum((cwe, fwe, niah, qa, vt), []) + +# Evaluation config +NUM_SAMPLES = 100 # Change to the number of samples you need +# Change the context lengths to be tested +max_seq_lens = [1024 * 64] +abbr_suffixs: list[str] = ['64k'] + +ruler_datasets = [] + +# Different seq length +for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs): + for dataset in import_ds: + tmp_dataset = dataset.deepcopy() + tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix + tmp_dataset['num_samples'] = NUM_SAMPLES + tmp_dataset['max_seq_length'] = max_seq_len + ruler_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/ruler/ruler_8k_gen.py b/opencompass/configs/datasets/ruler/ruler_8k_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..8c9a4ad8f4eceb31bb37a5f3c9dfcbf17d608f66 --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_8k_gen.py @@ -0,0 +1,29 @@ + +from mmengine.config import read_base + +with read_base(): + from .ruler_cwe_gen import cwe_datasets as cwe # CWE + from .ruler_fwe_gen import fwe_datasets as fwe # FWE + from .ruler_niah_gen import niah_datasets as niah # Niah + from .ruler_qa_gen import qa_datasets as qa # QA + from .ruler_vt_gen import vt_datasets as vt # VT + + +import_ds = sum((cwe, fwe, niah, qa, vt), []) + +# Evaluation config +NUM_SAMPLES = 100 # Change to the number of samples you need +# Change the context lengths to be tested +max_seq_lens = [1024 * 8] +abbr_suffixs = ['8k'] + +ruler_datasets = [] + +# Different seq length +for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs): + for dataset in import_ds: + tmp_dataset = dataset.deepcopy() + tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix + tmp_dataset['num_samples'] = NUM_SAMPLES + tmp_dataset['max_seq_length'] = max_seq_len + ruler_datasets.append(tmp_dataset) diff --git a/opencompass/configs/datasets/ruler/ruler_combined_gen.py b/opencompass/configs/datasets/ruler/ruler_combined_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..077c4f10672c593f7768bc1b202798350119dd63 --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_combined_gen.py @@ -0,0 +1,12 @@ +from mmengine.config import read_base + +with read_base(): + from .ruler_1m_gen import ruler_datasets as ruler_1m_ds + from .ruler_4k_gen import ruler_datasets as ruler_4k_ds + from .ruler_8k_gen import ruler_datasets as ruler_8k_ds + from .ruler_16k_gen import ruler_datasets as ruler_16k_ds + from .ruler_32k_gen import ruler_datasets as ruler_32k_ds + from .ruler_64k_gen import ruler_datasets as ruler_64k_ds + from .ruler_128k_gen import ruler_datasets as ruler_128k_ds + +ruler_combined_datasets = sum((v for k, v in locals().items() if k.endswith('_ds')), []) diff --git a/opencompass/configs/datasets/ruler/ruler_cwe_gen.py b/opencompass/configs/datasets/ruler/ruler_cwe_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..2bc8fb4a0bb20b58ddcdeffebfff1c357f667d45 --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_cwe_gen.py @@ -0,0 +1,34 @@ +from opencompass.datasets.ruler.ruler_cwe import RulerCweDataset +from opencompass.datasets.ruler.ruler_cwe import RulerCweEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +# CWE Dataset +cwe_datasets = [ + { + 'abbr': 'ruler_cwe', + 'type': RulerCweDataset, + 'freq_cw': 30, + 'freq_ucw': 3, + 'num_cw': 10, + 'tokens_to_generate': 120, + 
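+        # CWE (common-word extraction), as described in the RULER paper: each of the
+        # num_cw target words appears freq_cw times, distractor words appear only
+        # freq_ucw times, and the model must list the frequent (common) words.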
'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ), + 'eval_cfg': dict( + evaluator=dict(type=RulerCweEvaluator), + ), + } +] diff --git a/opencompass/configs/datasets/ruler/ruler_fwe_gen.py b/opencompass/configs/datasets/ruler/ruler_fwe_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..c5585c309453e426008cbce40eaca7df7ca564b0 --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_fwe_gen.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.ruler.ruler_fwe import RulerFweDataset +from opencompass.datasets.ruler.ruler_fwe import RulerFweEvaluator + +# FWE Dataset +fwe_datasets = [ + { + 'abbr': 'ruler_fwe', + 'type': RulerFweDataset, + 'tokens_to_generate': 50, + 'alpha': 2.0, + 'coded_wordlen': 6, + 'reader_cfg': dict(input_columns=['prompt'], output_column='answer'), + 'infer_cfg': dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ), + 'eval_cfg': dict( + evaluator=dict(type=RulerFweEvaluator), + ), + } +] diff --git a/opencompass/configs/datasets/ruler/ruler_niah_gen.py b/opencompass/configs/datasets/ruler/ruler_niah_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..bb6e79a5a678465b77215a24f62c26f5b993890a --- /dev/null +++ b/opencompass/configs/datasets/ruler/ruler_niah_gen.py @@ -0,0 +1,123 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset +from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator + + +# Ruler Dataset settings +niah_configurations = [ + { + 'abbr': 'single_1', + 'type_haystack': 'repeat', + 'type_needle_k': 'words', + 'type_needle_v': 'numbers', + 'num_needle_k': 1, + 'num_needle_v': 1, + 'num_needle_q': 1, + }, + { + 'abbr': 'single_2', + 'type_haystack': 'essay', + 'type_needle_k': 'words', + 'type_needle_v': 'numbers', + 'num_needle_k': 1, + 'num_needle_v': 1, + 'num_needle_q': 1, + }, + { + 'abbr': 'single_3', + 'type_haystack': 'essay', + 'type_needle_k': 'words', + 'type_needle_v': 'uuids', + 'num_needle_k': 1, + 'num_needle_v': 1, + 'num_needle_q': 1, + }, + { + 'abbr': 'multikey_1', + 'type_haystack': 'essay', + 'type_needle_k': 'words', + 'type_needle_v': 'numbers', + 'num_needle_k': 4, + 'num_needle_v': 1, + 'num_needle_q': 1, + }, + { + 'abbr': 'multikey_2', + 'type_haystack': 'needle', + 'type_needle_k': 'words', + 'type_needle_v': 'numbers', + 'num_needle_k': 1, + 'num_needle_v': 1, + 'num_needle_q': 1, + }, + { + 'abbr': 'multikey_3', + 'type_haystack': 'needle', + 'type_needle_k': 'uuids', + 'type_needle_v': 'uuids', + 'num_needle_k': 1, + 'num_needle_v': 1, + 'num_needle_q': 1, + }, + { + 'abbr': 'multivalue', + 'type_haystack': 'essay', + 'type_needle_k': 'words', + 'type_needle_v': 'numbers', + 
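+        # 'multivalue' (per the RULER setup): a single needle key is paired with
+        # several values (num_needle_v below), and all of them must be retrieved.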
diff --git a/opencompass/configs/datasets/ruler/ruler_niah_gen.py b/opencompass/configs/datasets/ruler/ruler_niah_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb6e79a5a678465b77215a24f62c26f5b993890a
--- /dev/null
+++ b/opencompass/configs/datasets/ruler/ruler_niah_gen.py
@@ -0,0 +1,123 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
+from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator
+
+
+# Ruler Dataset settings
+niah_configurations = [
+    {
+        'abbr': 'single_1',
+        'type_haystack': 'repeat',
+        'type_needle_k': 'words',
+        'type_needle_v': 'numbers',
+        'num_needle_k': 1,
+        'num_needle_v': 1,
+        'num_needle_q': 1,
+    },
+    {
+        'abbr': 'single_2',
+        'type_haystack': 'essay',
+        'type_needle_k': 'words',
+        'type_needle_v': 'numbers',
+        'num_needle_k': 1,
+        'num_needle_v': 1,
+        'num_needle_q': 1,
+    },
+    {
+        'abbr': 'single_3',
+        'type_haystack': 'essay',
+        'type_needle_k': 'words',
+        'type_needle_v': 'uuids',
+        'num_needle_k': 1,
+        'num_needle_v': 1,
+        'num_needle_q': 1,
+    },
+    {
+        'abbr': 'multikey_1',
+        'type_haystack': 'essay',
+        'type_needle_k': 'words',
+        'type_needle_v': 'numbers',
+        'num_needle_k': 4,
+        'num_needle_v': 1,
+        'num_needle_q': 1,
+    },
+    {
+        'abbr': 'multikey_2',
+        'type_haystack': 'needle',
+        'type_needle_k': 'words',
+        'type_needle_v': 'numbers',
+        'num_needle_k': 1,
+        'num_needle_v': 1,
+        'num_needle_q': 1,
+    },
+    {
+        'abbr': 'multikey_3',
+        'type_haystack': 'needle',
+        'type_needle_k': 'uuids',
+        'type_needle_v': 'uuids',
+        'num_needle_k': 1,
+        'num_needle_v': 1,
+        'num_needle_q': 1,
+    },
+    {
+        'abbr': 'multivalue',
+        'type_haystack': 'essay',
+        'type_needle_k': 'words',
+        'type_needle_v': 'numbers',
+        'num_needle_k': 1,
+        'num_needle_v': 4,
+        'num_needle_q': 1,
+    },
+    {
+        'abbr': 'multiquery',
+        'type_haystack': 'essay',
+        'type_needle_k': 'words',
+        'type_needle_v': 'numbers',
+        'num_needle_k': 1,
+        'num_needle_v': 1,
+        'num_needle_q': 4,
+    },
+]
+
+niah_datasets = []
+
+# NIAH Dataset
+base_path = './data/ruler'
+file_path = 'PaulGrahamEssays.jsonl'
+for index, config in enumerate(niah_configurations):
+    dataset_dict = {
+        'abbr': f'ruler_niah_{config["abbr"]}',
+        'type': RulerNiahDataset,
+        'base_path': base_path,
+        'file_path': file_path,
+        # 'tokenizer_model': model_path,
+        'tokens_to_generate': 128,
+        # 'max_seq_length': max_seq_len,
+        # 'num_samples': NUM_SAMPLES,
+        'type_haystack': config['type_haystack'],
+        'type_needle_k': config['type_needle_k'],
+        'type_needle_v': config['type_needle_v'],
+        'num_needle_k': config['num_needle_k'],
+        'num_needle_v': config['num_needle_v'],
+        'num_needle_q': config['num_needle_q'],
+        'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
+        'infer_cfg': dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    round=[
+                        dict(role='HUMAN', prompt='{prompt}'),
+                        dict(role='BOT', prompt='{answer}\n'),
+                    ]
+                ),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer),
+        ),
+        'eval_cfg': dict(
+            evaluator=dict(type=RulerNiahEvaluator),
+        ),
+    }
+    niah_datasets.append(dataset_dict)
diff --git a/opencompass/configs/datasets/ruler/ruler_qa_gen.py b/opencompass/configs/datasets/ruler/ruler_qa_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c4688397b250cd81abaac7032fc098da6ec692d
--- /dev/null
+++ b/opencompass/configs/datasets/ruler/ruler_qa_gen.py
@@ -0,0 +1,38 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.ruler.ruler_qa import RulerQaDataset
+from opencompass.datasets.ruler.ruler_qa import RulerQaEvaluator
+
+qa_configurations = [
+    {'dataset': 'squad', 'path': './data/ruler/dev-v2.0.json'},
+    {'dataset': 'hotpotqa', 'path': './data/ruler/hotpotqa.json'},
+]
+
+qa_datasets = []
+for index, config in enumerate(qa_configurations):
+    dataset_dict = {
+        'abbr': f'ruler_qa_{config["dataset"]}',
+        'dataset': config['dataset'],
+        'path': config['path'],
+        'type': RulerQaDataset,
+        'tokens_to_generate': 50,
+        'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
+        'infer_cfg': dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    round=[
+                        dict(role='HUMAN', prompt='{prompt}'),
+                        dict(role='BOT', prompt='{answer}\n'),
+                    ]
+                ),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer),
+        ),
+        'eval_cfg': dict(
+            evaluator=dict(type=RulerQaEvaluator),
+        ),
+    }
+    qa_datasets.append(dataset_dict)
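Note (illustrative, not part of the patch): the two QA subsets above expect local copies of the SQuAD 2.0 dev split and HotpotQA under ./data/ruler/. If the files sit elsewhere, the paths can be rewritten after import; the target directory below is purely hypothetical:

from mmengine.config import read_base

with read_base():
    from .ruler_qa_gen import qa_datasets

for _qa in qa_datasets:
    # Point the configs at a custom data root (hypothetical location).
    _qa['path'] = _qa['path'].replace('./data/ruler', '/my/data/ruler')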
diff --git a/opencompass/configs/datasets/ruler/ruler_vt_gen.py b/opencompass/configs/datasets/ruler/ruler_vt_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..42dadc43b27839aee4f5445faa7083318ff9169c
--- /dev/null
+++ b/opencompass/configs/datasets/ruler/ruler_vt_gen.py
@@ -0,0 +1,32 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.ruler.ruler_vt import RulerVtDataset
+from opencompass.datasets.ruler.ruler_vt import RulerVtEvaluator
+
+# VT Dataset
+vt_datasets = [
+    {
+        'abbr': 'ruler_vt',
+        'type': RulerVtDataset,
+        'num_chains': 1,
+        'num_hops': 4,
+        'reader_cfg': dict(input_columns=['prompt'], output_column='answer'),
+        'infer_cfg': dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    round=[
+                        dict(role='HUMAN', prompt='{prompt}'),
+                        dict(role='BOT', prompt='{answer}\n'),
+                    ]
+                ),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer),
+        ),
+        'eval_cfg': dict(
+            evaluator=dict(type=RulerVtEvaluator),
+        ),
+    }
+]
diff --git a/opencompass/configs/datasets/tydiqa/tydiqa_gen.py b/opencompass/configs/datasets/tydiqa/tydiqa_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..269c6334b9de9b1cbb9119557c9c8065b2a78601
--- /dev/null
+++ b/opencompass/configs/datasets/tydiqa/tydiqa_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .tydiqa_gen_978d2a import tydiqa_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/tydiqa/tydiqa_gen_978d2a.py b/opencompass/configs/datasets/tydiqa/tydiqa_gen_978d2a.py
new file mode 100644
index 0000000000000000000000000000000000000000..da4f712350c55c169f82f044abed03bdeb7ae393
--- /dev/null
+++ b/opencompass/configs/datasets/tydiqa/tydiqa_gen_978d2a.py
@@ -0,0 +1,61 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import TydiQADataset, TydiQAEvaluator
+from os import environ
+
+# All configs are for TydiQA Goldp task
+tydiqa_reader_cfg = dict(
+    input_columns=['passage_text', 'question_text'],
+    output_column='answer'
+)
+
+langs = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai']
+
+prefixs_prompt = {
+    'english': ('Answer the following question based on the information in the given passage.', 'Passage:', 'Question:', 'Answer:'),
+    'arabic': ('أجب على السؤال التالي بناءً على المعلومات في المقطع المعطى.', 'المقطع:', 'السؤال:', 'الإجابة:'),
+    'bengali': ('প্রদত্ত অধ্যায়ের তথ্যের উপর ভিত্তি করে নিম্নলিখিত প্রশ্নের উত্তর দিন।', 'অধ্যায়:', 'প্রশ্ন:', 'উত্তর:'),
+    'finnish': ('Vastaa seuraavaan kysymykseen annetun kappaleen tiedon perusteella.', 'Kappale:', 'Kysymys:', 'Vastaus:'),
+    'indonesian': ('Jawab pertanyaan berikut berdasarkan informasi di bagian yang diberikan.', 'Bagian:', 'Pertanyaan:', 'Jawaban:'),
+    'korean': ('주어진 문단의 정보에 기반하여 다음 질문에 답하십시오.', '문단:', '질문:', '답변:'),
+    'japanese': ('文脈に基づいて質問に答えてください。', 'ぶんしょう:', 'しつもん:', 'かいとう:'),
+    'russian': ('Ответьте на следующий вопрос на основе информации в данном отрывке.', 'Отрывок:', 'Вопрос:', 'Ответ:'),
+    'swahili': ('Jibu swali lifuatalo kulingana na habari kwenye kifungu kilichotolewa.', 'Kifungu:', 'Swali:', 'Jibu:'),
+    'telugu': ('ఇచ్చిన పేరాలోని సమాచారం ఆధారంగా కింది ప్రశ్నకు సమాధానం ఇవ్వండి.', 'పేరా:', 'ప్రశ్న:', 'సమాధానం:'),
+    'thai': ('ตอบคำถามต่อไปนี้โดยอิงตามข้อมูลในตอนข้อความที่กำหนด:', 'ตอนข้อความ:', 'คำถาม:', 'คำตอบ:')
+}
+
+tydiqa_datasets = []
+for _lang in langs:
+    _hint = prefixs_prompt[_lang]
+    tydiqa_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint[0]}\n\n{_hint[1]}{{passage_text}}\n{_hint[2]} {{question_text}}\n{_hint[3]} {{answer}}',
+            ice_token=''
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer), max_out_len=50
+    )
+
+    tydiqa_eval_cfg = dict(
+        evaluator=dict(type=TydiQAEvaluator),
+        ds_split='validation',
+        ds_column='answer',
+    )
+
+    # Skip japanese due to filter rules of Modelscope
+    if environ.get('DATASET_SOURCE') == 'Modelscope' and _lang == 'japanese':
+        continue
+
+    tydiqa_datasets.append(
+        dict(abbr=f'tydiqa-goldp_{_lang}',
+             type=TydiQADataset,
+             path='opencompass/tydiqa',
+             lang=_lang,
+             reader_cfg=tydiqa_reader_cfg,
+             infer_cfg=tydiqa_infer_cfg,
+             eval_cfg=tydiqa_eval_cfg
+        )
+    )
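Note (illustrative, not part of the patch): for the English branch, the f-string template above renders to the literal prompt below; generation happens at the {answer} slot, and the other ten languages follow the same shape with their own prefixes:

english_template = (
    'Answer the following question based on the information in the given passage.\n'
    '\n'
    'Passage:{passage_text}\n'
    'Question: {question_text}\n'
    'Answer: {answer}'
)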
diff --git a/opencompass/configs/datasets/winograd/winograd_ppl.py b/opencompass/configs/datasets/winograd/winograd_ppl.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b311eaf87bd1ac67d371e427fee236c992b7cee
--- /dev/null
+++ b/opencompass/configs/datasets/winograd/winograd_ppl.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .winograd_ppl_b6c7ed import winograd_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/winograd/winograd_ppl_8f3049.py b/opencompass/configs/datasets/winograd/winograd_ppl_8f3049.py
new file mode 100644
index 0000000000000000000000000000000000000000..107761c0dd18e23be7f01d3657ec3e28db84b5f6
--- /dev/null
+++ b/opencompass/configs/datasets/winograd/winograd_ppl_8f3049.py
@@ -0,0 +1,37 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import WinogradDataset
+
+winograd_reader_cfg = dict(
+    input_columns=['prompt', 'pronoun', 'opt1', 'opt2'],
+    output_column='label',
+    train_split='test',
+    test_split='test')
+
+winograd_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0:
+            "{prompt} Q: In the previous text, what does '{pronoun}' refer to? A: {opt1}.",  # noqa
+            1:
+            "{prompt} Q: In the previous text, what does '{pronoun}' refer to? A: {opt2}.",  # noqa
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+winograd_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
+
+winograd_datasets = [
+    dict(
+        abbr='winograd',
+        type=WinogradDataset,
+        path='winograd_wsc',
+        trust_remote_code=True,
+        name='wsc285',
+        reader_cfg=winograd_reader_cfg,
+        infer_cfg=winograd_infer_cfg,
+        eval_cfg=winograd_eval_cfg)
+]
diff --git a/opencompass/configs/datasets/winograd/winograd_ppl_b6c7ed.py b/opencompass/configs/datasets/winograd/winograd_ppl_b6c7ed.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c1f6b5ea11a8bad6277fe9376a988f7a2ae8702
--- /dev/null
+++ b/opencompass/configs/datasets/winograd/winograd_ppl_b6c7ed.py
@@ -0,0 +1,41 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import WinogradDataset
+
+winograd_reader_cfg = dict(
+    input_columns=['prompt', 'pronoun', 'opt1', 'opt2'],
+    output_column='label',
+    train_split='test',
+    test_split='test')
+
+winograd_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            i: dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f"{{prompt}} Q: In the previous text, what does '{{pronoun}}' refer to? A: {{opt{i+1}}}"
+                ),  # noqa
+            ])
+            for i in range(2)
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+winograd_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
+
+winograd_datasets = [
+    dict(
+        abbr='winograd',
+        type=WinogradDataset,
+        path='winograd_wsc',
+        trust_remote_code=True,
+        name='wsc285',
+        reader_cfg=winograd_reader_cfg,
+        infer_cfg=winograd_infer_cfg,
+        eval_cfg=winograd_eval_cfg)
+]
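Note (illustrative, not part of the patch): the dict comprehension above builds one PPL candidate per option; written out literally it is equivalent to the mapping below, and the candidate with the lower perplexity becomes the predicted label that AccEvaluator compares against `label`:

expanded_template = {
    0: dict(round=[
        dict(role='HUMAN',
             prompt="{prompt} Q: In the previous text, what does '{pronoun}' refer to? A: {opt1}"),
    ]),
    1: dict(round=[
        dict(role='HUMAN',
             prompt="{prompt} Q: In the previous text, what does '{pronoun}' refer to? A: {opt2}"),
    ]),
}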