Commit 60628c7 by tuandunghcmut (verified) · 1 Parent(s): ea1cac4

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list.
Files changed (50)
  1. opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py +4 -0
  2. opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen_326684.py +59 -0
  3. opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl.py +4 -0
  4. opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl_d2333a.py +51 -0
  5. opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen.py +4 -0
  6. opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen_3523e0.py +53 -0
  7. opencompass/configs/datasets/MMLUArabic/README.md +26 -0
  8. opencompass/configs/datasets/QuALITY/QuALITY_gen.py +4 -0
  9. opencompass/configs/datasets/QuALITY/QuALITY_gen_c407cb.py +38 -0
  10. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py +55 -0
  11. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py +47 -0
  12. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_ppl.py +47 -0
  13. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py +4 -0
  14. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py +41 -0
  15. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py +4 -0
  16. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py +43 -0
  17. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py +43 -0
  18. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py +45 -0
  19. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py +45 -0
  20. opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py +34 -0
  21. opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py +4 -0
  22. opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py +44 -0
  23. opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl.py +4 -0
  24. opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_0143fe.py +62 -0
  25. opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_11c175.py +33 -0
  26. opencompass/configs/datasets/apps/README.md +43 -0
  27. opencompass/configs/datasets/apps/apps_gen.py +4 -0
  28. opencompass/configs/datasets/apps/apps_gen_c7893a.py +28 -0
  29. opencompass/configs/datasets/apps/apps_mini_gen.py +4 -0
  30. opencompass/configs/datasets/apps/apps_mini_gen_c7893a.py +28 -0
  31. opencompass/configs/datasets/apps/deprecated_apps_gen_5b4254.py +33 -0
  32. opencompass/configs/datasets/apps/deprecated_apps_gen_7fbb95.py +40 -0
  33. opencompass/configs/datasets/apps/deprecated_apps_gen_b4dee3.py +30 -0
  34. opencompass/configs/datasets/gsm8k_contamination/gsm8k_contamination_ppl_ecdd22.py +57 -0
  35. opencompass/configs/datasets/lcsts/lcsts_gen.py +4 -0
  36. opencompass/configs/datasets/lcsts/lcsts_gen_8ee1fe.py +32 -0
  37. opencompass/configs/datasets/lcsts/lcsts_gen_9b0b89.py +28 -0
  38. opencompass/configs/datasets/leval/leval.py +23 -0
  39. opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py +90 -0
  40. opencompass/configs/datasets/math/math_0shot_gen_393424.py +35 -0
  41. opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py +30 -0
  42. opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py +30 -0
  43. opencompass/configs/datasets/math/math_4shot_example_from_google_research.py +40 -0
  44. opencompass/configs/datasets/math/math_agent_gen_861b4f.py +90 -0
  45. opencompass/configs/datasets/math/math_agent_gen_af2293.py +103 -0
  46. opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py +38 -0
  47. opencompass/configs/datasets/math/math_gen_1ed9c2.py +36 -0
  48. opencompass/configs/datasets/math/math_gen_559593.py +53 -0
  49. opencompass/configs/datasets/math/math_gen_5e8458.py +53 -0
  50. opencompass/configs/datasets/math/math_gen_78ced2.py +37 -0
opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .MMLUArabic_gen_326684 import MMLUArabic_datasets  # noqa: F401, F403
opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen_326684.py ADDED
@@ -0,0 +1,59 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import FixKRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import MMLUArabicDataset
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+ # None of the MMLUArabic datasets on HuggingFace are parsed correctly, so we use our own dataset reader.
+ # Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
+
+ MMLUArabic_reader_cfg = dict(
+     input_columns=['input', 'A', 'B', 'C', 'D'],
+     output_column='target',
+     train_split='dev')
+
+ MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+ MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
+
+ MMLUArabic_datasets = []
+ for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
+     _system = f"فيما يلي أسئلة الاختيار من متعدد (مع الإجابات) حول {' '.join(_name_ar.split('_'))}"
+     _hint = '\n{input}'
+     MMLUArabic_infer_cfg = dict(
+         ice_template=dict(
+             type=PromptTemplate,
+             template=dict(
+                 begin=[
+                     dict(role='SYSTEM', fallback_role='HUMAN', prompt=_system),
+                     '</E>',
+                 ],
+                 round=[
+                     dict(
+                         role='HUMAN',
+                         prompt=_hint.format(input='سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}')
+                     ),
+                     dict(role='BOT', prompt='إجابة: {target}')
+                 ]),
+             ice_token='</E>',
+         ),
+         retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+         inferencer=dict(type=GenInferencer),
+     )
+
+     MMLUArabic_eval_cfg = dict(
+         evaluator=dict(type=AccEvaluator),
+         pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+     MMLUArabic_datasets.append(
+         dict(
+             abbr=f'acegpt_MMLUArabic_{_name}',
+             type=MMLUArabicDataset,
+             path='./data/MMLUArabic/',
+             name=_name,
+             reader_cfg=MMLUArabic_reader_cfg,
+             infer_cfg=MMLUArabic_infer_cfg,
+             eval_cfg=MMLUArabic_eval_cfg,
+         ))
+
+ del _name, _name_ar, _system, _hint
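The assembled few-shot prompt never appears literally in this config: at run time, FixKRetriever renders dev items 0-4 through the ice_template (question plus gold answer) and splices them in at the `</E>` ice_token ahead of the test question. Below is a minimal sketch of that assembly with hypothetical data; it illustrates the idea only and is not OpenCompass's actual prompt-building code.

```python
# Sketch of 5-shot prompt assembly (hypothetical data, simplified logic).
dev_examples = [
    {'input': '...', 'A': '...', 'B': '...', 'C': '...', 'D': '...', 'target': 'A'},
    # ... dev examples 1-4 ...
]

def render_ice(ex):
    # Mirrors the HUMAN/BOT round of the ice_template above.
    q = 'سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'.format(**ex)
    return '\n{}\nإجابة: {}'.format(q, ex['target'])

ice = ''.join(render_ice(ex) for ex in dev_examples)  # replaces the '</E>' token
test_item = {'input': '...', 'A': '...', 'B': '...', 'C': '...', 'D': '...'}
prompt = ice + '\nسؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nإجابة:'.format(**test_item)
```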
opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .MMLUArabic_ppl_d2333a import MMLUArabic_datasets  # noqa: F401, F403
opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl_d2333a.py ADDED
@@ -0,0 +1,51 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import FixKRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import MMLUArabicDataset
+
+ # None of the MMLUArabic datasets on HuggingFace are parsed correctly, so we use our own dataset reader.
+ # Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
+
+ MMLUArabic_reader_cfg = dict(
+     input_columns=['input', 'A', 'B', 'C', 'D'],
+     output_column='target',
+     train_split='dev')
+
+ MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+ MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
+
+ MMLUArabic_datasets = []
+ for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
+     # _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
+     _hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n"
+     # question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
+     question_overall = 'سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
+     MMLUArabic_infer_cfg = dict(
+         ice_template=dict(
+             type=PromptTemplate,
+             template={opt: f'{question_overall}\nإجابة: {opt}\n' for opt in ['A', 'B', 'C', 'D']},
+         ),
+         prompt_template=dict(
+             type=PromptTemplate,
+             template={opt: f'{_hint}</E>{question_overall}\nإجابة: {opt}' for opt in ['A', 'B', 'C', 'D']},
+             ice_token='</E>',
+         ),
+         retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+         inferencer=dict(type=PPLInferencer),
+     )
+
+     MMLUArabic_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+     MMLUArabic_datasets.append(
+         dict(
+             abbr=f'acegpt_MMLUArabic_{_name}',
+             type=MMLUArabicDataset,
+             path='./data/MMLUArabic/',
+             name=_name,
+             reader_cfg=MMLUArabic_reader_cfg,
+             infer_cfg=MMLUArabic_infer_cfg,
+             eval_cfg=MMLUArabic_eval_cfg,
+         ))
+
+ del _name, _name_ar, _hint, question_overall
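Unlike the gen config, this PPL config gives PromptTemplate one template per option letter; the inferencer scores each filled-in candidate under the model and predicts the option whose continuation is most likely. A rough, self-contained illustration of that scoring rule using the standard Hugging Face API (an analogy for, not the code of, OpenCompass's PPLInferencer):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')             # placeholder model
model = AutoModelForCausalLM.from_pretrained('gpt2').eval()

def avg_nll(text: str) -> float:
    # Mean per-token negative log-likelihood; lower means more likely.
    ids = tok(text, return_tensors='pt').input_ids
    with torch.no_grad():
        return model(ids, labels=ids).loss.item()

question = 'سؤال: ...\nA. ...\nB. ...\nC. ...\nD. ...'  # hypothetical item
candidates = {opt: f'{question}\nإجابة: {opt}' for opt in 'ABCD'}
prediction = min(candidates, key=lambda opt: avg_nll(candidates[opt]))
```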
opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .MMLUArabic_zero_shot_gen_3523e0 import MMLUArabic_datasets  # noqa: F401, F403
opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen_3523e0.py ADDED
@@ -0,0 +1,53 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import MMLUArabicDataset
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+ # None of the MMLUArabic datasets on HuggingFace are parsed correctly, so we use our own dataset reader.
+ # Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
+
+ MMLUArabic_reader_cfg = dict(
+     input_columns=['input', 'A', 'B', 'C', 'D'],
+     output_column='target',
+     train_split='dev')
+
+ MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+ MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
+
+ MMLUArabic_datasets = []
+ for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
+     _hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n" + '{input}\n' + "من فضلك اختر إجابة واحدة من بين 'A، B، C، D' دون شرح."
+     MMLUArabic_infer_cfg = dict(
+         ice_template=dict(
+             type=PromptTemplate,
+             template=dict(
+                 round=[
+                     dict(
+                         role='HUMAN',
+                         prompt=_hint.format(input='سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}')
+                     ),
+                 ]),
+             ice_token='</E>',
+         ),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer),
+     )
+
+     MMLUArabic_eval_cfg = dict(
+         evaluator=dict(type=AccEvaluator),
+         pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+     MMLUArabic_datasets.append(
+         dict(
+             abbr=f'acegpt_MMLUArabic_{_name}',
+             type=MMLUArabicDataset,
+             path='./data/MMLUArabic/',
+             name=_name,
+             reader_cfg=MMLUArabic_reader_cfg,
+             infer_cfg=MMLUArabic_infer_cfg,
+             eval_cfg=MMLUArabic_eval_cfg,
+         ))
+
+ del _name, _name_ar, _hint
opencompass/configs/datasets/MMLUArabic/README.md ADDED
@@ -0,0 +1,26 @@
+ # MMLUArabic
+ ## Dataset Description
+ MMLUArabic is a benchmark for assessing knowledge in Arabic. It consists of multiple-choice questions covering a wide range of topics and branches of knowledge.
+
+
+ ## How to Use
+ Download the dataset from the [AceGPT repository](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic).
+
+ ```python
+ val_ds = load_dataset("MMLUArabic", header=None)['validation']
+ test_ds = load_dataset("MMLUArabic", header=None)['test']
+ # input, option_a, option_b, option_c, option_d, target
+ print(next(iter(val_ds)))
+ ```
+
+ ## Citation
+ ```
+ @misc{huang2023acegpt,
+     title={AceGPT, Localizing Large Language Models in Arabic},
+     author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
+     year={2023},
+     eprint={2309.12053},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL}
+ }
+ ```
opencompass/configs/datasets/QuALITY/QuALITY_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .QuALITY_gen_c407cb import QuALITY_datasets  # noqa: F401, F403
opencompass/configs/datasets/QuALITY/QuALITY_gen_c407cb.py ADDED
@@ -0,0 +1,38 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import QuALITYDataset, QuALITYEvaluator
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+ QuALITY_reader_cfg = dict(
+     input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
+     output_column='gold_label',
+ )
+
+ QuALITY_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(round=[
+             dict(
+                 role='HUMAN',
+                 prompt=
+                 'Read the article, and answer the question.\n\nArticle:\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
+             ),
+         ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer))
+
+ QuALITY_eval_cfg = dict(
+     evaluator=dict(type=QuALITYEvaluator),
+     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
+     pred_role='BOT')
+
+ QuALITY_datasets = [
+     dict(
+         abbr='QuALITY',
+         type=QuALITYDataset,
+         path='./data/QuALITY/QuALITY.v1.0.1.htmlstripped.dev',
+         reader_cfg=QuALITY_reader_cfg,
+         infer_cfg=QuALITY_infer_cfg,
+         eval_cfg=QuALITY_eval_cfg),
+ ]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py ADDED
@@ -0,0 +1,55 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BoolQDatasetV2
+ from opencompass.utils.text_postprocessors import (
+     first_option_postprocess,
+ )
+
+ QUERY_TEMPLATE = """
+ Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
+
+ Passage: {passage}
+
+ Question: {question}
+
+ A. Yes
+ B. No
+
+ """.strip()
+
+ BoolQ_reader_cfg = dict(
+     input_columns=['question', 'passage'],
+     output_column='label',
+ )
+
+ BoolQ_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+             ]
+         ),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer),
+ )
+
+ BoolQ_eval_cfg = dict(
+     evaluator=dict(type=AccEvaluator),
+     pred_role='BOT',
+     pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
+ )
+
+ BoolQ_datasets = [
+     dict(
+         abbr='BoolQ',
+         type=BoolQDatasetV2,
+         path='opencompass/boolq',
+         reader_cfg=BoolQ_reader_cfg,
+         infer_cfg=BoolQ_infer_cfg,
+         eval_cfg=BoolQ_eval_cfg,
+     )
+ ]
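Because the CoT answer is free-form, the config relies on `first_option_postprocess(options='AB')` to pull a letter out of the model's response before accuracy is computed. A simplified stand-in for that behavior (illustrative only, not the actual OpenCompass implementation) looks like this:

```python
import re

def first_option(text: str, options: str = 'AB') -> str:
    # Keep the first standalone option letter, so 'ANSWER: B' and
    # 'The answer is (A)' both resolve to a single letter.
    m = re.search(rf'\b([{options}])\b', text)
    return m.group(1) if m else ''

assert first_option('Let us think step by step... ANSWER: B') == 'B'
```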
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py ADDED
@@ -0,0 +1,47 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BoolQDatasetV2
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+ BoolQ_reader_cfg = dict(
+     input_columns=['question', 'passage'],
+     output_column='label',
+ )
+
+ BoolQ_infer_cfg = dict(
+     ice_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             begin='</E>',
+             round=[
+                 dict(
+                     role='HUMAN',
+                     prompt='{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:',
+                 ),
+                 dict(role='BOT', prompt='{label}'),
+             ],
+         ),
+         ice_token='</E>',
+     ),
+     retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
+     inferencer=dict(type=GenInferencer, max_out_len=50),
+ )
+
+ BoolQ_eval_cfg = dict(
+     evaluator=dict(type=AccEvaluator),
+     pred_role='BOT',
+     pred_postprocessor=dict(type=first_capital_postprocess),
+ )
+
+ BoolQ_datasets = [
+     dict(
+         abbr='BoolQ',
+         type=BoolQDatasetV2,
+         path='opencompass/boolq',
+         reader_cfg=BoolQ_reader_cfg,
+         infer_cfg=BoolQ_infer_cfg,
+         eval_cfg=BoolQ_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_ppl.py ADDED
@@ -0,0 +1,47 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BoolQDatasetV2
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+
+ BoolQ_reader_cfg = dict(
+     input_columns=['question', 'passage'],
+     output_column='label',
+ )
+
+ BoolQ_infer_cfg = dict(
+     ice_template=dict(
+         type=PromptTemplate,
+         template={
+             'B': dict(
+                 round=[
+                     dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                     dict(role='BOT', prompt='No'),
+                 ]
+             ),
+             'A': dict(
+                 round=[
+                     dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                     dict(role='BOT', prompt='Yes'),
+                 ]
+             ),
+         },
+         ice_token='</E>',
+     ),
+     retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
+     inferencer=dict(type=PPLInferencer, max_out_len=50),
+ )
+
+ BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ BoolQ_datasets = [
+     dict(
+         abbr='BoolQ',
+         type=BoolQDatasetV2,
+         path='opencompass/boolq',
+         reader_cfg=BoolQ_reader_cfg,
+         infer_cfg=BoolQ_infer_cfg,
+         eval_cfg=BoolQ_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py ADDED
@@ -0,0 +1,41 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BoolQDatasetV2
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+ BoolQ_reader_cfg = dict(
+     input_columns=['question', 'passage'],
+     output_column='label',
+ )
+
+ BoolQ_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(round=[
+             dict(
+                 role='HUMAN',
+                 prompt='{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:'),
+         ]),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer),
+ )
+
+ BoolQ_eval_cfg = dict(
+     evaluator=dict(type=AccEvaluator),
+     pred_role='BOT',
+     pred_postprocessor=dict(type=first_capital_postprocess),
+ )
+
+ BoolQ_datasets = [
+     dict(
+         abbr='BoolQ',
+         type=BoolQDatasetV2,
+         path='opencompass/boolq',
+         reader_cfg=BoolQ_reader_cfg,
+         infer_cfg=BoolQ_infer_cfg,
+         eval_cfg=BoolQ_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .SuperGLUE_BoolQ_ppl_314b96 import BoolQ_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py ADDED
@@ -0,0 +1,43 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BoolQDatasetV2
+
+ BoolQ_reader_cfg = dict(
+     input_columns=['question', 'passage'],
+     output_column='label',
+ )
+
+ BoolQ_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             'A':
+             dict(round=[
+                 dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                 dict(role='BOT', prompt='Yes'),
+             ]),
+             'B':
+             dict(round=[
+                 dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                 dict(role='BOT', prompt='No'),
+             ]),
+         },
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer),
+ )
+
+ BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ BoolQ_datasets = [
+     dict(
+         abbr='BoolQ',
+         type=BoolQDatasetV2,
+         path='opencompass/boolq',
+         reader_cfg=BoolQ_reader_cfg,
+         infer_cfg=BoolQ_infer_cfg,
+         eval_cfg=BoolQ_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py ADDED
@@ -0,0 +1,43 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BoolQDatasetV3
+
+ BoolQ_reader_cfg = dict(
+     input_columns=['question', 'passage'],
+     output_column='label',
+     test_split='train')
+
+ BoolQ_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             'false':
+             dict(round=[
+                 dict(role='HUMAN', prompt='Passage: {passage}\nQuestion: {question}?'),
+                 dict(role='BOT', prompt='Answer: No'),
+             ]),
+             'true':
+             dict(round=[
+                 dict(role='HUMAN', prompt='Passage: {passage}\nQuestion: {question}?'),
+                 dict(role='BOT', prompt='Answer: Yes'),
+             ]),
+         },
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer),
+ )
+
+ BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ BoolQ_datasets = [
+     dict(
+         abbr='BoolQ',
+         type=BoolQDatasetV3,
+         path='opencompass/boolq',
+         reader_cfg=BoolQ_reader_cfg,
+         infer_cfg=BoolQ_infer_cfg,
+         eval_cfg=BoolQ_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py ADDED
@@ -0,0 +1,45 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BoolQDataset
+
+ BoolQ_reader_cfg = dict(
+     input_columns=['question', 'passage'],
+     output_column='answer',
+     test_split='train')
+
+ BoolQ_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             0:
+             dict(round=[
+                 dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                 dict(role='BOT', prompt='No'),
+             ]),
+             1:
+             dict(round=[
+                 dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                 dict(role='BOT', prompt='Yes'),
+             ]),
+         },
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer),
+ )
+
+ BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ BoolQ_datasets = [
+     dict(
+         type=BoolQDataset,
+         abbr='BoolQ',
+         path='json',
+         data_files='opencompass/boolq',
+         split='train',
+         reader_cfg=BoolQ_reader_cfg,
+         infer_cfg=BoolQ_infer_cfg,
+         eval_cfg=BoolQ_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py ADDED
@@ -0,0 +1,45 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BoolQDataset
+
+ BoolQ_reader_cfg = dict(
+     input_columns=['question', 'passage'],
+     output_column='answer',
+     test_split='train')
+
+ BoolQ_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             0:
+             dict(round=[
+                 dict(role='HUMAN', prompt='{passage}\nQuestion: {question}'),
+                 dict(role='BOT', prompt='No.'),
+             ]),
+             1:
+             dict(round=[
+                 dict(role='HUMAN', prompt='{passage}\nQuestion: {question}'),
+                 dict(role='BOT', prompt='Yes.'),
+             ]),
+         },
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer),
+ )
+
+ BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ BoolQ_datasets = [
+     dict(
+         type=BoolQDataset,
+         abbr='BoolQ',
+         path='json',
+         data_files='opencompass/boolq',
+         split='train',
+         reader_cfg=BoolQ_reader_cfg,
+         infer_cfg=BoolQ_infer_cfg,
+         eval_cfg=BoolQ_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py ADDED
@@ -0,0 +1,34 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BoolQDataset
+
+ BoolQ_reader_cfg = dict(
+     input_columns=['question', 'passage'],
+     output_column='answer',
+     test_split='train')
+
+ BoolQ_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             0: 'Passage:{passage}。\nQuestion:{question}。\nAnswer: No.',
+             1: 'Passage:{passage}。\nQuestion:{question}。\nAnswer: Yes.',
+         }),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer))
+
+ BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ BoolQ_datasets = [
+     dict(
+         type=BoolQDataset,
+         abbr='BoolQ',
+         path='json',
+         data_files='opencompass/boolq',
+         split='train',
+         reader_cfg=BoolQ_reader_cfg,
+         infer_cfg=BoolQ_infer_cfg,
+         eval_cfg=BoolQ_eval_cfg)
+ ]
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .SuperGLUE_CB_gen_854c6c import CB_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py ADDED
@@ -0,0 +1,44 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import CBDatasetV2
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+ CB_reader_cfg = dict(
+     input_columns=['premise', 'hypothesis'],
+     output_column='label',
+ )
+
+ CB_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=
+                     '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?\nA. Contradiction\nB. Entailment\nC. Neutral\nAnswer:'
+                 ),
+             ]),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer),
+ )
+
+ CB_eval_cfg = dict(
+     evaluator=dict(type=AccEvaluator),
+     pred_role='BOT',
+     pred_postprocessor=dict(type=first_option_postprocess, options='ABC'),
+ )
+
+ CB_datasets = [
+     dict(
+         abbr='CB',
+         type=CBDatasetV2,
+         path='./data/SuperGLUE/CB/val.jsonl',
+         reader_cfg=CB_reader_cfg,
+         infer_cfg=CB_infer_cfg,
+         eval_cfg=CB_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .SuperGLUE_CB_ppl_0143fe import CB_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_0143fe.py ADDED
@@ -0,0 +1,62 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import HFDataset
+
+ CB_reader_cfg = dict(
+     input_columns=['premise', 'hypothesis'],
+     output_column='label',
+ )
+
+ CB_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             'contradiction':
+             dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=
+                     '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?'
+                 ),
+                 dict(role='BOT', prompt='Contradiction'),
+             ]),
+             'entailment':
+             dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=
+                     '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?'
+                 ),
+                 dict(role='BOT', prompt='Entailment'),
+             ]),
+             'neutral':
+             dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=
+                     '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?'
+                 ),
+                 dict(role='BOT', prompt='Neutral'),
+             ]),
+         },
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer),
+ )
+
+ CB_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ CB_datasets = [
+     dict(
+         type=HFDataset,
+         abbr='CB',
+         path='json',
+         split='train',
+         data_files='./data/SuperGLUE/CB/val.jsonl',
+         reader_cfg=CB_reader_cfg,
+         infer_cfg=CB_infer_cfg,
+         eval_cfg=CB_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_11c175.py ADDED
@@ -0,0 +1,33 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import HFDataset
+
+ CB_reader_cfg = dict(
+     input_columns=['premise', 'hypothesis'], output_column='label')
+
+ CB_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             'contradiction': '{premise}?contradiction, {hypothesis}',
+             'entailment': '{premise}?entailment, {hypothesis}',
+             'neutral': '{premise}?neutral, {hypothesis}'
+         }),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer))
+
+ CB_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ CB_datasets = [
+     dict(
+         type=HFDataset,
+         abbr='CB',
+         path='json',
+         split='train',
+         data_files='./data/SuperGLUE/CB/val.jsonl',
+         reader_cfg=CB_reader_cfg,
+         infer_cfg=CB_infer_cfg,
+         eval_cfg=CB_eval_cfg)
+ ]
opencompass/configs/datasets/apps/README.md ADDED
@@ -0,0 +1,43 @@
+ # APPS
+ ## Dataset Description
+ APPS is a code-generation benchmark with 10,000 problems. It can be used to evaluate the ability of language models to generate code from natural-language specifications.
+
+ ## Dataset Structure
+ ```python
+ DatasetDict({
+     train: Dataset({
+         features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
+         num_rows: 5000
+     })
+     test: Dataset({
+         features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
+         num_rows: 5000
+     })
+ })
+ ```
+ We also offer an apps_mini subset, which includes 1,500 questions drawn evenly from the introductory, interview, and competition categories (500 questions each).
+
+ ## How to Use
+ You can also filter the dataset by difficulty level: introductory, interview, or competition. Just pass a list of difficulty levels to the loader; for example, to get the most challenging questions, select the competition level:
+ ```python
+ ds = load_dataset("codeparrot/apps", split="train", difficulties=["competition"])
+ print(next(iter(ds))["question"])
+ ```
+ ## Evaluation results
+
+
+ | dataset   | metric | CodeLlama-7b-Python | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-sft-hf |
+ |-----------|--------|---------------------|----------------------------|--------------------------|---------------------------|
+ | apps_mini | pass@1 | 1.3                 | 0.7                        | 7.1                      | 9.3                       |
+
+ Please refer to Table 3 of the [Code Llama paper](https://scontent-nrt1-2.xx.fbcdn.net/v/t39.2365-6/369856151_1754812304950972_1159666448927483931_n.pdf?_nc_cat=107&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=TxT1PKkNBZoAX8zMHbm&_nc_ht=scontent-nrt1-2.xx&oh=00_AfDmmQAPzqX1-QOKIDUV5lGKzaZqt0CZUVtxFjHtnh6ycQ&oe=65F5AF8F) for the original results if needed.
+
+ ## Citation
+ ```
+ @article{hendrycksapps2021,
+     title={Measuring Coding Challenge Competence With APPS},
+     author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
+     journal={NeurIPS},
+     year={2021}
+ }
+ ```
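For reference, pass@1 in the table above is the fraction of problems solved by a single sampled program. When more than one sample per problem is drawn, the standard unbiased pass@k estimator from the HumanEval paper (Chen et al., 2021) applies; a direct transcription of that formula:

```python
import math

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: n samples per problem, c of which pass all tests."""
    if n - c < k:
        return 1.0
    # 1 - C(n-c, k) / C(n, k), computed as a stable product.
    return 1.0 - math.prod(1.0 - k / i for i in range(n - c + 1, n + 1))

print(pass_at_k(n=20, c=3, k=1))  # for k=1 this equals c/n = 0.15
```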
opencompass/configs/datasets/apps/apps_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .apps_gen_c7893a import APPS_datasets  # noqa: F401, F403
opencompass/configs/datasets/apps/apps_gen_c7893a.py ADDED
@@ -0,0 +1,28 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import APPSDataset, APPSEvaluator
+
+ APPS_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test')
+
+ APPS_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ APPS_eval_cfg = dict(evaluator=dict(type=APPSEvaluator), pred_role='BOT')
+
+ APPS_datasets = [
+     dict(
+         type=APPSDataset,
+         abbr='apps',
+         path='codeparrot/apps',
+         num_repeats=1,
+         reader_cfg=APPS_reader_cfg,
+         infer_cfg=APPS_infer_cfg,
+         eval_cfg=APPS_eval_cfg,
+     )
+ ]
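The prompt above asks the model to wrap its ANSWER in a ```python fence, so the evaluation step has to pull the program back out of free-form text before running the hidden tests. A minimal extraction helper in that spirit (illustrative only; APPSEvaluator's real logic may differ):

```python
import re

def extract_code(completion: str) -> str:
    # Prefer the first ```python fenced block; fall back to the raw text.
    m = re.search(r'```python\s*\n(.*?)```', completion, re.DOTALL)
    return m.group(1).strip() if m else completion.strip()

sample = "Here is my solution:\n```python\nprint(input()[::-1])\n```"
assert extract_code(sample) == 'print(input()[::-1])'
```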
opencompass/configs/datasets/apps/apps_mini_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .apps_mini_gen_c7893a import APPS_mini_datasets  # noqa: F401, F403
opencompass/configs/datasets/apps/apps_mini_gen_c7893a.py ADDED
@@ -0,0 +1,28 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import APPS_miniDataset, APPSEvaluator
+
+ APPS_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test')
+
+ APPS_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ APPS_eval_cfg = dict(evaluator=dict(type=APPSEvaluator), pred_role='BOT')
+
+ APPS_mini_datasets = [
+     dict(
+         type=APPS_miniDataset,
+         abbr='apps_mini',
+         path='./data/apps_mini',
+         num_repeats=1,
+         reader_cfg=APPS_reader_cfg,
+         infer_cfg=APPS_infer_cfg,
+         eval_cfg=APPS_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/apps/deprecated_apps_gen_5b4254.py ADDED
@@ -0,0 +1,33 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess
+
+ apps_reader_cfg = dict(
+     input_columns=['question'], output_column='problem_id', train_split='test')
+
+ # TODO: allow empty output-column
+ apps_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(round=[
+             dict(role='HUMAN', prompt='Write a python program:\n{question}'),
+         ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer))
+
+ apps_eval_cfg = dict(
+     evaluator=dict(type=HumanEvalEvaluator),
+     pred_role='BOT',
+     k=[1, 10, 100],  # the parameter only for humaneval
+     pred_postprocessor=dict(type=humaneval_postprocess),
+ )
+
+ apps_datasets = [
+     dict(
+         type=HFDataset,
+         path='codeparrot/apps',
+         reader_cfg=apps_reader_cfg,
+         infer_cfg=apps_infer_cfg,
+         eval_cfg=apps_eval_cfg)
+ ]
opencompass/configs/datasets/apps/deprecated_apps_gen_7fbb95.py ADDED
@@ -0,0 +1,40 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess
+
+ apps_reader_cfg = dict(
+     input_columns=['question'], output_column='problem_id', train_split='test')
+
+ # TODO: allow empty output-column
+ apps_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             begin=[
+                 dict(
+                     role='SYSTEM',
+                     fallback_role='HUMAN',
+                     prompt='Write a python program:'),
+             ],
+             round=[
+                 dict(role='HUMAN', prompt='{question}'),
+             ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer))
+
+ apps_eval_cfg = dict(
+     evaluator=dict(type=HumanEvalEvaluator),
+     pred_role='BOT',
+     k=[1, 10, 100],  # the parameter only for humaneval
+     pred_postprocessor=dict(type=humaneval_postprocess),
+ )
+
+ apps_datasets = [
+     dict(
+         type=HFDataset,
+         path='codeparrot/apps',
+         reader_cfg=apps_reader_cfg,
+         infer_cfg=apps_infer_cfg,
+         eval_cfg=apps_eval_cfg)
+ ]
opencompass/configs/datasets/apps/deprecated_apps_gen_b4dee3.py ADDED
@@ -0,0 +1,30 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess
+
+ apps_reader_cfg = dict(
+     input_columns=['question'], output_column='problem_id', train_split='test')
+
+ # TODO: allow empty output-column
+ apps_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template='{question}'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512))
+
+ apps_eval_cfg = dict(
+     evaluator=dict(type=HumanEvalEvaluator),
+     k=[1, 10, 100],
+     pred_postprocessor=dict(type=humaneval_postprocess),
+ )
+
+ apps_datasets = [
+     dict(
+         type=HFDataset,
+         path='codeparrot/apps',
+         reader_cfg=apps_reader_cfg,
+         infer_cfg=apps_infer_cfg,
+         eval_cfg=apps_eval_cfg)
+ ]
opencompass/configs/datasets/gsm8k_contamination/gsm8k_contamination_ppl_ecdd22.py ADDED
@@ -0,0 +1,57 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLOnlyInferencer
+ from opencompass.openicl.icl_evaluator import AveragePPLEvaluator
+ from opencompass.datasets import GSM8KDataset, GSM8KReferenceSkywork
+
+ gsm8k_datasets = []
+
+ gsm8k_infer_cfg = dict(
+     prompt_template=dict(type=PromptTemplate, template='{question} {answer}'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLOnlyInferencer),
+ )
+
+ gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
+
+ for split in ['train', 'test']:
+     gsm8k_reader_cfg = dict(
+         input_columns=['question', 'answer'],
+         output_column=None,
+         train_split=split,
+         test_split=split,
+     )
+     gsm8k_datasets.append(
+         dict(
+             abbr=f'gsm8k-{split}-ppl',
+             type=GSM8KDataset,
+             path='./data/gsm8k',
+             reader_cfg=gsm8k_reader_cfg,
+             infer_cfg=gsm8k_infer_cfg,
+             eval_cfg=gsm8k_eval_cfg)
+     )
+
+
+ gsm8k_infer_cfg = dict(
+     prompt_template=dict(type=PromptTemplate, template='{text}'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLOnlyInferencer),
+ )
+
+ gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
+
+ gsm8k_reader_cfg = dict(
+     input_columns=['text'],
+     output_column=None,
+ )
+
+ gsm8k_datasets.append(
+     dict(
+         abbr='gsm8k-ref-ppl',
+         type=GSM8KReferenceSkywork,
+         path='./data/gsm8k-extra/mock_gsm8k_test.jsonl',
+         reader_cfg=gsm8k_reader_cfg,
+         infer_cfg=gsm8k_infer_cfg,
+         eval_cfg=gsm8k_eval_cfg
+     )
+ )
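This config produces three average-perplexity numbers rather than an accuracy: one for the GSM8K train split, one for the test split, and one for a mock reference set of GSM8K-style problems (following the Skywork contamination protocol). The reading is comparative; the sketch below uses made-up numbers and an illustrative threshold, purely to show the interpretation pattern, which is not encoded anywhere in OpenCompass itself:

```python
# Hypothetical AveragePPL outputs; only the comparison pattern matters.
avg_ppl = {'gsm8k-train-ppl': 4.2, 'gsm8k-test-ppl': 4.3, 'gsm8k-ref-ppl': 6.1}

# A model that has memorized the benchmark shows much lower perplexity on
# the real train/test items than on fresh, same-style reference items.
gap = avg_ppl['gsm8k-ref-ppl'] - avg_ppl['gsm8k-test-ppl']
if gap > 1.0:  # threshold chosen for illustration only
    print('test-split perplexity is suspiciously low: possible contamination')
else:
    print('no obvious sign of memorization')
```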
opencompass/configs/datasets/lcsts/lcsts_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .lcsts_gen_8ee1fe import lcsts_datasets  # noqa: F401, F403
opencompass/configs/datasets/lcsts/lcsts_gen_8ee1fe.py ADDED
@@ -0,0 +1,32 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator
+ from opencompass.datasets import LCSTSDataset, lcsts_postprocess
+
+ lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst')
+
+ lcsts_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(round=[
+             dict(role='HUMAN', prompt='阅读以下文章,并给出简短的摘要:{content}\n摘要如下:'),
+         ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer))
+
+ lcsts_eval_cfg = dict(
+     evaluator=dict(type=JiebaRougeEvaluator),
+     pred_role='BOT',
+     pred_postprocessor=dict(type=lcsts_postprocess),
+ )
+
+ lcsts_datasets = [
+     dict(
+         type=LCSTSDataset,
+         abbr='lcsts',
+         path='opencompass/LCSTS',
+         reader_cfg=lcsts_reader_cfg,
+         infer_cfg=lcsts_infer_cfg,
+         eval_cfg=lcsts_eval_cfg)
+ ]
opencompass/configs/datasets/lcsts/lcsts_gen_9b0b89.py ADDED
@@ -0,0 +1,28 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator
+ from opencompass.datasets import LCSTSDataset, lcsts_postprocess
+
+ lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst')
+
+ lcsts_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate, template='阅读文章:{content}\n根据上文,给出简短的单个摘要:'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer))
+
+ lcsts_eval_cfg = dict(
+     evaluator=dict(type=JiebaRougeEvaluator),
+     pred_postprocessor=dict(type=lcsts_postprocess),
+ )
+
+ lcsts_datasets = [
+     dict(
+         type=LCSTSDataset,
+         abbr='lcsts',
+         path='opencompass/LCSTS',
+         reader_cfg=lcsts_reader_cfg,
+         infer_cfg=lcsts_infer_cfg,
+         eval_cfg=lcsts_eval_cfg)
+ ]
opencompass/configs/datasets/leval/leval.py ADDED
@@ -0,0 +1,23 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .levalnaturalquestion.leval_naturalquestion_gen import LEval_nq_datasets
+     from .levalnarrativeqa.leval_narrativeqa_gen import LEval_narrativeqa_datasets
+     from .levalmultidocqa.leval_multidocqa_gen import LEval_multidocqa_datasets
+     from .levalcoursera.leval_coursera_gen import LEval_coursera_datasets
+     from .levaltpo.leval_tpo_gen import LEval_tpo_datasets
+     from .levalquality.leval_quality_gen import LEval_quality_datasets
+     from .levalgsm100.leval_gsm100_gen import LEval_gsm100_datasets
+     from .levaltopicretrieval.leval_topic_retrieval_gen import LEval_tr_datasets
+     from .levalfinancialqa.leval_financialqa_gen import LEval_financialqa_datasets
+     from .levalgovreportsumm.leval_gov_report_summ_gen import LEval_govreport_summ_datasets
+     from .levallegalcontractqa.leval_legalcontractqa_gen import LEval_legalqa_datasets
+     from .levalmeetingsumm.leval_meetingsumm_gen import LEval_meetingsumm_datasets
+     from .levalnewssumm.leval_newssumm_gen import LEval_newssumm_datasets
+     from .levalpaperassistant.leval_paper_assistant_gen import LEval_ps_summ_datasets
+     from .levalpatentsumm.leval_patent_summ_gen import LEval_patent_summ_datasets
+     from .levaltvshowsumm.leval_tvshow_summ_gen import LEval_tvshow_summ_datasets
+     from .levalscientificqa.leval_scientificqa_gen import LEval_scientificqa_datasets
+     from .levalreviewsumm.leval_review_summ_gen import LEval_review_summ_datasets
+
+ leval_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
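The last line relies on a small idiom: `sum(iterable_of_lists, [])` concatenates lists, so every `*_datasets` variable pulled in through `read_base()` ends up in one flat list. The same mechanics in isolation, with toy data:

```python
# Toy demonstration of the sum(..., []) concatenation idiom used above.
groups = {
    'leval_a_datasets': [{'abbr': 'a'}],
    'leval_b_datasets': [{'abbr': 'b'}],
    'other_config': {'ignored': True},  # filtered out by the suffix check
}
flat = sum((v for k, v in groups.items() if k.endswith('_datasets')), [])
assert flat == [{'abbr': 'a'}, {'abbr': 'b'}]
```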
opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py ADDED
@@ -0,0 +1,90 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import AgentInferencer
+ from opencompass.datasets import (
+     MATHDataset, MATHAgentEvaluator, math_postprocess_v2
+ )
+ # uses the PAL format, but does not perform well
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 # # ################################### NEW SHOT ###################################
+                 dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify
+
+ def solution():
+     x = symbols('x')
+     expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
+     simplified_expr = simplify(expr)
+
+     x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
+     result = x3_coefficient
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:26'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math
+
+ def solution():
+     radius = 6
+
+     # Surface area of the hemisphere
+     hemisphere_area = 2 * math.pi * radius**2
+
+     # Area of the circular base
+     base_area = math.pi * radius**2
+
+     # Total surface area
+     total_surface_area = hemisphere_area + base_area
+
+     # Formatting the result in LaTeX
+     result = r'{}\pi'.format(total_surface_area / math.pi)
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:108.0\\pi'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():
+     # Probabilities of each outcome
+     prime_prob = 1 / 6
+     composite_prob = 1 / 3
+     otherwise_prob = 1 / 6
+
+     # Expected value of each outcome
+     prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
+     composite_expected_value = 0 * composite_prob
+     otherwise_expected_value = -3 * otherwise_prob
+
+     # Total expected value
+     total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value
+
+     # Dollar value to the nearest cent
+     result = "{:.2f}".format(total_expected_value)
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:1.17'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='{problem}'),
+             ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=AgentInferencer),
+ )
+
+ math_eval_cfg = dict(
+     evaluator=dict(
+         type=MATHAgentEvaluator,
+         version='v2'),
+     pred_postprocessor=dict(type=math_postprocess_v2))
+
+ math_datasets = [
+     dict(
+         abbr='math-agent',
+         type=MATHDataset,
+         path='./data/math/math.json',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/math/math_0shot_gen_393424.py ADDED
@@ -0,0 +1,35 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
+
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
+             ]
+         ),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=1024),
+ )
+
+ # postprocess v2
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2),
+ )
+
+ math_datasets = [
+     dict(
+         type=MATHDataset,
+         abbr='math',
+         path='opencompass/math',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg,
+     )
+ ]
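This zero-shot prompt asks the model to leave its final answer inside \boxed{...}, which math_postprocess_v2 then extracts. A hedged sketch of that extraction step (illustrative only; the real opencompass postprocessor handles more answer formats):

def extract_boxed(text: str) -> str:
    # Find the last \boxed{ and copy its contents, balancing nested braces.
    start = text.rfind('\\boxed{')
    if start == -1:
        return text.strip()
    i, depth, out = start + len('\\boxed{'), 1, []
    while i < len(text):
        c = text[i]
        if c == '{':
            depth += 1
        elif c == '}':
            depth -= 1
            if depth == 0:
                break
        out.append(c)
        i += 1
    return ''.join(out)

extract_boxed('so the answer is $\\boxed{\\frac{2}{3}}$')  # -> '\\frac{2}{3}'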
opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py ADDED
@@ -0,0 +1,30 @@
+ from mmengine.config import read_base
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2
+
+ with read_base():
+     from .math_4shot_example_from_google_research import prompt
+
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=2048, stopping_criteria=['Problem', '问题:']))
+
+ # postprocess v2
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHEvaluator, version='v2'),
+     pred_postprocessor=dict(type=math_postprocess_v2))
+
+ math_datasets = [
+     dict(
+         type=MATHDataset,
+         abbr='math',
+         path='opencompass/math',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg)
+ ]
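stopping_criteria makes the base model stop once it starts a new 'Problem' (or its Chinese counterpart), so the completion contains only the current solution rather than a self-invented next few-shot example. A small illustration of the intended truncation behavior (assumed here for clarity; the actual cutoff is applied by the inference backend):

def apply_stops(completion: str, stops=('Problem', '问题:')) -> str:
    # Truncate at the earliest occurrence of any stop string.
    cut = len(completion)
    for s in stops:
        idx = completion.find(s)
        if idx != -1:
            cut = min(cut, idx)
    return completion[:cut]

apply_stops('n = 16. Final Answer: 16\n\nProblem:\n...')  # -> 'n = 16. Final Answer: 16\n\n'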
opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py ADDED
@@ -0,0 +1,30 @@
+ from mmengine.config import read_base
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2
+
+ with read_base():
+     from .math_4shot_example_from_google_research import prompt
+
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=['Problem']))
+
+ # postprocess v2
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHEvaluator, version='v2'),
+     pred_postprocessor=dict(type=math_postprocess_v2))
+
+ math_datasets = [
+     dict(
+         type=MATHDataset,
+         abbr='math',
+         path='opencompass/math',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg)
+ ]
opencompass/configs/datasets/math/math_4shot_example_from_google_research.py ADDED
@@ -0,0 +1,40 @@
+ # Solving Quantitative Reasoning Problems with Language Models
+
+ prompt = '''
+ Problem:
+ Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.
+
+ Solution:
+ The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.
+ Final Answer: The final answer is $[2,5)$. I hope it is correct.
+
+ Problem:
+ If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$
+
+ Solution:
+ We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$
+ Final Answer: The final answer is $24$. I hope it is correct.
+
+ Problem:
+ Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
+
+ Solution:
+ If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:
+ \\begin{align*}
+ 30n&=480\\
+ \\Rightarrow\\qquad n&=480/30=\\boxed{16}
+ \\end{align*}
+ Final Answer: The final answer is $16$. I hope it is correct.
+
+ Problem:
+ If the system of equations
+ \\begin{align*}
+ 6x-4y&=a,\\
+ 6y-9x &=b.
+ \\end{align*}
+ has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.
+
+ Solution:
+ If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$
+ Final Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.
+ '''.strip()
opencompass/configs/datasets/math/math_agent_gen_861b4f.py ADDED
@@ -0,0 +1,90 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import AgentInferencer
+ from opencompass.datasets import (
+     MATHDataset, MATHAgentEvaluator, math_postprocess
+ )
+
+ # uses the PAL format but does not perform well
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 # ################################### NEW SHOT ###################################
+                 dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify
+
+ def solution():
+     x = symbols('x')
+     expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
+     simplified_expr = simplify(expr)
+
+     x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
+     result = x3_coefficient
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:26'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math
+
+ def solution():
+     radius = 6
+
+     # Surface area of the hemisphere
+     hemisphere_area = 2 * math.pi * radius**2
+
+     # Area of the circular base
+     base_area = math.pi * radius**2
+
+     # Total surface area
+     total_surface_area = hemisphere_area + base_area
+
+     # Formatting the result in LaTeX
+     result = r'{}\pi'.format(total_surface_area / math.pi)
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:108.0\\pi'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():
+     # Probabilities of each outcome
+     prime_prob = 1 / 6
+     composite_prob = 1 / 3
+     otherwise_prob = 1 / 6
+
+     # Expected value of each outcome
+     prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
+     composite_expected_value = 0 * composite_prob
+     otherwise_expected_value = -3 * otherwise_prob
+
+     # Total expected value
+     total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value
+
+     # Dollar value to the nearest cent
+     result = "{:.2f}".format(total_expected_value)
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:1.17'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='{problem}'),
+             ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=AgentInferencer),
+ )
+
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHAgentEvaluator),
+     pred_postprocessor=dict(type=math_postprocess),
+ )
+
+ math_datasets = [
+     dict(
+         abbr='math-agent',
+         type=MATHDataset,
+         path='opencompass/math',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg,
+     )
+ ]
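Each BOT shot above encodes a code-interpreter call as 'Tool:PythonInterpreter\nTool Input:<code>'. A hedged sketch of how such a turn splits into a tool name and its input (illustrative only; the actual parsing is handled by the agent framework driving AgentInferencer, not by this config):

def split_tool_call(bot_turn: str):
    # Everything before '\nTool Input:' names the tool; the rest is its input.
    header, _, code = bot_turn.partition('\nTool Input:')
    return header.removeprefix('Tool:'), code

tool, code = split_tool_call('Tool:PythonInterpreter\nTool Input:print(1 + 1)')
# tool == 'PythonInterpreter', code == 'print(1 + 1)'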
opencompass/configs/datasets/math/math_agent_gen_af2293.py ADDED
@@ -0,0 +1,103 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import AgentInferencer
+ from opencompass.datasets import (
+     MATHDataset, MATHAgentEvaluator, math_postprocess
+ )
+
+ # uses the PAL format but does not perform well
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 # ################################### NEW SHOT ###################################
+                 dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify
+
+ def solution():
+     x = symbols('x')
+     expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
+     simplified_expr = simplify(expr)
+
+     x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
+     result = x3_coefficient
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:26'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math
+
+ def solution():
+     radius = 6
+
+     # Surface area of the hemisphere
+     hemisphere_area = 2 * math.pi * radius**2
+
+     # Area of the circular base
+     base_area = math.pi * radius**2
+
+     # Total surface area
+     total_surface_area = hemisphere_area + base_area
+
+     # Formatting the result in LaTeX
+     result = r'{}\pi'.format(total_surface_area / math.pi)
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:108.0\\pi'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():
+     # Probabilities of each outcome
+     prime_prob = 1 / 6
+     composite_prob = 1 / 3
+     otherwise_prob = 1 / 6
+
+     # Expected value of each outcome
+     prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
+     composite_expected_value = 0 * composite_prob
+     otherwise_expected_value = -3 * otherwise_prob
+
+     # Total expected value
+     total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value
+
+     # Dollar value to the nearest cent
+     result = "{:.2f}".format(total_expected_value)
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:1.17'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='Given $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$'),
+                 dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import numpy as np
+
+ def solution():
+     a = np.array([-7, 0, 1])
+     b = np.array([4, 2, -1])
+
+     result = a - 3 * b
+
+     result = r'\\begin{{pmatrix}} {} \ {} \ {} \end{{pmatrix}}'.format(result[0], result[1], result[2])
+     return result"""),
+                 dict(role='SYSTEM', prompt='Response:\\begin{pmatrix} -19 \\ -6 \\ 4 \\end{pmatrix}'),
+                 dict(role='BOT', prompt='FinalAnswer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \\end{pmatrix}$. I hope it is correct.'),
+                 dict(role='HUMAN', prompt='{problem}'),
+             ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=AgentInferencer),
+ )
+
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHAgentEvaluator),
+     pred_postprocessor=dict(type=math_postprocess),
+ )
+
+ math_datasets = [
+     dict(
+         abbr='math-agent',
+         type=MATHDataset,
+         path='opencompass/math',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py ADDED
@@ -0,0 +1,38 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2
+
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(round=[
+             dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'),
+             dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'),
+             dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'),
+             dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'),
+             dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'),
+             dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'),
+             dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'),
+             dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'),
+             dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'),
+         ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=1024))
+
+ # postprocess v2
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHEvaluator, version='v2'),
+     pred_postprocessor=dict(type=math_postprocess_v2))
+
+ math_datasets = [
+     dict(
+         type=MATHDataset,
+         abbr='math',
+         path='opencompass/math',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg)
+ ]
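The doubled braces in these prompts ({{ and }}) escape literal LaTeX braces so that only named fields such as {problem} are substituted into the template. A quick standalone demonstration, assuming plain str.format-style semantics for illustration:

# '{{' and '}}' render as literal braces; '{problem}' is a named field.
template = 'The domain is $\\boxed{{[2,5)}}$. Problem: {problem}'
template.format(problem='find the domain')
# -> 'The domain is $\boxed{[2,5)}$. Problem: find the domain'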
opencompass/configs/datasets/math/math_gen_1ed9c2.py ADDED
@@ -0,0 +1,36 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess
+
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(round=[
+             dict(role='HUMAN', prompt='Problem:\nFind the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.\nSolution:'),
+             dict(role='BOT', prompt='Combine like terms to simplify the expression. The coefficient of $x^3$ is calculated as $3\cdot(-1)+3\cdot(2+1)+(-5)\cdot(-4) = 26$. Thus, the coefficient of $x^3$ is $\\boxed{26}$.\nFinal Answer: The final answer is $26$. I hope it is correct.\n'),
+             dict(role='HUMAN', prompt='Problem:\nThe surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.\nSolution:'),
+             dict(role='BOT', prompt='The surface area of a hemisphere (not including the base) is half that of a sphere, so it is $2\pi r^2$. The area of the base is $\pi r^2$. Therefore, for a hemisphere with radius 6 cm, the total surface area is $2\pi (6)^2 + \pi (6)^2 = 108\pi$ square cm.\nFinal Answer: The final answer is $108\pi$ square cm. I hope it is correct.\n'),
+             dict(role='HUMAN', prompt='Problem:\nMonica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.\nSolution:'),
+             dict(role='BOT', prompt='The prime numbers rolled could be 2, 3, or 5, and each has a 1/6 chance of being rolled. A composite roll of 4 or 6 has a 2/6 chance, but it results in a $0 win. The remaining roll, 1, is neither prime nor composite and results in a loss of $3, with a 1/6 chance. So, the expected winnings are $(2+3+5)(1/6)+0(2/6)+(-3)(1/6) = \$1.17$.\nFinal Answer: The final answer is $\$1.17$. I hope it is correct.\n'),
+             dict(role='HUMAN', prompt='Problem:\nGiven $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$\nSolution:'),
+             dict(role='BOT', prompt='We find $3 \mathbf{b}$ first, which is $\\begin{pmatrix} 12 \\ 6 \\ -3 \end{pmatrix}$. Then we subtract this vector from $\mathbf{a}$. So, $\mathbf{a} - 3 \mathbf{b} = \\begin{pmatrix} -7 - 12 \\ 0 - 6 \\ 1 - (-3) \end{pmatrix} = \\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}.$\nFinal Answer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}$. I hope it is correct.\n'),
+             dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'),
+         ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512))
+
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))
+
+ math_datasets = [
+     dict(
+         type=MATHDataset,
+         abbr='math',
+         path='opencompass/math',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg)
+ ]
opencompass/configs/datasets/math/math_gen_559593.py ADDED
@@ -0,0 +1,53 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess
+
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template='''Problem:
+ Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}
+ Solution:
+ The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.
+ Final Answer: The final answer is $[2,5)$. I hope it is correct.
+
+ Problem:
+ If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$
+ Solution:
+ We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$
+ Final Answer: The final answer is $24$. I hope it is correct.
+
+ Problem:
+ Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
+ Solution:
+ If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}
+ Final Answer: The final answer is $16$. I hope it is correct.
+
+ Problem:
+ If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.
+ Solution:
+ If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$
+ Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.
+
+ Problem:
+ {problem}
+ Solution:
+ {solution}'''),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512))
+
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))
+
+ math_datasets = [
+     dict(
+         type=MATHDataset,
+         abbr='math',
+         path='opencompass/math',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg)
+ ]
opencompass/configs/datasets/math/math_gen_5e8458.py ADDED
@@ -0,0 +1,53 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess
+
+ math_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template='''Problem:
+ Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}
+ Solution:
+ The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.
+ Final Answer: The final answer is $[2,5)$. I hope it is correct.
+
+ Problem:
+ If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$
+ Solution:
+ We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$
+ Final Answer: The final answer is $24$. I hope it is correct.
+
+ Problem:
+ Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
+ Solution:
+ If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}
+ Final Answer: The final answer is $16$. I hope it is correct.
+
+ Problem:
+ If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.
+ Solution:
+ If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$
+ Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.
+
+ Problem:
+ {problem}Solution:
+ {solution}'''),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512))
+
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))
+
+ math_datasets = [
+     dict(
+         type=MATHDataset,
+         abbr='math',
+         path='opencompass/math',
+         reader_cfg=dict(
+             input_columns=['problem'],
+             output_column='solution',
+         ),
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg)
+ ]
opencompass/configs/datasets/math/math_gen_78ced2.py ADDED
@@ -0,0 +1,37 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess
+
+ QUERY_TEMPLATE = """
+ Solve the following math problem step by step. The last line of your response should be of the form ANSWER: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
+
+ {problem}
+
+ Remember to put your answer on its own line after "ANSWER:", and you do not need to use a \\boxed command.
+ """.strip()
+
+ math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
+
+ math_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+
+         template=dict(round=[
+             dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+         ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512))
+
+ math_eval_cfg = dict(
+     evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))
+
+ math_datasets = [
+     dict(
+         type=MATHDataset,
+         abbr='math',
+         path='opencompass/math',
+         reader_cfg=math_reader_cfg,
+         infer_cfg=math_infer_cfg,
+         eval_cfg=math_eval_cfg)
+ ]
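QUERY_TEMPLATE instructs the model to put its result on a final 'ANSWER:' line instead of in a \boxed command. A hedged sketch of pulling that answer back out of a completion (illustrative only; opencompass's math_postprocess works differently):

import re

def extract_answer(text: str) -> str:
    # Take the last 'ANSWER: ...' line; fall back to the whole text if absent.
    matches = re.findall(r'ANSWER:\s*(.+)', text)
    return matches[-1].strip() if matches else text.strip()

extract_answer('Step 1: ...\nANSWER: 42')  # -> '42'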