Add files using upload-large-folder tool
- opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py +4 -0
- opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen_326684.py +59 -0
- opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl.py +4 -0
- opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl_d2333a.py +51 -0
- opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen.py +4 -0
- opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen_3523e0.py +53 -0
- opencompass/configs/datasets/MMLUArabic/README.md +26 -0
- opencompass/configs/datasets/QuALITY/QuALITY_gen.py +4 -0
- opencompass/configs/datasets/QuALITY/QuALITY_gen_c407cb.py +38 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py +55 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py +47 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_ppl.py +47 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py +4 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py +41 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py +4 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py +43 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py +43 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py +45 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py +45 -0
- opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py +34 -0
- opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py +4 -0
- opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py +44 -0
- opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl.py +4 -0
- opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_0143fe.py +62 -0
- opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_11c175.py +33 -0
- opencompass/configs/datasets/apps/README.md +43 -0
- opencompass/configs/datasets/apps/apps_gen.py +4 -0
- opencompass/configs/datasets/apps/apps_gen_c7893a.py +28 -0
- opencompass/configs/datasets/apps/apps_mini_gen.py +4 -0
- opencompass/configs/datasets/apps/apps_mini_gen_c7893a.py +28 -0
- opencompass/configs/datasets/apps/deprecated_apps_gen_5b4254.py +33 -0
- opencompass/configs/datasets/apps/deprecated_apps_gen_7fbb95.py +40 -0
- opencompass/configs/datasets/apps/deprecated_apps_gen_b4dee3.py +30 -0
- opencompass/configs/datasets/gsm8k_contamination/gsm8k_contamination_ppl_ecdd22.py +57 -0
- opencompass/configs/datasets/lcsts/lcsts_gen.py +4 -0
- opencompass/configs/datasets/lcsts/lcsts_gen_8ee1fe.py +32 -0
- opencompass/configs/datasets/lcsts/lcsts_gen_9b0b89.py +28 -0
- opencompass/configs/datasets/leval/leval.py +23 -0
- opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py +90 -0
- opencompass/configs/datasets/math/math_0shot_gen_393424.py +35 -0
- opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py +30 -0
- opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py +30 -0
- opencompass/configs/datasets/math/math_4shot_example_from_google_research.py +40 -0
- opencompass/configs/datasets/math/math_agent_gen_861b4f.py +90 -0
- opencompass/configs/datasets/math/math_agent_gen_af2293.py +103 -0
- opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py +38 -0
- opencompass/configs/datasets/math/math_gen_1ed9c2.py +36 -0
- opencompass/configs/datasets/math/math_gen_559593.py +53 -0
- opencompass/configs/datasets/math/math_gen_5e8458.py +53 -0
- opencompass/configs/datasets/math/math_gen_78ced2.py +37 -0
opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .MMLUArabic_gen_326684 import MMLUArabic_datasets  # noqa: F401, F403
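
These four-line stubs only pin a default variant: everything imported under `with read_base()` is merged into the caller's config, so the hash-suffixed file stays the single source of truth for the prompt definition. A minimal sketch of loading one such stub with mmengine (not part of the diff; assumes the repository root is the working directory):

```python
# Sketch: resolve a stub config with mmengine. Config.fromfile follows the
# read_base() import, so the merged config exposes MMLUArabic_datasets directly.
from mmengine.config import Config

cfg = Config.fromfile(
    'opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py')
print(len(cfg['MMLUArabic_datasets']))  # 57 subject entries, one per subset
```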
opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen_326684.py
ADDED
@@ -0,0 +1,59 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import MMLUArabicDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+# None of the MMLUArabic dataset in huggingface is correctly parsed, so we use our own dataset reader
+# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
+
+MMLUArabic_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
+
+MMLUArabic_datasets = []
+for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
+    _system = f"فيما يلي أسئلة الاختيار من متعدد (مع الإجابات) حول {' '.join(_name_ar.split('_'))}"
+    _hint = '\n{input}'
+    MMLUArabic_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(role='SYSTEM', fallback_role='HUMAN', prompt=_system),
+                    '</E>',
+                ],
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=_hint.format(input='سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}')
+                    ),
+                    dict(role='BOT', prompt='إجابة: {target}')
+                ]),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    MMLUArabic_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+    MMLUArabic_datasets.append(
+        dict(
+            abbr=f'acegpt_MMLUArabic_{_name}',
+            type=MMLUArabicDataset,
+            path='./data/MMLUArabic/',
+            name=_name,
+            reader_cfg=MMLUArabic_reader_cfg,
+            infer_cfg=MMLUArabic_infer_cfg,
+            eval_cfg=MMLUArabic_eval_cfg,
+        ))
+
+del _name, _hint
opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .MMLUArabic_ppl_d2333a import MMLUArabic_datasets  # noqa: F401, F403
opencompass/configs/datasets/MMLUArabic/MMLUArabic_ppl_d2333a.py
ADDED
@@ -0,0 +1,51 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import MMLUArabicDataset
+
+# None of the MMLUArabic dataset in huggingface is correctly parsed, so we use our own dataset reader
+# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
+
+MMLUArabic_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
+
+MMLUArabic_datasets = []
+for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
+    # _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
+    _hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n"
+    # question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
+    question_overall = 'سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
+    MMLUArabic_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template={opt: f'{question_overall}\nإجابة: {opt}\n' for opt in ['A', 'B', 'C', 'D']},
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template={opt: f'{_hint}</E>{question_overall}\nإجابة: {opt}' for opt in ['A', 'B', 'C', 'D']},
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=PPLInferencer),
+    )
+
+    MMLUArabic_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
+
+    MMLUArabic_datasets.append(
+        dict(
+            abbr=f'acegpt_MMLUArabic_{_name}',
+            type=MMLUArabicDataset,
+            path='./data/MMLUArabic/',
+            name=_name,
+            reader_cfg=MMLUArabic_reader_cfg,
+            infer_cfg=MMLUArabic_infer_cfg,
+            eval_cfg=MMLUArabic_eval_cfg,
+        ))
+
+del _name, _hint
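
This variant uses `PPLInferencer`: rather than generating text, it renders the prompt once per answer option and scores each rendering by language-model loss. A self-contained sketch of that idea (illustrative only, not OpenCompass internals; `gpt2` is a placeholder model):

```python
# Perplexity-based multiple choice: the option whose full rendered prompt has
# the lowest mean cross-entropy under the LM is taken as the prediction.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

def option_loss(text: str) -> float:
    ids = tok(text, return_tensors='pt').input_ids
    with torch.no_grad():
        out = model(ids, labels=ids)  # mean token-level cross-entropy
    return out.loss.item()

question = 'سؤال: 2 + 2 = ?\nA. 3\nB. 4\nC. 5\nD. 6'
losses = {opt: option_loss(f'{question}\nإجابة: {opt}') for opt in 'ABCD'}
print(min(losses, key=losses.get))  # lowest-loss option wins
```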
opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .MMLUArabic_zero_shot_gen_3523e0 import MMLUArabic_datasets  # noqa: F401, F403
opencompass/configs/datasets/MMLUArabic/MMLUArabic_zero_shot_gen_3523e0.py
ADDED
@@ -0,0 +1,53 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import MMLUArabicDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+# None of the MMLUArabic dataset in huggingface is correctly parsed, so we use our own dataset reader
+# Please download the dataset from https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic
+
+MMLUArabic_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+MMLUArabic_all_sets = ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']
+MMLUArabic_all_sets_ar = ['جبر_تجريدي', 'تشريح', 'علم_الفلك', 'أخلاقيات_الأعمال', 'المعرفة_السريرية', 'علم_الأحياء_الجامعي', 'كيمياء_جامعية', 'علوم_الحاسوب_الجامعية', 'رياضيات_جامعية', 'طب_جامعي', 'فيزياء_جامعية', 'أمان_الحاسوب', 'فيزياء_مفاهيمية', 'الاقتصاد_القياسي', 'هندسة_كهربائية', 'رياضيات_ابتدائية', 'منطق_رسمي', 'حقائق_عالمية', 'علم_الأحياء_الثانوي', 'كيمياء_ثانوية', 'علوم_الحاسوب_الثانوية', 'تاريخ_أوروبا_الثانوي', 'جغرافية_ثانوية', 'الحكومة_والسياسة_الثانوية', 'اقتصاد_كلي_ثانوي', 'رياضيات_ثانوية', 'اقتصاد_جزئي_ثانوي', 'فيزياء_ثانوية', 'علم_النفس_الثانوي', 'إحصاء_ثانوي', 'تاريخ_الولايات_المتحدة_الثانوي', 'تاريخ_العالم_الثانوي', 'شيخوخة_الإنسان', 'جنسانية_بشرية', 'قانون_دولي', 'فقه', 'أخطاء_منطقية', 'تعلم_الآلة', 'إدارة', 'تسويق', 'جينات_طبية', 'متفرقات', 'نزاعات_أخلاقية', 'سيناريوهات_أخلاقية', 'تغذية', 'فلسفة', 'ما_قبل_التاريخ', 'محاسبة_مهنية', 'قانون_مهني', 'طب_مهني', 'علم_النفس_المهني', 'علاقات_عامة', 'دراسات_الأمان', 'علم_الاجتماع', 'سياسة_خارجية_أمريكية', 'علم_الفيروسات', 'أديان_العالم']
+
+MMLUArabic_datasets = []
+for _name, _name_ar in zip(MMLUArabic_all_sets, MMLUArabic_all_sets_ar):
+    _hint = f"فيما يلي أسئلة الاختيار من متعدد حول {' '.join(_name_ar.split('_'))}\n\n" + '{input}\n' + "من فضلك اختر إجابة واحدة من بين 'A، B، C، D' دون شرح."
+    MMLUArabic_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=_hint.format(input='سؤال: {input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}')
+                    ),
+                ]),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    MMLUArabic_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+    MMLUArabic_datasets.append(
+        dict(
+            abbr=f'acegpt_MMLUArabic_{_name}',
+            type=MMLUArabicDataset,
+            path='./data/MMLUArabic/',
+            name=_name,
+            reader_cfg=MMLUArabic_reader_cfg,
+            infer_cfg=MMLUArabic_infer_cfg,
+            eval_cfg=MMLUArabic_eval_cfg,
+        ))
+
+del _name, _hint
opencompass/configs/datasets/MMLUArabic/README.md
ADDED
@@ -0,0 +1,26 @@
+# MMLUArabic
+## Dataset Description
+MMLUArabic is a benchmark for assessing knowledge in Arabic. It covers a wide range of topics and consists of multiple-choice questions in various branches of knowledge.
+
+
+## How to Use
+Download the dataset from [this link](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic)
+
+```python
+val_ds = load_dataset("MMLUArabic", header=None)['validation']
+test_ds = load_dataset("MMLUArabic", header=None)['test']
+# input, option_a, option_b, option_c, option_d, target
+print(next(iter(val_ds)))
+```
+
+## Citation
+```
+@misc{huang2023acegpt,
+    title={AceGPT, Localizing Large Language Models in Arabic},
+    author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
+    year={2023},
+    eprint={2309.12053},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
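
The README snippet above presupposes `from datasets import load_dataset` and a local copy of the data. Since the configs in this folder read the AceGPT CSVs directly, here is a hedged alternative sketch with pandas; the `test/<subject>_test.csv` layout and headerless column order are assumptions taken from the README's column comment:

```python
# Assumed layout: MMLUArabic/test/<subject>_test.csv, headerless, with columns
# question text, four options, and the gold letter.
import pandas as pd

cols = ['input', 'A', 'B', 'C', 'D', 'target']
df = pd.read_csv('MMLUArabic/test/abstract_algebra_test.csv',
                 header=None, names=cols)
print(df.iloc[0]['input'], '->', df.iloc[0]['target'])
```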
opencompass/configs/datasets/QuALITY/QuALITY_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .QuALITY_gen_c407cb import QuALITY_datasets  # noqa: F401, F403
opencompass/configs/datasets/QuALITY/QuALITY_gen_c407cb.py
ADDED
@@ -0,0 +1,38 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import QuALITYDataset, QuALITYEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+QuALITY_reader_cfg = dict(
+    input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
+    output_column='gold_label',
+)
+
+QuALITY_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                'Read the article, and answer the question.\n\nArticle:\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
+            ),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+QuALITY_eval_cfg = dict(
+    evaluator=dict(type=QuALITYEvaluator),
+    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
+    pred_role='BOT')
+
+QuALITY_datasets = [
+    dict(
+        abbr='QuALITY',
+        type=QuALITYDataset,
+        path='./data/QuALITY/QuALITY.v1.0.1.htmlstripped.dev',
+        reader_cfg=QuALITY_reader_cfg,
+        infer_cfg=QuALITY_infer_cfg,
+        eval_cfg=QuALITY_eval_cfg),
+]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_cot_gen_1d56df.py
ADDED
@@ -0,0 +1,55 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BoolQDatasetV2
+from opencompass.utils.text_postprocessors import (
+    first_option_postprocess,
+)
+
+QUERY_TEMPLATE = """
+Answer the following question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of AB. Think step by step before answering.
+
+Passage: {passage}
+
+Question: {question}
+
+A. Yes
+B. No
+
+""".strip()
+
+BoolQ_reader_cfg = dict(
+    input_columns=['question', 'passage'],
+    output_column='label',
+)
+
+BoolQ_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+BoolQ_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
+)
+
+BoolQ_datasets = [
+    dict(
+        abbr='BoolQ',
+        type=BoolQDatasetV2,
+        path='opencompass/boolq',
+        reader_cfg=BoolQ_reader_cfg,
+        infer_cfg=BoolQ_infer_cfg,
+        eval_cfg=BoolQ_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_gen_ba58ea.py
ADDED
@@ -0,0 +1,47 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BoolQDatasetV2
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+BoolQ_reader_cfg = dict(
+    input_columns=['question', 'passage'],
+    output_column='label',
+)
+
+BoolQ_infer_cfg = dict(
+    ice_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin='</E>',
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:',
+                ),
+                dict(role='BOT', prompt='{label}'),
+            ],
+        ),
+        ice_token='</E>',
+    ),
+    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
+    inferencer=dict(type=GenInferencer, max_out_len=50),
+)
+
+BoolQ_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=first_capital_postprocess),
+)
+
+BoolQ_datasets = [
+    dict(
+        abbr='BoolQ',
+        type=BoolQDatasetV2,
+        path='opencompass/boolq',
+        reader_cfg=BoolQ_reader_cfg,
+        infer_cfg=BoolQ_infer_cfg,
+        eval_cfg=BoolQ_eval_cfg,
+    )
+]
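
The few-shot configs rely on `FixKRetriever` plus the `ice_token`: the fixed train examples (ids 0, 2, 4, 6, 8, i.e. 5-shot) are rendered with the ice template and substituted for `</E>` in the final prompt. A toy illustration of that substitution in plain Python (not OpenCompass internals; the two train rows are made up):

```python
# Render fixed in-context examples, then splice them in at the ice token.
train = [
    {'passage': 'Cats are mammals.', 'question': 'are cats mammals', 'label': 'A'},
    {'passage': 'Fish cannot fly.', 'question': 'can fish fly', 'label': 'B'},
]
ice = ''.join(
    '{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer: {label}\n'.format(**ex)
    for ex in train)
test = {'passage': 'The sky is blue.', 'question': 'is the sky blue'}
prompt = '</E>{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:'.format(**test)
print(prompt.replace('</E>', ice))
```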
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_few_shot_ppl.py
ADDED
@@ -0,0 +1,47 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BoolQDatasetV2
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+
+BoolQ_reader_cfg = dict(
+    input_columns=['question', 'passage'],
+    output_column='label',
+)
+
+BoolQ_infer_cfg = dict(
+    ice_template=dict(
+        type=PromptTemplate,
+        template={
+            'B': dict(
+                round=[
+                    dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                    dict(role='BOT', prompt='No'),
+                ]
+            ),
+            'A': dict(
+                round=[
+                    dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                    dict(role='BOT', prompt='Yes'),
+                ]
+            ),
+        },
+        ice_token='</E>',
+    ),
+    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
+    inferencer=dict(type=PPLInferencer, max_out_len=50),
+)
+
+BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+BoolQ_datasets = [
+    dict(
+        abbr='BoolQ',
+        type=BoolQDatasetV2,
+        path='opencompass/boolq',
+        reader_cfg=BoolQ_reader_cfg,
+        infer_cfg=BoolQ_infer_cfg,
+        eval_cfg=BoolQ_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py
ADDED
@@ -0,0 +1,41 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BoolQDatasetV2
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+BoolQ_reader_cfg = dict(
+    input_columns=['question', 'passage'],
+    output_column='label',
+)
+
+BoolQ_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:'),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+BoolQ_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=first_capital_postprocess),
+)
+
+BoolQ_datasets = [
+    dict(
+        abbr='BoolQ',
+        type=BoolQDatasetV2,
+        path='opencompass/boolq',
+        reader_cfg=BoolQ_reader_cfg,
+        infer_cfg=BoolQ_infer_cfg,
+        eval_cfg=BoolQ_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .SuperGLUE_BoolQ_ppl_314b96 import BoolQ_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_16b1d9.py
ADDED
@@ -0,0 +1,43 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BoolQDatasetV2
+
+BoolQ_reader_cfg = dict(
+    input_columns=['question', 'passage'],
+    output_column='label',
+)
+
+BoolQ_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            'A':
+            dict(round=[
+                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                dict(role='BOT', prompt='Yes'),
+            ]),
+            'B':
+            dict(round=[
+                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                dict(role='BOT', prompt='No'),
+            ]),
+        },
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer),
+)
+
+BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+BoolQ_datasets = [
+    dict(
+        abbr='BoolQ',
+        type=BoolQDatasetV2,
+        path='opencompass/boolq',
+        reader_cfg=BoolQ_reader_cfg,
+        infer_cfg=BoolQ_infer_cfg,
+        eval_cfg=BoolQ_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314797.py
ADDED
@@ -0,0 +1,43 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BoolQDatasetV3
+
+BoolQ_reader_cfg = dict(
+    input_columns=['question', 'passage'],
+    output_column='label',
+    test_split='train')
+
+BoolQ_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            'false':
+            dict(round=[
+                dict(role='HUMAN', prompt='Passage: {passage}\nQuestion: {question}?'),
+                dict(role='BOT', prompt='Answer: No'),
+            ]),
+            'true':
+            dict(round=[
+                dict(role='HUMAN', prompt='Passage: {passage}\nQuestion: {question}?'),
+                dict(role='BOT', prompt='Answer: Yes'),
+            ]),
+        },
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer),
+)
+
+BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+BoolQ_datasets = [
+    dict(
+        abbr='BoolQ',
+        type=BoolQDatasetV3,
+        path='opencompass/boolq',
+        reader_cfg=BoolQ_reader_cfg,
+        infer_cfg=BoolQ_infer_cfg,
+        eval_cfg=BoolQ_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_314b96.py
ADDED
@@ -0,0 +1,45 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BoolQDataset
+
+BoolQ_reader_cfg = dict(
+    input_columns=['question', 'passage'],
+    output_column='answer',
+    test_split='train')
+
+BoolQ_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0:
+            dict(round=[
+                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                dict(role='BOT', prompt='No'),
+            ]),
+            1:
+            dict(round=[
+                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}?'),
+                dict(role='BOT', prompt='Yes'),
+            ]),
+        },
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer),
+)
+
+BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+BoolQ_datasets = [
+    dict(
+        type=BoolQDataset,
+        abbr='BoolQ',
+        path='json',
+        data_files='opencompass/boolq',
+        split='train',
+        reader_cfg=BoolQ_reader_cfg,
+        infer_cfg=BoolQ_infer_cfg,
+        eval_cfg=BoolQ_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_4da4db.py
ADDED
@@ -0,0 +1,45 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BoolQDataset
+
+BoolQ_reader_cfg = dict(
+    input_columns=['question', 'passage'],
+    output_column='answer',
+    test_split='train')
+
+BoolQ_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0:
+            dict(round=[
+                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}'),
+                dict(role='BOT', prompt='No.'),
+            ]),
+            1:
+            dict(round=[
+                dict(role='HUMAN', prompt='{passage}\nQuestion: {question}'),
+                dict(role='BOT', prompt='Yes.'),
+            ]),
+        },
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer),
+)
+
+BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+BoolQ_datasets = [
+    dict(
+        type=BoolQDataset,
+        abbr='BoolQ',
+        path='json',
+        data_files='opencompass/boolq',
+        split='train',
+        reader_cfg=BoolQ_reader_cfg,
+        infer_cfg=BoolQ_infer_cfg,
+        eval_cfg=BoolQ_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl_9619db.py
ADDED
@@ -0,0 +1,34 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BoolQDataset
+
+BoolQ_reader_cfg = dict(
+    input_columns=['question', 'passage'],
+    output_column='answer',
+    test_split='train')
+
+BoolQ_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0: 'Passage:{passage}。\nQuestion:{question}。\nAnswer: No.',
+            1: 'Passage:{passage}。\nQuestion:{question}。\nAnswer: Yes.',
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+BoolQ_datasets = [
+    dict(
+        type=BoolQDataset,
+        abbr='BoolQ',
+        path='json',
+        data_files='opencompass/boolq',
+        split='train',
+        reader_cfg=BoolQ_reader_cfg,
+        infer_cfg=BoolQ_infer_cfg,
+        eval_cfg=BoolQ_eval_cfg)
+]
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .SuperGLUE_CB_gen_854c6c import CB_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py
ADDED
@@ -0,0 +1,44 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import CBDatasetV2
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+CB_reader_cfg = dict(
+    input_columns=['premise', 'hypothesis'],
+    output_column='label',
+)
+
+CB_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?\nA. Contradiction\nB. Entailment\nC. Neutral\nAnswer:'
+                ),
+            ], ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+CB_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=first_option_postprocess, options='ABC'),
+)
+
+CB_datasets = [
+    dict(
+        abbr='CB',
+        type=CBDatasetV2,
+        path='./data/SuperGLUE/CB/val.jsonl',
+        reader_cfg=CB_reader_cfg,
+        infer_cfg=CB_infer_cfg,
+        eval_cfg=CB_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .SuperGLUE_CB_ppl_0143fe import CB_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_0143fe.py
ADDED
@@ -0,0 +1,62 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+CB_reader_cfg = dict(
+    input_columns=['premise', 'hypothesis'],
+    output_column='label',
+)
+
+CB_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            'contradiction':
+            dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?'
+                ),
+                dict(role='BOT', prompt='Contradiction'),
+            ]),
+            'entailment':
+            dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?'
+                ),
+                dict(role='BOT', prompt='Entailment'),
+            ]),
+            'neutral':
+            dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    '{premise}\n{hypothesis}\nWhat is the relation between the two sentences?'
+                ),
+                dict(role='BOT', prompt='Neutral'),
+            ]),
+        },
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer),
+)
+
+CB_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
+
+CB_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='CB',
+        path='json',
+        split='train',
+        data_files='./data/SuperGLUE/CB/val.jsonl',
+        reader_cfg=CB_reader_cfg,
+        infer_cfg=CB_infer_cfg,
+        eval_cfg=CB_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_ppl_11c175.py
ADDED
@@ -0,0 +1,33 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+CB_reader_cfg = dict(
+    input_columns=['premise', 'hypothesis'], output_column='label')
+
+CB_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            'contradiction': '{premise}?contradiction, {hypothesis}',
+            'entailment': '{premise}?entailment, {hypothesis}',
+            'neutral': '{premise}?neutral, {hypothesis}'
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+CB_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
+
+CB_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='CB',
+        path='json',
+        split='train',
+        data_files='./data/SuperGLUE/CB/val.jsonl',
+        reader_cfg=CB_reader_cfg,
+        infer_cfg=CB_infer_cfg,
+        eval_cfg=CB_eval_cfg)
+]
opencompass/configs/datasets/apps/README.md
ADDED
@@ -0,0 +1,43 @@
+# APPS
+## Dataset Description
+APPS is a code-generation benchmark of 10,000 problems. It can be used to evaluate the ability of language models to generate code from natural-language specifications.
+
+## Dataset Structure
+```python
+DatasetDict({
+    train: Dataset({
+        features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
+        num_rows: 5000
+    })
+    test: Dataset({
+        features: ['problem_id', 'question', 'solutions', 'input_output', 'difficulty', 'url', 'starter_code'],
+        num_rows: 5000
+    })
+})
+```
+We also offer an apps_mini subset of 1,500 questions drawn evenly from the introductory, interview, and competition categories (a 1:1:1 ratio, 500 questions each).
+
+## How to Use
+You can also filter the dataset by difficulty level: introductory, interview, and competition. Just pass a list of difficulty levels to the filter. For example, if you want the most challenging questions, select the competition level:
+```python
+ds = load_dataset("codeparrot/apps", split="train", difficulties=["competition"])
+print(next(iter(ds))["question"])
+```
+## Evaluation results
+
+
+| dataset   | metric | CodeLlama-7b-Python | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-sft-hf |
+|-----------|--------|---------------------|----------------------------|--------------------------|---------------------------|
+| apps_mini | pass@1 | 1.3                 | 0.7                        | 7.1                      | 9.3                       |
+
+Please refer to Table 3 of [Code Llama](https://scontent-nrt1-2.xx.fbcdn.net/v/t39.2365-6/369856151_1754812304950972_1159666448927483931_n.pdf?_nc_cat=107&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=TxT1PKkNBZoAX8zMHbm&_nc_ht=scontent-nrt1-2.xx&oh=00_AfDmmQAPzqX1-QOKIDUV5lGKzaZqt0CZUVtxFjHtnh6ycQ&oe=65F5AF8F) for the original results if needed.
+
+## Citation
+```
+@article{hendrycksapps2021,
+  title={Measuring Coding Challenge Competence With APPS},
+  author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
+  journal={NeurIPS},
+  year={2021}
+}
+```
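
The pass@1 numbers in the table above follow the standard unbiased pass@k estimator from the HumanEval paper: with n samples per problem and c of them correct, pass@k = 1 - C(n-c, k)/C(n, k). A short numpy sketch for reference:

```python
# Unbiased pass@k estimator (Chen et al., 2021), written in the usual
# numerically stable product form to avoid huge binomial coefficients.
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """n samples per problem, c correct; P(at least one of k drawn passes)."""
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

print(pass_at_k(n=20, c=3, k=1))  # 0.15, i.e. the per-sample success rate
```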
opencompass/configs/datasets/apps/apps_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .apps_gen_c7893a import APPS_datasets  # noqa: F401, F403
opencompass/configs/datasets/apps/apps_gen_c7893a.py
ADDED
@@ -0,0 +1,28 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import APPSDataset, APPSEvaluator
+
+APPS_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test')
+
+APPS_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+APPS_eval_cfg = dict(evaluator=dict(type=APPSEvaluator), pred_role='BOT')
+
+APPS_datasets = [
+    dict(
+        type=APPSDataset,
+        abbr='apps',
+        path='codeparrot/apps',
+        num_repeats=1,
+        reader_cfg=APPS_reader_cfg,
+        infer_cfg=APPS_infer_cfg,
+        eval_cfg=APPS_eval_cfg,
+    )
+]
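
The template above asks the model to answer inside a fenced code block, which the evaluator must strip before running the hidden tests. A hypothetical extraction helper for that format (OpenCompass's `APPSEvaluator` does its own parsing; this is only a sketch):

```python
# Pull the body of the first fenced python block out of a model answer.
import re

FENCE = '`' * 3  # literal triple backtick, spelled out to keep this block tidy

def extract_code(answer: str) -> str:
    m = re.search(FENCE + r'python\n(.*?)' + FENCE, answer, re.S)
    return m.group(1) if m else answer  # fall back to the raw answer

sample = 'ANSWER:\n' + FENCE + 'python\nprint(42)\n' + FENCE
print(extract_code(sample))  # -> print(42)
```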
opencompass/configs/datasets/apps/apps_mini_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .apps_mini_gen_c7893a import APPS_mini_datasets  # noqa: F401, F403
opencompass/configs/datasets/apps/apps_mini_gen_c7893a.py
ADDED
@@ -0,0 +1,28 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import APPS_miniDataset, APPSEvaluator
+
+APPS_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test')
+
+APPS_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512),
+)
+
+APPS_eval_cfg = dict(evaluator=dict(type=APPSEvaluator), pred_role='BOT')
+
+APPS_mini_datasets = [
+    dict(
+        type=APPS_miniDataset,
+        abbr='apps_mini',
+        path='./data/apps_mini',
+        num_repeats=1,
+        reader_cfg=APPS_reader_cfg,
+        infer_cfg=APPS_infer_cfg,
+        eval_cfg=APPS_eval_cfg,
+    )
+]
opencompass/configs/datasets/apps/deprecated_apps_gen_5b4254.py
ADDED
@@ -0,0 +1,33 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess
+
+apps_reader_cfg = dict(
+    input_columns=['question'], output_column='problem_id', train_split='test')
+
+# TODO: allow empty output-column
+apps_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(role='HUMAN', prompt='Write a python program:\n{question}'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+apps_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess),
+)
+
+apps_datasets = [
+    dict(
+        type=HFDataset,
+        path='codeparrot/apps',
+        reader_cfg=apps_reader_cfg,
+        infer_cfg=apps_infer_cfg,
+        eval_cfg=apps_eval_cfg)
+]
opencompass/configs/datasets/apps/deprecated_apps_gen_7fbb95.py
ADDED
@@ -0,0 +1,40 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess
+
+apps_reader_cfg = dict(
+    input_columns=['question'], output_column='problem_id', train_split='test')
+
+# TODO: allow empty output-column
+apps_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin=[
+                dict(
+                    role='SYSTEM',
+                    fallback_role='HUMAN',
+                    prompt='Write a python program:'),
+            ],
+            round=[
+                dict(role='HUMAN', prompt='{question}'),
+            ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+apps_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    pred_role='BOT',
+    k=[1, 10, 100],  # the parameter only for humaneval
+    pred_postprocessor=dict(type=humaneval_postprocess),
+)
+
+apps_datasets = [
+    dict(
+        type=HFDataset,
+        path='codeparrot/apps',
+        reader_cfg=apps_reader_cfg,
+        infer_cfg=apps_infer_cfg,
+        eval_cfg=apps_eval_cfg)
+]
opencompass/configs/datasets/apps/deprecated_apps_gen_b4dee3.py
ADDED
@@ -0,0 +1,30 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import HFDataset, HumanEvalEvaluator, humaneval_postprocess
+
+apps_reader_cfg = dict(
+    input_columns=['question'], output_column='problem_id', train_split='test')
+
+# TODO: allow empty output-column
+apps_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='{question}'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512))
+
+apps_eval_cfg = dict(
+    evaluator=dict(type=HumanEvalEvaluator),
+    k=[1, 10, 100],
+    pred_postprocessor=dict(type=humaneval_postprocess),
+)
+
+apps_datasets = [
+    dict(
+        type=HFDataset,
+        path='codeparrot/apps',
+        reader_cfg=apps_reader_cfg,
+        infer_cfg=apps_infer_cfg,
+        eval_cfg=apps_eval_cfg)
+]
opencompass/configs/datasets/gsm8k_contamination/gsm8k_contamination_ppl_ecdd22.py
ADDED
@@ -0,0 +1,57 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLOnlyInferencer
from opencompass.openicl.icl_evaluator import AveragePPLEvaluator
from opencompass.datasets import GSM8KDataset, GSM8KReferenceSkywork

gsm8k_datasets = []

gsm8k_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template='{question} {answer}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLOnlyInferencer),
)

gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))

for split in ['train', 'test']:
    gsm8k_reader_cfg = dict(
        input_columns=['question', 'answer'],
        output_column=None,
        train_split=split,
        test_split=split,
    )
    gsm8k_datasets.append(
        dict(
            abbr=f'gsm8k-{split}-ppl',
            type=GSM8KDataset,
            path='./data/gsm8k',
            reader_cfg=gsm8k_reader_cfg,
            infer_cfg=gsm8k_infer_cfg,
            eval_cfg=gsm8k_eval_cfg)
    )


gsm8k_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template='{text}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLOnlyInferencer),
)

gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))

gsm8k_reader_cfg = dict(
    input_columns=['text'],
    output_column=None,
)

gsm8k_datasets.append(
    dict(
        abbr='gsm8k-ref-ppl',
        type=GSM8KReferenceSkywork,
        path='./data/gsm8k-extra/mock_gsm8k_test.jsonl',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg
    )
)
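This config is a contamination probe rather than an accuracy benchmark: it scores average perplexity on the GSM8K train and test splits and on a freshly written mock test set (`GSM8KReferenceSkywork`). If a model is markedly more confident on the real splits than on the stylistically matched mock set, the benchmark data has likely leaked into pretraining. A minimal sketch of the underlying measurement, assuming a Hugging Face causal LM (illustrative, not `PPLOnlyInferencer` itself):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def avg_ppl(texts, model_name='gpt2'):
    # Average per-sample perplexity under a causal LM.
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).eval()
    ppls = []
    with torch.no_grad():
        for text in texts:
            ids = tok(text, return_tensors='pt').input_ids
            loss = model(ids, labels=ids).loss  # mean token cross-entropy
            ppls.append(torch.exp(loss).item())
    return sum(ppls) / len(ppls)

# Noticeably lower PPL on gsm8k-test than on the mock set suggests contamination.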
opencompass/configs/datasets/lcsts/lcsts_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .lcsts_gen_8ee1fe import lcsts_datasets  # noqa: F401, F403
opencompass/configs/datasets/lcsts/lcsts_gen_8ee1fe.py
ADDED
@@ -0,0 +1,32 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator
from opencompass.datasets import LCSTSDataset, lcsts_postprocess

lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst')

lcsts_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            # Chinese prompt: 'Read the following article and give a short summary: {content}\nThe summary is:'
            dict(role='HUMAN', prompt='阅读以下文章,并给出简短的摘要:{content}\n摘要如下:'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

lcsts_eval_cfg = dict(
    evaluator=dict(type=JiebaRougeEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=lcsts_postprocess),
)

lcsts_datasets = [
    dict(
        type=LCSTSDataset,
        abbr='lcsts',
        path='opencompass/LCSTS',
        reader_cfg=lcsts_reader_cfg,
        infer_cfg=lcsts_infer_cfg,
        eval_cfg=lcsts_eval_cfg)
]
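LCSTS is a Chinese short-text summarization benchmark, so word-level ROUGE is only meaningful after segmentation; that is why the config scores with `JiebaRougeEvaluator`. A rough sketch of the scoring idea, assuming the `jieba` and `rouge` packages (not the evaluator's exact implementation):

import jieba
from rouge import Rouge

def jieba_rouge(prediction: str, reference: str) -> dict:
    # Segment into words, then join with spaces so ROUGE sees token boundaries.
    pred = ' '.join(jieba.cut(prediction))
    ref = ' '.join(jieba.cut(reference))
    return Rouge().get_scores(pred, ref, avg=True)

scores = jieba_rouge('今天天气很好', '今天的天气不错')
print(scores['rouge-l'])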
opencompass/configs/datasets/lcsts/lcsts_gen_9b0b89.py
ADDED
@@ -0,0 +1,28 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import JiebaRougeEvaluator
from opencompass.datasets import LCSTSDataset, lcsts_postprocess

lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst')

lcsts_infer_cfg = dict(
    prompt_template=dict(
        # Chinese prompt: 'Read the article: {content}\nBased on the text above, give a single short summary:'
        type=PromptTemplate, template='阅读文章:{content}\n根据上文,给出简短的单个摘要:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

lcsts_eval_cfg = dict(
    evaluator=dict(type=JiebaRougeEvaluator),
    pred_postprocessor=dict(type=lcsts_postprocess),
)

lcsts_datasets = [
    dict(
        type=LCSTSDataset,
        abbr='lcsts',
        path='opencompass/LCSTS',
        reader_cfg=lcsts_reader_cfg,
        infer_cfg=lcsts_infer_cfg,
        eval_cfg=lcsts_eval_cfg)
]
opencompass/configs/datasets/leval/leval.py
ADDED
@@ -0,0 +1,23 @@
from mmengine.config import read_base

with read_base():
    from .levalnaturalquestion.leval_naturalquestion_gen import LEval_nq_datasets
    from .levalnarrativeqa.leval_narrativeqa_gen import LEval_narrativeqa_datasets
    from .levalmultidocqa.leval_multidocqa_gen import LEval_multidocqa_datasets
    from .levalcoursera.leval_coursera_gen import LEval_coursera_datasets
    from .levaltpo.leval_tpo_gen import LEval_tpo_datasets
    from .levalquality.leval_quality_gen import LEval_quality_datasets
    from .levalgsm100.leval_gsm100_gen import LEval_gsm100_datasets
    from .levaltopicretrieval.leval_topic_retrieval_gen import LEval_tr_datasets
    from .levalfinancialqa.leval_financialqa_gen import LEval_financialqa_datasets
    from .levalgovreportsumm.leval_gov_report_summ_gen import LEval_govreport_summ_datasets
    from .levallegalcontractqa.leval_legalcontractqa_gen import LEval_legalqa_datasets
    from .levalmeetingsumm.leval_meetingsumm_gen import LEval_meetingsumm_datasets
    from .levalnewssumm.leval_newssumm_gen import LEval_newssumm_datasets
    from .levalpaperassistant.leval_paper_assistant_gen import LEval_ps_summ_datasets
    from .levalpatentsumm.leval_patent_summ_gen import LEval_patent_summ_datasets
    from .levaltvshowsumm.leval_tvshow_summ_gen import LEval_tvshow_summ_datasets
    from .levalscientificqa.leval_scientificqa_gen import LEval_scientificqa_datasets
    from .levalreviewsumm.leval_review_summ_gen import LEval_review_summ_datasets

leval_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
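The last line of `leval.py` is a common OpenCompass idiom: each `read_base()` import drops a `*_datasets` list into the module namespace, and the generator over `locals()` flattens them all into one list. In isolation the pattern looks like this (a standalone illustration with made-up entries):

coursera_datasets = [dict(abbr='coursera')]
tpo_datasets = [dict(abbr='tpo')]

# The generator is consumed before the assignment below takes effect,
# so the combined list never includes itself.
combined_datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')), [])
print([d['abbr'] for d in combined_datasets])  # ['coursera', 'tpo']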
opencompass/configs/datasets/math/deprecated_math_agent_evaluatorv2_gen_861b4f.py
ADDED
@@ -0,0 +1,90 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
    MATHDataset, MATHAgentEvaluator, math_postprocess_v2
)
# uses the PAL format, but does not perform well
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # ################################### NEW SHOT ###################################
                dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify

def solution():
    x = symbols('x')
    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
    simplified_expr = simplify(expr)

    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
    result = x3_coefficient
    return result"""),
                dict(role='SYSTEM', prompt='Response:26'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
                dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math

def solution():
    radius = 6

    # Surface area of the hemisphere
    hemisphere_area = 2 * math.pi * radius**2

    # Area of the circular base
    base_area = math.pi * radius**2

    # Total surface area
    total_surface_area = hemisphere_area + base_area

    # Formatting the result in LaTeX
    result = r'{}\pi'.format(total_surface_area / math.pi)
    return result"""),
                dict(role='SYSTEM', prompt='Response:108.0\\pi'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
                dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():
    # Probabilities of each outcome
    prime_prob = 1 / 6
    composite_prob = 1 / 3
    otherwise_prob = 1 / 6

    # Expected value of each outcome
    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
    composite_expected_value = 0 * composite_prob
    otherwise_expected_value = -3 * otherwise_prob

    # Total expected value
    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value

    # Dollar value to the nearest cent
    result = "{:.2f}".format(total_expected_value)
    return result"""),
                dict(role='SYSTEM', prompt='Response:1.17'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
                dict(role='HUMAN', prompt='{problem}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer),
)

math_eval_cfg = dict(
    evaluator=dict(
        type=MATHAgentEvaluator,
        version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        abbr='math-agent',
        type=MATHDataset,
        path='./data/math/math.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
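The few-shot turns in the agent configs teach the model a fixed tool protocol: the BOT emits `Tool:PythonInterpreter\nTool Input:<code>`, the framework executes the code and feeds the value back as a SYSTEM `Response:` turn, and the BOT then closes with `FinalAnswer: ...`. A toy sketch of the parse-and-execute half of such a loop (illustrative only; OpenCompass's `AgentInferencer` delegates this to a full agent framework):

import re

def run_tool_turn(bot_message: str) -> str:
    # Pull out the code after 'Tool Input:', run its solution(), echo a Response turn.
    match = re.search(r'Tool Input:(.*)', bot_message, re.DOTALL)
    if match is None:
        return bot_message  # no tool call; pass the message through
    namespace = {}
    exec(match.group(1), namespace)  # defines solution(); toy code, no sandboxing
    return 'Response:{}'.format(namespace['solution']())

print(run_tool_turn('Tool:PythonInterpreter\nTool Input:def solution():\n    return 26'))
# Response:26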
opencompass/configs/datasets/math/math_0shot_gen_393424.py
ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2),
)

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
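The zero-shot prompt pins the final answer inside `\boxed{...}`, which is what makes postprocessing tractable. A simplified sketch of extracting the last boxed expression from a generation, with nested braces handled (an illustration, not the actual `math_postprocess_v2`):

def extract_boxed(text: str) -> str:
    # Return the contents of the last \boxed{...} in `text`, or '' if absent.
    start = text.rfind('\\boxed{')
    if start == -1:
        return ''
    i, depth, out = start + len('\\boxed{'), 1, []
    while i < len(text) and depth:
        ch = text[i]
        depth += (ch == '{') - (ch == '}')
        if depth:
            out.append(ch)
        i += 1
    return ''.join(out)

print(extract_boxed(r'... so the answer is $\boxed{\frac{2}{3}}$.'))  # \frac{2}{3}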
opencompass/configs/datasets/math/math_4shot_base_gen_43d5b6.py
ADDED
@@ -0,0 +1,30 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2

with read_base():
    from .math_4shot_example_from_google_research import prompt

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048, stopping_criteria=['Problem', '问题:']))

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
opencompass/configs/datasets/math/math_4shot_base_gen_db136b.py
ADDED
@@ -0,0 +1,30 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2

with read_base():
    from .math_4shot_example_from_google_research import prompt

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=['Problem']))

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
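The two 4-shot base configs share the same prompt and differ only in decoding limits: `43d5b6` allows up to 2048 new tokens and also stops on the Chinese '问题:' ('Problem:'), while `db136b` caps generation at 1024 tokens and stops on 'Problem' alone. The stop words keep a base model from inventing a fifth few-shot problem once it has finished its solution.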
opencompass/configs/datasets/math/math_4shot_example_from_google_research.py
ADDED
@@ -0,0 +1,40 @@
# Solving Quantitative Reasoning Problems with Language Models

prompt = '''
Problem:
Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.

Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$

Solution:
We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.

Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?

Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:
\\begin{align*}
30n&=480\\
\\Rightarrow\\qquad n&=480/30=\\boxed{16}
\\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations
\\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\\end{align*}
has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.

Solution:
If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$
Final Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.
'''.strip()
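Since `prompt` is an ordinary (non-raw) triple-quoted string, each doubled backslash collapses once at parse time, so the model receives standard LaTeX. A two-line check of that behavior:

fragment = 'Find the domain of $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.'
print(fragment)  # Find the domain of $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.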
opencompass/configs/datasets/math/math_agent_gen_861b4f.py
ADDED
@@ -0,0 +1,90 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
    MATHDataset, MATHAgentEvaluator, math_postprocess
)

# uses the PAL format, but does not perform well
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # ################################### NEW SHOT ###################################
                dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify

def solution():
    x = symbols('x')
    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
    simplified_expr = simplify(expr)

    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
    result = x3_coefficient
    return result"""),
                dict(role='SYSTEM', prompt='Response:26'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
                dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math

def solution():
    radius = 6

    # Surface area of the hemisphere
    hemisphere_area = 2 * math.pi * radius**2

    # Area of the circular base
    base_area = math.pi * radius**2

    # Total surface area
    total_surface_area = hemisphere_area + base_area

    # Formatting the result in LaTeX
    result = r'{}\pi'.format(total_surface_area / math.pi)
    return result"""),
                dict(role='SYSTEM', prompt='Response:108.0\\pi'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
                dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():
    # Probabilities of each outcome
    prime_prob = 1 / 6
    composite_prob = 1 / 3
    otherwise_prob = 1 / 6

    # Expected value of each outcome
    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
    composite_expected_value = 0 * composite_prob
    otherwise_expected_value = -3 * otherwise_prob

    # Total expected value
    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value

    # Dollar value to the nearest cent
    result = "{:.2f}".format(total_expected_value)
    return result"""),
                dict(role='SYSTEM', prompt='Response:1.17'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
                dict(role='HUMAN', prompt='{problem}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer),
)

math_eval_cfg = dict(
    evaluator=dict(type=MATHAgentEvaluator),
    pred_postprocessor=dict(type=math_postprocess),
)

math_datasets = [
    dict(
        abbr='math-agent',
        type=MATHDataset,
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
opencompass/configs/datasets/math/math_agent_gen_af2293.py
ADDED
@@ -0,0 +1,103 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
    MATHDataset, MATHAgentEvaluator, math_postprocess
)

# uses the PAL format, but does not perform well
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # ################################### NEW SHOT ###################################
                dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify

def solution():
    x = symbols('x')
    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
    simplified_expr = simplify(expr)

    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
    result = x3_coefficient
    return result"""),
                dict(role='SYSTEM', prompt='Response:26'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
                dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math

def solution():
    radius = 6

    # Surface area of the hemisphere
    hemisphere_area = 2 * math.pi * radius**2

    # Area of the circular base
    base_area = math.pi * radius**2

    # Total surface area
    total_surface_area = hemisphere_area + base_area

    # Formatting the result in LaTeX
    result = r'{}\pi'.format(total_surface_area / math.pi)
    return result"""),
                dict(role='SYSTEM', prompt='Response:108.0\\pi'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
                dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():
    # Probabilities of each outcome
    prime_prob = 1 / 6
    composite_prob = 1 / 3
    otherwise_prob = 1 / 6

    # Expected value of each outcome
    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
    composite_expected_value = 0 * composite_prob
    otherwise_expected_value = -3 * otherwise_prob

    # Total expected value
    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value

    # Dollar value to the nearest cent
    result = "{:.2f}".format(total_expected_value)
    return result"""),
                dict(role='SYSTEM', prompt='Response:1.17'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
                dict(role='HUMAN', prompt='Given $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import numpy as np

def solution():
    a = np.array([-7, 0, 1])
    b = np.array([4, 2, -1])

    result = a - 3 * b

    result = r'\\begin{{pmatrix}} {} \ {} \ {} \end{{pmatrix}}'.format(result[0], result[1], result[2])
    return result"""),
                dict(role='SYSTEM', prompt='Response:\\begin{pmatrix} -19 \\ -6 \\ 4 \\end{pmatrix}'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \\end{pmatrix}$. I hope it is correct.'),
                dict(role='HUMAN', prompt='{problem}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer),
)

math_eval_cfg = dict(
    evaluator=dict(type=MATHAgentEvaluator),
    pred_postprocessor=dict(type=math_postprocess),
)

math_datasets = [
    dict(
        abbr='math-agent',
        type=MATHDataset,
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
opencompass/configs/datasets/math/math_evaluatorv2_gen_cecb31.py
ADDED
@@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.\nSolution:'),
            dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'),
            dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'),
            dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'),
            dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024))

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
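Unlike the plain-string prompt files, these chat templates are rendered with Python's `str.format`, so literal LaTeX braces must be doubled (`{{`/`}}`) while `{problem}` stays single and gets substituted. A quick illustration of the convention:

template = 'Solve $\\frac{{a}}{{b}}$ given: {problem}'
print(template.format(problem='a=2, b=4'))
# Solve $\frac{a}{b}$ given: a=2, b=4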
opencompass/configs/datasets/math/math_gen_1ed9c2.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt='Problem:\nFind the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.\nSolution:'),
            dict(role='BOT', prompt='Combine like terms to simplify the expression. The coefficient of $x^3$ is calculated as $(-3) + 3\cdot(2+1) + (-5)\cdot(-4) = 26$. Thus, the coefficient of $x^3$ is $\\boxed{26}$.\nFinal Answer: The final answer is $26$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nThe surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.\nSolution:'),
            dict(role='BOT', prompt='The surface area of a hemisphere (not including the base) is half that of a sphere, so it is $2\pi r^2$. The area of the base is $\pi r^2$. Therefore, for a hemisphere with radius 6 cm, the total surface area is $2\pi (6)^2 + \pi (6)^2 = 108\pi$ square cm.\nFinal Answer: The final answer is $108\pi$ square cm. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nMonica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.\nSolution:'),
            dict(role='BOT', prompt='The prime numbers rolled could be 2, 3, or 5, and each has a 1/6 chance of being rolled. The composite numbers 4 and 6 together have a 2/6 chance of being rolled, but they result in a $0 win. The remaining number, 1, is neither prime nor composite and results in a loss of $3, with a 1/6 chance. So, the expected winnings are $(2+3+5)(1/6)+0(2/6)+(-3)(1/6) = \$1.17$.\nFinal Answer: The final answer is $\$1.17$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nGiven $\mathbf{a} = \\begin{pmatrix} -7 \\ 0 \\ 1 \end{pmatrix}$ and $\mathbf{b} = \\begin{pmatrix} 4 \\ 2 \\ -1 \end{pmatrix},$ find $\mathbf{a} - 3 \mathbf{b}.$\nSolution:'),
            dict(role='BOT', prompt='We find $3 \mathbf{b}$ first, which is $\\begin{pmatrix} 12 \\ 6 \\ -3 \end{pmatrix}$. Then we subtract this vector from $\mathbf{a}$. So, $\mathbf{a} - 3 \mathbf{b} = \\begin{pmatrix} -7 - 12 \\ 0 - 6 \\ 1 - (-3) \end{pmatrix} = \\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}.$\nFinal Answer: The final answer is $\\begin{pmatrix} -19 \\ -6 \\ 4 \end{pmatrix}$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
opencompass/configs/datasets/math/math_gen_559593.py
ADDED
@@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='''Problem:
Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.
Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$
Solution:
We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$
Final Answer: The final answer is $24$. I hope it is correct.

Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.
Solution:
If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$
Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.

Problem:
{problem}
Solution:
{solution}'''),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
opencompass/configs/datasets/math/math_gen_5e8458.py
ADDED
@@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='''Problem:
Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.
Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$
Solution:
We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$
Final Answer: The final answer is $24$. I hope it is correct.

Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.
Solution:
If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$
Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.

Problem:
{problem}
Solution:
{solution}'''),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=dict(
            input_columns=['problem'],
            output_column='solution',
        ),
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
opencompass/configs/datasets/math/math_gen_78ced2.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess

QUERY_TEMPLATE = """
Solve the following math problem step by step. The last line of your response should be of the form ANSWER: $ANSWER (without quotes) where $ANSWER is the answer to the problem.

{problem}

Remember to put your answer on its own line after "ANSWER:", and you do not need to use a \\boxed command.
""".strip()

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt=QUERY_TEMPLATE),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
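This last variant asks for a closing `ANSWER: ...` line instead of a boxed expression. A minimal sketch of the matching extraction step (a hypothetical postprocessor for illustration; the config as written reuses `math_postprocess`):

import re

def extract_answer_line(text: str) -> str:
    # Return whatever follows the last 'ANSWER:' marker, or '' if absent.
    matches = re.findall(r'ANSWER:\s*(.+)', text)
    return matches[-1].strip() if matches else ''

print(extract_answer_line('... reasoning ...\nANSWER: 42'))  # 42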