Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py +4 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py +53 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl.py +4 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_4b16c0.py +65 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_9ef540.py +43 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_e53034.py +59 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py +4 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py +52 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py +4 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py +60 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py +44 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py +4 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py +304 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py +4 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py +356 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py +45 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py +44 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_prompts.py +191 -0
- opencompass/configs/datasets/GaokaoBench/README.md +191 -0
- opencompass/configs/datasets/XLSum/XLSum_gen.py +4 -0
- opencompass/configs/datasets/XLSum/XLSum_gen_2bb71c.py +29 -0
- opencompass/configs/datasets/bbh/README.md +250 -0
- opencompass/configs/datasets/bbh/bbh_gen.py +4 -0
- opencompass/configs/datasets/bbh/bbh_gen_2879b0.py +56 -0
- opencompass/configs/datasets/bbh/bbh_gen_4a31fa.py +99 -0
- opencompass/configs/datasets/bbh/bbh_gen_5b92b0.py +99 -0
- opencompass/configs/datasets/bbh/bbh_gen_5bf00b.py +99 -0
- opencompass/configs/datasets/bbh/bbh_gen_98fba6.py +90 -0
- opencompass/configs/datasets/bbh/bbh_subset_settings.py +29 -0
- opencompass/configs/datasets/cmmlu/cmmlu_0shot_cot_gen_305931.py +130 -0
- opencompass/configs/datasets/cmmlu/cmmlu_gen.py +4 -0
- opencompass/configs/datasets/cmmlu/cmmlu_gen_c13365.py +123 -0
- opencompass/configs/datasets/cmmlu/cmmlu_ppl.py +4 -0
- opencompass/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py +117 -0
- opencompass/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py +122 -0
- opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py +4 -0
- opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen_d380d0.py +50 -0
- opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl.py +4 -0
- opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl_971f48.py +52 -0
- opencompass/configs/datasets/demo/demo_cmmlu_base_ppl.py +8 -0
- opencompass/configs/datasets/demo/demo_cmmlu_chat_gen.py +8 -0
- opencompass/configs/datasets/demo/demo_gsm8k_base_gen.py +7 -0
- opencompass/configs/datasets/demo/demo_gsm8k_chat_gen.py +7 -0
- opencompass/configs/datasets/demo/demo_math_base_gen.py +7 -0
- opencompass/configs/datasets/demo/demo_math_chat_gen.py +7 -0
- opencompass/configs/datasets/gpqa/README.md +69 -0
- opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_4b5a83.py +49 -0
- opencompass/configs/datasets/gpqa/gpqa_gen.py +4 -0
- opencompass/configs/datasets/gpqa/gpqa_gen_015262.py +46 -0
- opencompass/configs/datasets/gpqa/gpqa_gen_4baadb.py +46 -0
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default generative ("gen") entry point for FewCLUE bustm: re-exports the
# dataset list from the pinned prompt-version file so callers can import a
# stable name without tracking prompt hashes.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_bustm_gen_634f41 import bustm_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AFQMCDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

# Reader: both sentences feed the prompt; gold answer comes from `label`.
# NOTE(review): test_split='train' — presumably each JSON file loads as a
# single 'train' split; confirm against the dataset loader.
bustm_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# Zero-shot generative setup: an A/B multiple-choice question; the model is
# expected to generate the answer letter.
bustm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?\nA. 无关\nB. 相关\n请从“A”,“B”中进行选择。\n答:',
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Accuracy on the first capital letter extracted from the BOT generation.
bustm_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

# Two evaluation splits: the few-shot dev pool and the public test set.
bustm_datasets = [
    dict(
        abbr='bustm-dev',
        type=AFQMCDatasetV2,  # bustm share the same format with AFQMC
        path='./data/FewCLUE/bustm/dev_few_all.json',
        local_mode=True,
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg,
    ),
    dict(
        abbr='bustm-test',
        type=AFQMCDatasetV2,  # bustm share the same format with AFQMC
        path='./data/FewCLUE/bustm/test_public.json',
        local_mode=True,
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg,
    ),
]
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default perplexity ("ppl") entry point for FewCLUE bustm: re-exports the
# dataset list from the pinned prompt-version file.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_bustm_ppl_e53034 import bustm_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_4b16c0.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

bustm_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring: each integer label maps to a full chat exchange whose BOT turn
# verbalizes that label; the lower-perplexity candidate wins.
bustm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0:
            dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt='请判断以下两句话说的是否是一个意思:')
                ],
                round=[
                    dict(role='HUMAN', prompt='{sentence1},{sentence2}'),
                    dict(role='BOT', prompt='两句话说的毫不相关。')
                ]),
            1:
            dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt='请判断以下两句话说的是否是一个意思:')
                ],
                round=[
                    dict(role='HUMAN', prompt='{sentence1},{sentence2}'),
                    # NOTE(review): '说是的' reads like a typo for '说的是';
                    # kept verbatim since changing it alters PPL scores.
                    dict(role='BOT', prompt='两句话说是的一个意思。')
                ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
bustm_datasets = [
    dict(
        type=HFDataset,
        abbr='bustm-dev',
        path='json',
        data_files='./data/FewCLUE/bustm/dev_few_all.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg),
    dict(
        type=HFDataset,
        abbr='bustm-test',
        path='json',
        data_files='./data/FewCLUE/bustm/test_public.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg)
]
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_9ef540.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

bustm_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring with plain-string templates (base-model style): one candidate
# string per label; the lower-perplexity candidate is the prediction.
bustm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0: '{sentence1}。\n{sentence2}。\n两句话说的毫不相关。',
            1: '{sentence1}。\n{sentence2}。\n两句话说的一个意思。'
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
bustm_datasets = [
    dict(
        type=HFDataset,
        abbr='bustm-dev',
        path='json',
        data_files='./data/FewCLUE/bustm/dev_few_all.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg),
    dict(
        type=HFDataset,
        abbr='bustm-test',
        path='json',
        data_files='./data/FewCLUE/bustm/test_public.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg)
]
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_e53034.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

bustm_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring with chat-style templates: the HUMAN turn is identical for both
# labels; only the BOT verbalization differs, so perplexity compares labels.
bustm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0:
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?'
                ),
                dict(role='BOT', prompt='两句话说的毫不相关。')
            ]),
            1:
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?'
                ),
                # NOTE(review): '说是的' reads like a typo for '说的是'; kept
                # verbatim since changing it alters PPL scores.
                dict(role='BOT', prompt='两句话说是的一个意思。')
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
bustm_datasets = [
    dict(
        type=HFDataset,
        abbr='bustm-dev',
        path='json',
        data_files='./data/FewCLUE/bustm/dev_few_all.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg),
    dict(
        type=HFDataset,
        abbr='bustm-test',
        path='json',
        data_files='./data/FewCLUE/bustm/test_public.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg)
]
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default generative ("gen") entry point for FewCLUE ocnli_fc: re-exports the
# dataset list from the pinned prompt-version file.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMNLIDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

# Reader: premise/hypothesis pair in, NLI label out.
ocnli_fc_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# Zero-shot generative setup: a three-way A/B/C multiple-choice question
# (对/错/可能 — entailment/contradiction/neutral).
ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                '阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}\nA. 对\nB. 错\nC. 可能\n请从“A”,“B”,“C”中进行选择。\n答:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)
# Accuracy on the first capital letter extracted from the BOT generation.
ocnli_fc_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

ocnli_fc_datasets = [
    dict(
        abbr='ocnli_fc-dev',
        type=CMNLIDatasetV2,  # ocnli_fc share the same format with cmnli
        path='./data/FewCLUE/ocnli/dev_few_all.json',
        local_mode=True,
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg,
    ),
    dict(
        abbr='ocnli_fc-test',
        type=CMNLIDatasetV2,  # ocnli_fc share the same format with cmnli
        path='./data/FewCLUE/ocnli/test_public.json',
        local_mode=True,
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg,
    ),
]
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default perplexity ("ppl") entry point for FewCLUE ocnli_fc: re-exports the
# dataset list from the pinned prompt-version file.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

ocnli_fc_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring with chat-style templates keyed by the string NLI label; the
# BOT turn verbalizes the label (错/对/可能).
# NOTE(review): the 'neutral' branch uses a different HUMAN prompt from the
# other two branches, which biases the perplexity comparison — looks
# intentional upstream but worth confirming.
ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
                dict(role='BOT', prompt='错')
            ]),
            'entailment':
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
                dict(role='BOT', prompt='对')
            ]),
            'neutral':
            dict(round=[
                dict(
                    role='HUMAN', prompt='如果{sentence1}为真,那么{sentence2}也为真吗?'),
                dict(role='BOT', prompt='可能')
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))
ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
ocnli_fc_datasets = [
    dict(
        type=HFDataset,
        abbr='ocnli_fc-dev',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/dev_few_all.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg),
    dict(
        type=HFDataset,
        abbr='ocnli_fc-test',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/test_public.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg)
]
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

ocnli_fc_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring with plain-string templates keyed by the string NLI label.
# NOTE(review): the 'neutral' candidate uses a different sentence frame (and
# no '答:' marker) than the other two; kept verbatim to preserve scores.
ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:错',
            'entailment': '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:对',
            'neutral': '如果{sentence1}为真,那么{sentence2}也为真吗?可能'
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))
ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
ocnli_fc_datasets = [
    dict(
        type=HFDataset,
        abbr='ocnli_fc-dev',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/dev_few_all.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg),
    dict(
        type=HFDataset,
        abbr='ocnli_fc-test',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/test_public.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg)
]
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default generative ("gen") entry point for GAOKAO-Bench: re-exports the
# dataset list from the pinned prompt-version file.
from mmengine.config import read_base

with read_base():
    from .GaokaoBench_gen_5cfe9e import GaokaoBench_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py
ADDED
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.datasets import GaokaoBenchDataset
|
5 |
+
|
6 |
+
|
7 |
+
# Prompt specs for GAOKAO-Bench multiple-choice questions. Each entry holds:
#   type          — question type; the build loop below selects the evaluator
#                   'GaokaoBenchEvaluator_<type>' from it
#   keyword       — JSON file stem under Multiple-choice_Questions/
#   prefix_prompt — instruction text prepended to the raw question
#   comment       — free-form note (only present on some entries)
_MCQ_prompts = [
    {
        'type': 'single_choice',
        'keyword': '2010-2022_Math_II_MCQs',
        'prefix_prompt':
        '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'single_choice',
        'keyword': '2010-2022_Math_I_MCQs',
        'prefix_prompt':
        '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
        'comment': ''
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2022_History_MCQs',
        'prefix_prompt':
        '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2022_Biology_MCQs',
        'prefix_prompt':
        '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2022_Political_Science_MCQs',
        'prefix_prompt':
        '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    # Physics allows multiple correct options, hence 'multi_choice'.
    {
        'type':
        'multi_choice',
        'keyword':
        '2010-2022_Physics_MCQs',
        'prefix_prompt':
        '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2022_Chemistry_MCQs',
        'prefix_prompt':
        '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2013_English_MCQs',
        'prefix_prompt':
        '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    # 'multi_question_choice' items bundle several sub-questions per prompt.
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_Chinese_Modern_Lit',
        'prefix_prompt':
        '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_English_Fill_in_Blanks',
        'prefix_prompt':
        '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'five_out_of_seven',
        'keyword':
        '2012-2022_English_Cloze_Test',
        'prefix_prompt':
        '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n'
    },
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_Geography_MCQs',
        'prefix_prompt':
        '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_English_Reading_Comp',
        'prefix_prompt':
        '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_Chinese_Lang_and_Usage_MCQs',
        'prefix_prompt':
        '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:'
    },
]
|
119 |
+
# Prompt specs for GAOKAO-Bench fill-in-the-blank ('cloze') questions; same
# entry schema as _MCQ_prompts, with files under Fill-in-the-blank_Questions/.
_FBQ_prompts = [{
    'type': 'cloze',
    'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
    'prefix_prompt':
    '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
    'comment': ''
}, {
    'type': 'cloze',
    'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
    'prefix_prompt':
    '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
    'comment': ''
}, {
    'type': 'cloze',
    'keyword':
    '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
    'prefix_prompt':
    '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
    'comment': ''
}, {
    'type': 'cloze',
    'keyword': '2014-2022_English_Language_Cloze_Passage',
    'prefix_prompt':
    '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
    'comment': ''
}]
|
145 |
+
# Prompt specs for GAOKAO-Bench open-ended questions; same entry schema as
# _MCQ_prompts, with files under Open-ended_Questions/.
#
# Fixes vs. the imported text: two prompts contained U+FFFD replacement
# characters from a broken UTF-8 import ('如果\ufffd\ufffd\ufffd止一道题' and
# '一\ufffd\ufffd\ufffd步思考'); restored to '如果不止一道题' / '一步步思考'
# to match the identical phrasing used by every other prompt in this file.
_OEQ_prompts = [
    {
        'type': 'subjective',
        'keyword': '2010-2022_Geography_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chemistry_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Math_I_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_History_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Biology_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Math_II_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Physics_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Political_Science_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'correction',
        'keyword': '2012-2022_English_Language_Error_Correction',
        # NOTE(review): '结合你你已有的知识' (doubled 你) is kept verbatim —
        # it appears to be an upstream typo, but changing the prompt would
        # change model behavior.
        'prefix_prompt':
        '请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
        'prefix_prompt':
        '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
        # NOTE(review): '(1)[答案】' mixes a half-width '[' with a full-width
        # '】' — kept verbatim (same in the two entries below).
        'prefix_prompt':
        '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
        'prefix_prompt':
        '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
        'prefix_prompt':
        '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword':
        '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    }
]
|
250 |
+
|
251 |
+
# Build one dataset config per prompt spec: the folder name picks the JSON
# subdirectory, and each spec contributes its keyword/prefix_prompt.
GaokaoBench_datasets = []
for _folder, _prompts in [
    ('Multiple-choice_Questions', _MCQ_prompts),
    ('Fill-in-the-blank_Questions', _FBQ_prompts),
    ('Open-ended_Questions', _OEQ_prompts),
]:
    for _p in _prompts:
        _reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        _infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                # Zero-shot generation: the spec's instruction text is simply
                # prepended to the raw question.
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _p['prefix_prompt'] + '{question}'
                    }]
                },
                'ice_token': '</E>'
            },
            'retriever': {
                'type': ZeroRetriever
            },
            'inferencer': {
                'type': GenInferencer,
                'max_out_len': 1024,
            }
        }
        _eval_cfg = {
            # The evaluator class is selected by question type, e.g.
            # 'GaokaoBenchEvaluator_single_choice'.
            'evaluator': {
                'type': 'GaokaoBenchEvaluator' + '_' + _p['type'],
            },
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        _dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + _p['keyword'],
            'path': _base_path,
            'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
            'name': _p['keyword'],
            'reader_cfg': _reader_cfg,
            'infer_cfg': _infer_cfg,
            'eval_cfg': _eval_cfg,
        }

        GaokaoBench_datasets.append(_dataset)

# Drop every underscore-prefixed helper from the module namespace so only
# GaokaoBench_datasets is exported by this config.
_temporary_variables = [k for k in globals() if k.startswith('_')]
for _t in _temporary_variables:
    del globals()[_t]
del _temporary_variables, _t
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .GaokaoBench_mixed_9af5ee import GaokaoBench_datasets # noqa: F401, F403
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py
ADDED
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
|
4 |
+
from opencompass.datasets import GaokaoBenchDataset
|
5 |
+
_MCQ_prompts = [
|
6 |
+
{
|
7 |
+
'type': 'single_choice',
|
8 |
+
'keyword': '2010-2022_Math_II_MCQs',
|
9 |
+
'prefix_prompt':
|
10 |
+
'请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
11 |
+
'comment': ''
|
12 |
+
},
|
13 |
+
{
|
14 |
+
'type': 'single_choice',
|
15 |
+
'keyword': '2010-2022_Math_I_MCQs',
|
16 |
+
'prefix_prompt':
|
17 |
+
'请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
18 |
+
'comment': ''
|
19 |
+
},
|
20 |
+
{
|
21 |
+
'type':
|
22 |
+
'single_choice',
|
23 |
+
'keyword':
|
24 |
+
'2010-2022_History_MCQs',
|
25 |
+
'prefix_prompt':
|
26 |
+
'请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
27 |
+
},
|
28 |
+
{
|
29 |
+
'type':
|
30 |
+
'single_choice',
|
31 |
+
'keyword':
|
32 |
+
'2010-2022_Biology_MCQs',
|
33 |
+
'prefix_prompt':
|
34 |
+
'请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
35 |
+
},
|
36 |
+
{
|
37 |
+
'type':
|
38 |
+
'single_choice',
|
39 |
+
'keyword':
|
40 |
+
'2010-2022_Political_Science_MCQs',
|
41 |
+
'prefix_prompt':
|
42 |
+
'请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
43 |
+
},
|
44 |
+
{
|
45 |
+
'type':
|
46 |
+
'multi_choice',
|
47 |
+
'keyword':
|
48 |
+
'2010-2022_Physics_MCQs',
|
49 |
+
'prefix_prompt':
|
50 |
+
'请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n'
|
51 |
+
},
|
52 |
+
{
|
53 |
+
'type':
|
54 |
+
'single_choice',
|
55 |
+
'keyword':
|
56 |
+
'2010-2022_Chemistry_MCQs',
|
57 |
+
'prefix_prompt':
|
58 |
+
'请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
59 |
+
},
|
60 |
+
{
|
61 |
+
'type':
|
62 |
+
'single_choice',
|
63 |
+
'keyword':
|
64 |
+
'2010-2013_English_MCQs',
|
65 |
+
'prefix_prompt':
|
66 |
+
'请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
67 |
+
},
|
68 |
+
{
|
69 |
+
'type':
|
70 |
+
'multi_question_choice',
|
71 |
+
'keyword':
|
72 |
+
'2010-2022_Chinese_Modern_Lit',
|
73 |
+
'prefix_prompt':
|
74 |
+
'请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
75 |
+
},
|
76 |
+
{
|
77 |
+
'type':
|
78 |
+
'multi_question_choice',
|
79 |
+
'keyword':
|
80 |
+
'2010-2022_English_Fill_in_Blanks',
|
81 |
+
'prefix_prompt':
|
82 |
+
'请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
83 |
+
},
|
84 |
+
{
|
85 |
+
'type':
|
86 |
+
'five_out_of_seven',
|
87 |
+
'keyword':
|
88 |
+
'2012-2022_English_Cloze_Test',
|
89 |
+
'prefix_prompt':
|
90 |
+
'请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n'
|
91 |
+
},
|
92 |
+
{
|
93 |
+
'type':
|
94 |
+
'multi_question_choice',
|
95 |
+
'keyword':
|
96 |
+
'2010-2022_Geography_MCQs',
|
97 |
+
'prefix_prompt':
|
98 |
+
'请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
99 |
+
},
|
100 |
+
{
|
101 |
+
'type':
|
102 |
+
'multi_question_choice',
|
103 |
+
'keyword':
|
104 |
+
'2010-2022_English_Reading_Comp',
|
105 |
+
'prefix_prompt':
|
106 |
+
'请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
107 |
+
},
|
108 |
+
{
|
109 |
+
'type':
|
110 |
+
'multi_question_choice',
|
111 |
+
'keyword':
|
112 |
+
'2010-2022_Chinese_Lang_and_Usage_MCQs',
|
113 |
+
'prefix_prompt':
|
114 |
+
'请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:'
|
115 |
+
},
|
116 |
+
]
|
117 |
+
_FBQ_prompts = [{
|
118 |
+
'type': 'cloze',
|
119 |
+
'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
|
120 |
+
'prefix_prompt':
|
121 |
+
'请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
122 |
+
'comment': ''
|
123 |
+
}, {
|
124 |
+
'type': 'cloze',
|
125 |
+
'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
|
126 |
+
'prefix_prompt':
|
127 |
+
'请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
128 |
+
'comment': ''
|
129 |
+
}, {
|
130 |
+
'type': 'cloze',
|
131 |
+
'keyword':
|
132 |
+
'2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
|
133 |
+
'prefix_prompt':
|
134 |
+
'请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
135 |
+
'comment': ''
|
136 |
+
}, {
|
137 |
+
'type': 'cloze',
|
138 |
+
'keyword': '2014-2022_English_Language_Cloze_Passage',
|
139 |
+
'prefix_prompt':
|
140 |
+
'请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
141 |
+
'comment': ''
|
142 |
+
}]
|
143 |
+
_OEQ_prompts = [
|
144 |
+
{
|
145 |
+
'type': 'subjective',
|
146 |
+
'keyword': '2010-2022_Geography_Open-ended_Questions',
|
147 |
+
'prefix_prompt':
|
148 |
+
'请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
149 |
+
'comment': ''
|
150 |
+
},
|
151 |
+
{
|
152 |
+
'type': 'subjective',
|
153 |
+
'keyword': '2010-2022_Chemistry_Open-ended_Questions',
|
154 |
+
'prefix_prompt':
|
155 |
+
'请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
156 |
+
'comment': ''
|
157 |
+
},
|
158 |
+
{
|
159 |
+
'type': 'subjective',
|
160 |
+
'keyword': '2010-2022_Math_I_Open-ended_Questions',
|
161 |
+
'prefix_prompt':
|
162 |
+
'请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
163 |
+
'comment': ''
|
164 |
+
},
|
165 |
+
{
|
166 |
+
'type': 'subjective',
|
167 |
+
'keyword': '2010-2022_History_Open-ended_Questions',
|
168 |
+
'prefix_prompt':
|
169 |
+
'请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
170 |
+
'comment': ''
|
171 |
+
},
|
172 |
+
{
|
173 |
+
'type': 'subjective',
|
174 |
+
'keyword': '2010-2022_Biology_Open-ended_Questions',
|
175 |
+
'prefix_prompt':
|
176 |
+
'请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
177 |
+
'comment': ''
|
178 |
+
},
|
179 |
+
{
|
180 |
+
'type': 'subjective',
|
181 |
+
'keyword': '2010-2022_Math_II_Open-ended_Questions',
|
182 |
+
'prefix_prompt':
|
183 |
+
'请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
184 |
+
'comment': ''
|
185 |
+
},
|
186 |
+
{
|
187 |
+
'type': 'subjective',
|
188 |
+
'keyword': '2010-2022_Physics_Open-ended_Questions',
|
189 |
+
'prefix_prompt':
|
190 |
+
'请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
191 |
+
'comment': ''
|
192 |
+
},
|
193 |
+
{
|
194 |
+
'type': 'subjective',
|
195 |
+
'keyword': '2010-2022_Political_Science_Open-ended_Questions',
|
196 |
+
'prefix_prompt':
|
197 |
+
'请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
198 |
+
'comment': ''
|
199 |
+
},
|
200 |
+
{
|
201 |
+
'type': 'correction',
|
202 |
+
'keyword': '2012-2022_English_Language_Error_Correction',
|
203 |
+
'prefix_prompt':
|
204 |
+
'请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方��请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
|
205 |
+
# "prefix_prompt": [
|
206 |
+
# "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
|
207 |
+
# "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
|
208 |
+
# ],
|
209 |
+
'comment': ''
|
210 |
+
},
|
211 |
+
{
|
212 |
+
'type': 'subjective',
|
213 |
+
'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
|
214 |
+
'prefix_prompt':
|
215 |
+
'请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
216 |
+
'comment': ''
|
217 |
+
},
|
218 |
+
{
|
219 |
+
'type': 'subjective',
|
220 |
+
'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
|
221 |
+
'prefix_prompt':
|
222 |
+
'请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
223 |
+
'comment': ''
|
224 |
+
},
|
225 |
+
{
|
226 |
+
'type': 'subjective',
|
227 |
+
'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
|
228 |
+
'prefix_prompt':
|
229 |
+
'请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
230 |
+
'comment': ''
|
231 |
+
},
|
232 |
+
{
|
233 |
+
'type': 'subjective',
|
234 |
+
'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
|
235 |
+
'prefix_prompt':
|
236 |
+
'请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
237 |
+
'comment': ''
|
238 |
+
},
|
239 |
+
{
|
240 |
+
'type': 'subjective',
|
241 |
+
'keyword':
|
242 |
+
'2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
|
243 |
+
'prefix_prompt':
|
244 |
+
'请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
245 |
+
'comment': ''
|
246 |
+
}
|
247 |
+
]
|
248 |
+
|
249 |
+
# Generative half of the mixed config: every prompt spec EXCEPT single-choice
# MCQs is evaluated with free-form generation; single-choice questions are
# handled by the perplexity-based loop further below.
GaokaoBench_datasets = []
for _folder, _prompts in [
    ('Multiple-choice_Questions', _MCQ_prompts),
    ('Fill-in-the-blank_Questions', _FBQ_prompts),
    ('Open-ended_Questions', _OEQ_prompts),
]:
    for _p in _prompts:
        if _p['type'] == 'single_choice':
            continue  # deferred to the PPL loop below
        _reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        _infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _p['prefix_prompt'] + '{question}'
                    }]
                },
                'ice_token': '</E>'
            },
            'retriever': {
                'type': ZeroRetriever
            },
            'inferencer': {
                'type': GenInferencer,
                'max_out_len': 1024,
            }
        }
        _eval_cfg = {
            'evaluator': {
                # Resolved by string name, e.g. 'GaokaoBenchEvaluator_cloze'.
                'type': 'GaokaoBenchEvaluator' + '_' + _p['type'],
            },
            'pred_role': 'BOT',
        }
        # BUGFIX: was './data/GAOKAO-BENCH/data', which disagreed with the
        # PPL loop later in this same file and with every sibling
        # GaokaoBench config; use one dataset root for the whole file.
        _base_path = 'opencompass/GAOKAO-BENCH'
        _dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + _p['keyword'],
            'path': _base_path,
            'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
            'name': _p['keyword'],
            'reader_cfg': _reader_cfg,
            'infer_cfg': _infer_cfg,
            'eval_cfg': _eval_cfg,
        }

        GaokaoBench_datasets.append(_dataset)
|
300 |
+
|
301 |
+
# Perplexity half of the mixed config: each single-choice MCQ gets one
# candidate completion per option letter; the option whose completion is most
# likely under the model wins.
_folder = 'Multiple-choice_Questions'
for _p in _MCQ_prompts:
    if _p['type'] != 'single_choice':
        continue
    _reader_cfg = {'input_columns': ['question'], 'output_column': 'answer'}
    _infer_cfg = {
        'ice_template': {
            'type': PromptTemplate,
            # One template per option letter; the gold 'answer' column selects
            # which completion should score lowest perplexity.
            'template': {
                answer: {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _p['prefix_prompt'] + '{question}'
                    }, {
                        'role': 'BOT',
                        'prompt': f'【答案】{answer} <eoa>'
                    }]
                }
                for answer in ['A', 'B', 'C', 'D']
            },
            'ice_token': '</E>'
        },
        'retriever': {'type': ZeroRetriever},
        'inferencer': {'type': PPLInferencer},
    }
    _eval_cfg = {
        'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + _p['type']},
        'pred_role': 'BOT',
    }
    _base_path = 'opencompass/GAOKAO-BENCH'
    GaokaoBench_datasets.append({
        'type': GaokaoBenchDataset,
        'abbr': 'GaokaoBench_' + _p['keyword'],
        'path': _base_path,
        'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
        'name': _p['keyword'],
        'reader_cfg': _reader_cfg,
        'infer_cfg': _infer_cfg,
        'eval_cfg': _eval_cfg,
    })

# Sweep every '_'-prefixed temporary so that only GaokaoBench_datasets
# remains exported from this config module.
_temporary_variables = [k for k in globals() if k.startswith('_')]
for _t in _temporary_variables:
    del globals()[_t]
del _temporary_variables, _t
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
3 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
4 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
5 |
+
from opencompass.datasets import GaokaoBenchDataset
|
6 |
+
from mmengine.config import read_base
|
7 |
+
|
8 |
+
with read_base():
|
9 |
+
from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
|
10 |
+
|
11 |
+
# Build one generation-based dataset config per MCQ / fill-in-the-blank
# prompt spec; subjective (open-ended) questions are intentionally excluded.
# NOTE: top-level variable names are kept as-is because mmengine configs
# export every module-level name.
GaokaoBench_datasets = []
for folder, prompts in [
    ('Multiple-choice_Questions', MCQ_prompts),
    ('Fill-in-the-blank_Questions', FBQ_prompts),
]:
    for p in prompts:
        reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        # Zero-shot: prefix instructions + raw question text.
                        'prompt': p['prefix_prompt'] + '{question}',
                    }],
                },
                'ice_token': '</E>',
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
        }
        eval_cfg = {
            # Evaluator class is looked up by string name per question type.
            'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + p['keyword'],
            'path': _base_path,
            'filename': '/' + folder + '/' + p['keyword'] + '.json',
            'name': p['keyword'],
            'reader_cfg': reader_cfg,
            'infer_cfg': infer_cfg,
            'eval_cfg': eval_cfg,
        }
        GaokaoBench_datasets.append(dataset)
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
3 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
4 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
5 |
+
from opencompass.datasets import GaokaoBenchDataset
|
6 |
+
from mmengine.config import read_base
|
7 |
+
|
8 |
+
with read_base():
|
9 |
+
from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
|
10 |
+
|
11 |
+
# Base-model (non-chat) variant: uses a plain string prompt_template instead
# of a chat-round ice_template. Subjective open-ended questions are excluded.
# NOTE: top-level variable names are kept as-is because mmengine configs
# export every module-level name.
GaokaoBench_datasets = []
for folder, prompts in [
    ('Multiple-choice_Questions', MCQ_prompts),
    ('Fill-in-the-blank_Questions', FBQ_prompts),
]:
    for p in prompts:
        reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        infer_cfg = {
            'prompt_template': {
                'type': PromptTemplate,
                # Prefix instructions followed by the raw question text.
                'template': p['prefix_prompt'] + '{question}',
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
        }
        eval_cfg = {
            # Evaluator class is looked up by string name per question type.
            'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + p['keyword'],
            'path': _base_path,
            'filename': '/' + folder + '/' + p['keyword'] + '.json',
            'name': p['keyword'],
            'reader_cfg': reader_cfg,
            'infer_cfg': infer_cfg,
            'eval_cfg': eval_cfg,
        }
        GaokaoBench_datasets.append(dataset)
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_prompts.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
MCQ_prompts = [
|
3 |
+
{
|
4 |
+
'type': 'single_choice',
|
5 |
+
'keyword': '2010-2022_Math_II_MCQs',
|
6 |
+
'prefix_prompt': '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
7 |
+
'comment': '',
|
8 |
+
},
|
9 |
+
{
|
10 |
+
'type': 'single_choice',
|
11 |
+
'keyword': '2010-2022_Math_I_MCQs',
|
12 |
+
'prefix_prompt': '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
13 |
+
'comment': '',
|
14 |
+
},
|
15 |
+
{
|
16 |
+
'type': 'single_choice',
|
17 |
+
'keyword': '2010-2022_History_MCQs',
|
18 |
+
'prefix_prompt': '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
19 |
+
},
|
20 |
+
{
|
21 |
+
'type': 'single_choice',
|
22 |
+
'keyword': '2010-2022_Biology_MCQs',
|
23 |
+
'prefix_prompt': '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
24 |
+
},
|
25 |
+
{
|
26 |
+
'type': 'single_choice',
|
27 |
+
'keyword': '2010-2022_Political_Science_MCQs',
|
28 |
+
'prefix_prompt': '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
29 |
+
},
|
30 |
+
{
|
31 |
+
'type': 'multi_choice',
|
32 |
+
'keyword': '2010-2022_Physics_MCQs',
|
33 |
+
'prefix_prompt': '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n',
|
34 |
+
},
|
35 |
+
{
|
36 |
+
'type': 'single_choice',
|
37 |
+
'keyword': '2010-2022_Chemistry_MCQs',
|
38 |
+
'prefix_prompt': '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
39 |
+
},
|
40 |
+
{
|
41 |
+
'type': 'single_choice',
|
42 |
+
'keyword': '2010-2013_English_MCQs',
|
43 |
+
'prefix_prompt': '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
44 |
+
},
|
45 |
+
{
|
46 |
+
'type': 'multi_question_choice',
|
47 |
+
'keyword': '2010-2022_Chinese_Modern_Lit',
|
48 |
+
'prefix_prompt': '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
|
49 |
+
},
|
50 |
+
{
|
51 |
+
'type': 'multi_question_choice',
|
52 |
+
'keyword': '2010-2022_English_Fill_in_Blanks',
|
53 |
+
'prefix_prompt': '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
|
54 |
+
},
|
55 |
+
{
|
56 |
+
'type': 'five_out_of_seven',
|
57 |
+
'keyword': '2012-2022_English_Cloze_Test',
|
58 |
+
'prefix_prompt': '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n',
|
59 |
+
},
|
60 |
+
{
|
61 |
+
'type': 'multi_question_choice',
|
62 |
+
'keyword': '2010-2022_Geography_MCQs',
|
63 |
+
'prefix_prompt': '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
|
64 |
+
},
|
65 |
+
{
|
66 |
+
'type': 'multi_question_choice',
|
67 |
+
'keyword': '2010-2022_English_Reading_Comp',
|
68 |
+
'prefix_prompt': '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
|
69 |
+
},
|
70 |
+
{
|
71 |
+
'type': 'multi_question_choice',
|
72 |
+
'keyword': '2010-2022_Chinese_Lang_and_Usage_MCQs',
|
73 |
+
'prefix_prompt': '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:',
|
74 |
+
},
|
75 |
+
]
|
76 |
+
FBQ_prompts = [
|
77 |
+
{
|
78 |
+
'type': 'cloze',
|
79 |
+
'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
|
80 |
+
'prefix_prompt': '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
81 |
+
'comment': '',
|
82 |
+
},
|
83 |
+
{
|
84 |
+
'type': 'cloze',
|
85 |
+
'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
|
86 |
+
'prefix_prompt': '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
87 |
+
'comment': '',
|
88 |
+
},
|
89 |
+
{
|
90 |
+
'type': 'cloze',
|
91 |
+
'keyword': '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
|
92 |
+
'prefix_prompt': '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
93 |
+
'comment': '',
|
94 |
+
},
|
95 |
+
{
|
96 |
+
'type': 'cloze',
|
97 |
+
'keyword': '2014-2022_English_Language_Cloze_Passage',
|
98 |
+
'prefix_prompt': '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
99 |
+
'comment': '',
|
100 |
+
},
|
101 |
+
]
|
102 |
+
OEQ_prompts = [
|
103 |
+
{
|
104 |
+
'type': 'subjective',
|
105 |
+
'keyword': '2010-2022_Geography_Open-ended_Questions',
|
106 |
+
'prefix_prompt': '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
107 |
+
'comment': '',
|
108 |
+
},
|
109 |
+
{
|
110 |
+
'type': 'subjective',
|
111 |
+
'keyword': '2010-2022_Chemistry_Open-ended_Questions',
|
112 |
+
'prefix_prompt': '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
113 |
+
'comment': '',
|
114 |
+
},
|
115 |
+
{
|
116 |
+
'type': 'subjective',
|
117 |
+
'keyword': '2010-2022_Math_I_Open-ended_Questions',
|
118 |
+
'prefix_prompt': '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
119 |
+
'comment': '',
|
120 |
+
},
|
121 |
+
{
|
122 |
+
'type': 'subjective',
|
123 |
+
'keyword': '2010-2022_History_Open-ended_Questions',
|
124 |
+
'prefix_prompt': '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
125 |
+
'comment': '',
|
126 |
+
},
|
127 |
+
{
|
128 |
+
'type': 'subjective',
|
129 |
+
'keyword': '2010-2022_Biology_Open-ended_Questions',
|
130 |
+
'prefix_prompt': '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
131 |
+
'comment': '',
|
132 |
+
},
|
133 |
+
{
|
134 |
+
'type': 'subjective',
|
135 |
+
'keyword': '2010-2022_Math_II_Open-ended_Questions',
|
136 |
+
'prefix_prompt': '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
137 |
+
'comment': '',
|
138 |
+
},
|
139 |
+
{
|
140 |
+
'type': 'subjective',
|
141 |
+
'keyword': '2010-2022_Physics_Open-ended_Questions',
|
142 |
+
'prefix_prompt': '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
143 |
+
'comment': '',
|
144 |
+
},
|
145 |
+
{
|
146 |
+
'type': 'subjective',
|
147 |
+
'keyword': '2010-2022_Political_Science_Open-ended_Questions',
|
148 |
+
'prefix_prompt': '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
149 |
+
'comment': '',
|
150 |
+
},
|
151 |
+
{
|
152 |
+
'type': 'correction',
|
153 |
+
'keyword': '2012-2022_English_Language_Error_Correction',
|
154 |
+
'prefix_prompt': '请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
|
155 |
+
# "prefix_prompt": [
|
156 |
+
# "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
|
157 |
+
# "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
|
158 |
+
# ],
|
159 |
+
'comment': '',
|
160 |
+
},
|
161 |
+
{
|
162 |
+
'type': 'subjective',
|
163 |
+
'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
|
164 |
+
'prefix_prompt': '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
165 |
+
'comment': '',
|
166 |
+
},
|
167 |
+
{
|
168 |
+
'type': 'subjective',
|
169 |
+
'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
|
170 |
+
'prefix_prompt': '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
171 |
+
'comment': '',
|
172 |
+
},
|
173 |
+
{
|
174 |
+
'type': 'subjective',
|
175 |
+
'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
|
176 |
+
'prefix_prompt': '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
177 |
+
'comment': '',
|
178 |
+
},
|
179 |
+
{
|
180 |
+
'type': 'subjective',
|
181 |
+
'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
|
182 |
+
'prefix_prompt': '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
183 |
+
'comment': '',
|
184 |
+
},
|
185 |
+
{
|
186 |
+
'type': 'subjective',
|
187 |
+
'keyword': '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
|
188 |
+
'prefix_prompt': '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
189 |
+
'comment': '',
|
190 |
+
},
|
191 |
+
]
|
opencompass/configs/datasets/GaokaoBench/README.md
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# GaokaoBench
|
2 |
+
|
3 |
+
```bash
|
4 |
+
python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
|
5 |
+
python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
|
6 |
+
```
|
7 |
+
|
8 |
+
## Base Models
|
9 |
+
|
10 |
+
| model | GaokaoBench |
|
11 |
+
|:------------------------:|--------------:|
|
12 |
+
| llama-7b-turbomind | 14.55 |
|
13 |
+
| llama-13b-turbomind | 16.20 |
|
14 |
+
| llama-30b-turbomind | 16.14 |
|
15 |
+
| llama-65b-turbomind | 13.31 |
|
16 |
+
| llama-2-7b-turbomind | 15.02 |
|
17 |
+
| llama-2-13b-turbomind | 14.86 |
|
18 |
+
| llama-2-70b-turbomind | 16.36 |
|
19 |
+
| llama-3-8b-turbomind | 20.88 |
|
20 |
+
| llama-3-70b-turbomind | 19.98 |
|
21 |
+
| internlm2-1.8b-turbomind | 23.78 |
|
22 |
+
| internlm2-7b-turbomind | 41.41 |
|
23 |
+
| internlm2-20b-turbomind | 58.99 |
|
24 |
+
| qwen-1.8b-turbomind | 22.11 |
|
25 |
+
| qwen-7b-turbomind | 35.32 |
|
26 |
+
| qwen-14b-turbomind | 54.07 |
|
27 |
+
| qwen-72b-turbomind | 77.56 |
|
28 |
+
| qwen1.5-0.5b-hf | 30.67 |
|
29 |
+
| qwen1.5-1.8b-hf | 35.66 |
|
30 |
+
| qwen1.5-4b-hf | 54.31 |
|
31 |
+
| qwen1.5-7b-hf | 65.99 |
|
32 |
+
| qwen1.5-14b-hf | 66.60 |
|
33 |
+
| qwen1.5-32b-hf | 79.01 |
|
34 |
+
| qwen1.5-72b-hf | 80.26 |
|
35 |
+
| qwen1.5-moe-a2-7b-hf | 52.79 |
|
36 |
+
| mistral-7b-v0.1-hf | 14.35 |
|
37 |
+
| mistral-7b-v0.2-hf | 11.10 |
|
38 |
+
| mixtral-8x7b-v0.1-hf | 8.40 |
|
39 |
+
| mixtral-8x22b-v0.1-hf | 16.23 |
|
40 |
+
| yi-6b-hf | 31.70 |
|
41 |
+
| yi-34b-hf | 30.51 |
|
42 |
+
| deepseek-7b-base-hf | 17.02 |
|
43 |
+
| deepseek-67b-base-hf | 10.14 |
|
44 |
+
|
45 |
+
### Details
|
46 |
+
|
47 |
+
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|
48 |
+
|:------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
|
49 |
+
| llama-7b-turbomind | 14.22 | 13.55 | 12.54 | 18.67 | 19.06 | 2.34 | 17.74 |
|
50 |
+
| llama-13b-turbomind | 18.81 | 15.89 | 21.25 | 22.67 | 15.62 | 1.56 | 25.81 |
|
51 |
+
| llama-30b-turbomind | 20.64 | 19.16 | 27.18 | 16.67 | 16.56 | 2.34 | 12.10 |
|
52 |
+
| llama-65b-turbomind | 21.10 | 15.89 | 11.50 | 20.00 | 5.94 | 1.56 | 21.77 |
|
53 |
+
| llama-2-7b-turbomind | 16.97 | 16.36 | 20.91 | 22.00 | 18.75 | 2.34 | 11.29 |
|
54 |
+
| llama-2-13b-turbomind | 14.68 | 11.68 | 26.13 | 16.00 | 17.81 | 2.34 | 20.97 |
|
55 |
+
| llama-2-70b-turbomind | 18.81 | 12.15 | 26.13 | 16.00 | 20.31 | 4.69 | 16.13 |
|
56 |
+
| llama-3-8b-turbomind | 4.13 | 7.94 | 37.63 | 24.67 | 26.25 | 5.47 | 21.77 |
|
57 |
+
| llama-3-70b-turbomind | 4.59 | 3.12 | 20.83 | 10.94 | 18.00 | 6.25 | 15.62 |
|
58 |
+
| internlm2-1.8b-turbomind | 20.64 | 22.90 | 39.72 | 30.00 | 25.94 | 10.94 | 31.45 |
|
59 |
+
| internlm2-7b-turbomind | 33.94 | 35.51 | 38.33 | 59.33 | 61.56 | 2.34 | 11.29 |
|
60 |
+
| internlm2-20b-turbomind | 59.17 | 51.40 | 65.16 | 74.00 | 82.19 | 28.91 | 54.03 |
|
61 |
+
| qwen-1.8b-turbomind | 29.36 | 30.84 | 19.51 | 26.00 | 22.19 | 5.47 | 27.42 |
|
62 |
+
| qwen-7b-turbomind | 22.48 | 28.04 | 45.64 | 43.33 | 62.19 | 3.91 | 33.87 |
|
63 |
+
| qwen-14b-turbomind | 54.13 | 56.25 | 82.93 | 72.00 | 85.00 | 4.69 | 65.62 |
|
64 |
+
| qwen-72b-turbomind | 73.12 | 64.49 | 91.67 | 90.62 | 58.75 | 44.53 | 79.03 |
|
65 |
+
| qwen1.5-0.5b-hf | 26.61 | 32.71 | 32.40 | 34.67 | 53.44 | 10.94 | 28.23 |
|
66 |
+
| qwen1.5-1.8b-hf | 36.24 | 33.18 | 56.45 | 36.00 | 49.38 | 6.25 | 33.06 |
|
67 |
+
| qwen1.5-4b-hf | 45.41 | 37.85 | 68.29 | 62.00 | 87.81 | 5.47 | 47.58 |
|
68 |
+
| qwen1.5-7b-hf | 56.42 | 53.74 | 85.02 | 69.33 | 86.88 | 28.12 | 70.16 |
|
69 |
+
| qwen1.5-14b-hf | 69.27 | 63.08 | 54.01 | 79.33 | 76.56 | 40.62 | 79.84 |
|
70 |
+
| qwen1.5-32b-hf | 71.10 | 61.68 | 92.68 | 93.33 | 95.94 | 45.31 | 83.06 |
|
71 |
+
| qwen1.5-72b-hf | 71.15 | 68.22 | 94.44 | 96.67 | 95.00 | 38.28 | 75.00 |
|
72 |
+
| qwen1.5-moe-a2-7b-hf | 35.32 | 29.44 | 68.64 | 44.67 | 75.00 | 17.97 | 59.68 |
|
73 |
+
| mistral-7b-v0.1-hf | 13.76 | 12.15 | 9.76 | 8.00 | 5.94 | 0.00 | 17.74 |
|
74 |
+
| mistral-7b-v0.2-hf | 6.88 | 5.61 | 10.45 | 12.00 | 4.06 | 0.78 | 14.52 |
|
75 |
+
| mixtral-8x7b-v0.1-hf | 3.67 | 1.87 | 0.35 | 0.00 | 0.00 | 0.78 | 0.81 |
|
76 |
+
| mixtral-8x22b-v0.1-hf | 16.51 | 15.89 | 1.39 | 3.33 | 9.69 | 0.00 | 13.71 |
|
77 |
+
| yi-6b-hf | 6.25 | 3.12 | 40.74 | 43.75 | 35.94 | 8.59 | 31.25 |
|
78 |
+
| yi-34b-hf | 12.50 | 4.17 | 31.11 | 5.00 | 20.62 | 2.34 | 0.89 |
|
79 |
+
| deepseek-7b-base-hf | 14.22 | 13.08 | 25.78 | 20.67 | 20.31 | 5.47 | 18.55 |
|
80 |
+
| deepseek-67b-base-hf | 3.67 | 4.21 | 8.36 | 7.33 | 4.69 | 1.56 | 4.84 |
|
81 |
+
|
82 |
+
| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|
83 |
+
|:------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
|
84 |
+
| llama-7b-turbomind | 19.05 | 0.00 | 15.00 | 16.15 | 22.11 | 10.43 | 15.00 |
|
85 |
+
| llama-13b-turbomind | 22.86 | 0.00 | 8.50 | 8.46 | 24.21 | 9.36 | 20.00 |
|
86 |
+
| llama-30b-turbomind | 28.57 | 0.00 | 6.33 | 13.85 | 23.16 | 12.98 | 12.50 |
|
87 |
+
| llama-65b-turbomind | 21.90 | 0.00 | 8.00 | 13.85 | 16.84 | 12.34 | 10.00 |
|
88 |
+
| llama-2-7b-turbomind | 20.95 | 0.00 | 6.17 | 12.31 | 22.11 | 11.28 | 11.25 |
|
89 |
+
| llama-2-13b-turbomind | 16.19 | 0.00 | 9.83 | 13.08 | 22.11 | 7.66 | 10.00 |
|
90 |
+
| llama-2-70b-turbomind | 31.43 | 0.00 | 4.17 | 13.08 | 25.26 | 20.43 | 7.50 |
|
91 |
+
| llama-3-8b-turbomind | 1.90 | 1.15 | 42.00 | 7.69 | 29.47 | 17.66 | 17.50 |
|
92 |
+
| llama-3-70b-turbomind | 18.75 | 3.45 | 53.67 | 76.15 | 18.60 | 36.76 | 8.75 |
|
93 |
+
| internlm2-1.8b-turbomind | 33.33 | 3.45 | 15.67 | 13.85 | 32.63 | 10.43 | 25.00 |
|
94 |
+
| internlm2-7b-turbomind | 61.90 | 20.69 | 57.33 | 20.77 | 61.05 | 40.21 | 47.50 |
|
95 |
+
| internlm2-20b-turbomind | 72.38 | 37.93 | 62.33 | 19.23 | 74.74 | 38.51 | 48.75 |
|
96 |
+
| qwen-1.8b-turbomind | 47.62 | 9.20 | 13.50 | 12.31 | 25.26 | 16.38 | 21.25 |
|
97 |
+
| qwen-7b-turbomind | 42.86 | 12.64 | 35.83 | 26.15 | 51.58 | 17.87 | 30.00 |
|
98 |
+
| qwen-14b-turbomind | 89.58 | 3.45 | 5.00 | 23.85 | 93.02 | 21.10 | 40.62 |
|
99 |
+
| qwen-72b-turbomind | 71.43 | 81.25 | 88.17 | 96.25 | 95.79 | 79.57 | 90.00 |
|
100 |
+
| qwen1.5-0.5b-hf | 40.95 | 22.99 | 21.67 | 21.54 | 38.95 | 17.02 | 22.50 |
|
101 |
+
| qwen1.5-1.8b-hf | 85.71 | 29.89 | 22.17 | 30.00 | 34.74 | 20.43 | 27.50 |
|
102 |
+
| qwen1.5-4b-hf | 88.57 | 35.63 | 41.00 | 67.69 | 64.21 | 41.28 | 68.75 |
|
103 |
+
| qwen1.5-7b-hf | 93.33 | 14.94 | 59.33 | 70.00 | 61.05 | 67.87 | 61.25 |
|
104 |
+
| qwen1.5-14b-hf | 94.29 | 16.09 | 59.67 | 76.92 | 90.53 | 59.57 | 77.50 |
|
105 |
+
| qwen1.5-32b-hf | 94.29 | 43.68 | 82.83 | 38.46 | 97.89 | 75.96 | 67.50 |
|
106 |
+
| qwen1.5-72b-hf | 99.05 | 28.74 | 85.62 | 77.69 | 94.74 | 72.77 | 87.50 |
|
107 |
+
| qwen1.5-moe-a2-7b-hf | 65.71 | 36.78 | 51.67 | 75.38 | 72.63 | 61.28 | 33.75 |
|
108 |
+
| mistral-7b-v0.1-hf | 17.14 | 8.05 | 28.33 | 6.92 | 24.21 | 30.43 | 12.50 |
|
109 |
+
| mistral-7b-v0.2-hf | 7.62 | 9.20 | 23.17 | 6.15 | 25.26 | 19.15 | 7.50 |
|
110 |
+
| mixtral-8x7b-v0.1-hf | 0.00 | 4.60 | 33.83 | 10.77 | 37.89 | 25.96 | 3.75 |
|
111 |
+
| mixtral-8x22b-v0.1-hf | 7.62 | 4.17 | 51.33 | 14.62 | 53.68 | 21.91 | 10.00 |
|
112 |
+
| yi-6b-hf | 17.14 | 52.87 | 50.83 | 36.25 | 36.84 | 48.09 | 36.25 |
|
113 |
+
| yi-34b-hf | 0.00 | 59.77 | 76.67 | 86.92 | 67.44 | 61.06 | 81.25 |
|
114 |
+
| deepseek-7b-base-hf | 20.95 | 2.30 | 17.83 | 12.31 | 25.26 | 12.55 | 8.75 |
|
115 |
+
| deepseek-67b-base-hf | 1.90 | 9.20 | 27.33 | 30.00 | 40.00 | 13.19 | 3.75 |
|
116 |
+
|
117 |
+
## Chat Models
|
118 |
+
|
119 |
+
| model | GaokaoBench |
|
120 |
+
|:-----------------------------:|--------------:|
|
121 |
+
| qwen1.5-0.5b-chat-hf | 21.51 |
|
122 |
+
| qwen1.5-1.8b-chat-hf | 46.19 |
|
123 |
+
| qwen1.5-4b-chat-hf | 59.11 |
|
124 |
+
| qwen1.5-7b-chat-hf | 70.55 |
|
125 |
+
| qwen1.5-14b-chat-hf | 80.39 |
|
126 |
+
| qwen1.5-32b-chat-hf | 86.15 |
|
127 |
+
| qwen1.5-72b-chat-hf | 88.58 |
|
128 |
+
| qwen1.5-110b-chat-hf | 89.59 |
|
129 |
+
| internlm2-chat-1.8b-hf | 29.73 |
|
130 |
+
| internlm2-chat-1.8b-sft-hf | 28.79 |
|
131 |
+
| internlm2-chat-7b-hf | 54.54 |
|
132 |
+
| internlm2-chat-7b-sft-hf | 55.39 |
|
133 |
+
| internlm2-chat-20b-hf | 57.95 |
|
134 |
+
| internlm2-chat-20b-sft-hf | 57.62 |
|
135 |
+
| llama-3-8b-instruct-hf | 45.48 |
|
136 |
+
| llama-3-70b-instruct-hf | 65.91 |
|
137 |
+
| llama-3-8b-instruct-lmdeploy | 44.48 |
|
138 |
+
| llama-3-70b-instruct-lmdeploy | 67.06 |
|
139 |
+
| mistral-7b-instruct-v0.1-hf | 26.21 |
|
140 |
+
| mistral-7b-instruct-v0.2-hf | 32.17 |
|
141 |
+
| mixtral-8x7b-instruct-v0.1-hf | 42.46 |
|
142 |
+
|
143 |
+
### Details
|
144 |
+
|
145 |
+
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|
146 |
+
|:-----------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
|
147 |
+
| qwen1.5-0.5b-chat-hf | 25.23 | 25.70 | 39.02 | 24.67 | 25.00 | 0.78 | 25.00 |
|
148 |
+
| qwen1.5-1.8b-chat-hf | 30.28 | 26.64 | 61.32 | 55.33 | 77.81 | 11.72 | 40.32 |
|
149 |
+
| qwen1.5-4b-chat-hf | 38.53 | 35.05 | 70.73 | 70.00 | 83.44 | 25.00 | 41.13 |
|
150 |
+
| qwen1.5-7b-chat-hf | 49.54 | 39.72 | 81.88 | 82.67 | 90.62 | 46.88 | 61.29 |
|
151 |
+
| qwen1.5-14b-chat-hf | 64.68 | 54.21 | 87.80 | 90.67 | 94.69 | 44.53 | 69.35 |
|
152 |
+
| qwen1.5-32b-chat-hf | 70.92 | 66.14 | 98.02 | 97.74 | 96.07 | 57.81 | 72.92 |
|
153 |
+
| qwen1.5-72b-chat-hf | 76.61 | 68.22 | 95.47 | 96.00 | 97.19 | 64.06 | 86.29 |
|
154 |
+
| qwen1.5-110b-chat-hf | 80.36 | 66.67 | 100.00 | 100.00 | 96.25 | 65.62 | 75.00 |
|
155 |
+
| internlm2-chat-1.8b-hf | 28.44 | 28.50 | 46.69 | 39.33 | 44.38 | 10.16 | 26.61 |
|
156 |
+
| internlm2-chat-1.8b-sft-hf | 23.85 | 20.09 | 55.75 | 40.67 | 53.12 | 14.84 | 30.65 |
|
157 |
+
| internlm2-chat-7b-hf | 45.87 | 42.52 | 77.70 | 75.33 | 76.56 | 16.41 | 38.71 |
|
158 |
+
| internlm2-chat-7b-sft-hf | 49.08 | 39.72 | 80.84 | 68.67 | 81.25 | 29.69 | 42.74 |
|
159 |
+
| internlm2-chat-20b-hf | 53.21 | 46.73 | 80.49 | 74.00 | 85.00 | 31.25 | 37.10 |
|
160 |
+
| internlm2-chat-20b-sft-hf | 51.83 | 47.20 | 86.06 | 78.00 | 88.12 | 35.16 | 45.16 |
|
161 |
+
| llama-3-8b-instruct-hf | 37.16 | 31.31 | 60.98 | 48.67 | 51.25 | 11.72 | 39.52 |
|
162 |
+
| llama-3-70b-instruct-hf | 58.26 | 52.34 | 63.76 | 75.33 | 75.31 | 36.72 | 53.23 |
|
163 |
+
| llama-3-8b-instruct-lmdeploy | 37.61 | 35.51 | 55.05 | 53.33 | 52.19 | 7.81 | 34.68 |
|
164 |
+
| llama-3-70b-instruct-lmdeploy | 75.00 | 55.56 | 61.11 | 73.68 | 70.00 | 40.62 | 43.75 |
|
165 |
+
| mistral-7b-instruct-v0.1-hf | 23.39 | 21.03 | 35.19 | 18.00 | 26.56 | 5.47 | 30.65 |
|
166 |
+
| mistral-7b-instruct-v0.2-hf | 31.19 | 19.63 | 38.33 | 40.00 | 35.94 | 20.31 | 34.68 |
|
167 |
+
| mixtral-8x7b-instruct-v0.1-hf | 41.28 | 37.85 | 52.26 | 47.33 | 50.00 | 25.78 | 43.55 |
|
168 |
+
|
169 |
+
| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|
170 |
+
|:-----------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
|
171 |
+
| qwen1.5-0.5b-chat-hf | 32.38 | 10.34 | 0.00 | 2.31 | 27.37 | 15.11 | 18.75 |
|
172 |
+
| qwen1.5-1.8b-chat-hf | 69.52 | 42.53 | 56.33 | 2.31 | 61.05 | 32.98 | 35.00 |
|
173 |
+
| qwen1.5-4b-chat-hf | 70.48 | 58.62 | 82.33 | 16.15 | 68.42 | 68.51 | 47.50 |
|
174 |
+
| qwen1.5-7b-chat-hf | 83.81 | 71.26 | 85.17 | 57.69 | 81.05 | 78.94 | 66.25 |
|
175 |
+
| qwen1.5-14b-chat-hf | 93.33 | 78.16 | 97.17 | 71.54 | 91.58 | 94.26 | 81.25 |
|
176 |
+
| qwen1.5-32b-chat-hf | 100.00 | 81.61 | 95.83 | 90.00 | 97.89 | 92.43 | 92.86 |
|
177 |
+
| qwen1.5-72b-chat-hf | 98.10 | 83.91 | 98.00 | 90.77 | 94.74 | 96.38 | 96.25 |
|
178 |
+
| qwen1.5-110b-chat-hf | 100.00 | 91.95 | 98.50 | 97.69 | 95.35 | 98.44 | 100.00 |
|
179 |
+
| internlm2-chat-1.8b-hf | 38.10 | 6.90 | 0.67 | 1.54 | 56.84 | 23.19 | 30.00 |
|
180 |
+
| internlm2-chat-1.8b-sft-hf | 50.48 | 0.00 | 0.00 | 0.00 | 27.37 | 11.91 | 32.50 |
|
181 |
+
| internlm2-chat-7b-hf | 60.95 | 67.82 | 7.00 | 7.69 | 70.53 | 79.79 | 38.75 |
|
182 |
+
| internlm2-chat-7b-sft-hf | 60.00 | 71.26 | 6.50 | 0.77 | 68.42 | 77.02 | 42.50 |
|
183 |
+
| internlm2-chat-20b-hf | 60.95 | 43.68 | 34.83 | 4.62 | 71.58 | 62.55 | 43.75 |
|
184 |
+
| internlm2-chat-20b-sft-hf | 75.24 | 47.13 | 1.00 | 2.31 | 80.00 | 65.96 | 37.50 |
|
185 |
+
| llama-3-8b-instruct-hf | 50.48 | 36.78 | 30.83 | 21.54 | 57.89 | 81.70 | 28.75 |
|
186 |
+
| llama-3-70b-instruct-hf | 73.33 | 59.77 | 82.83 | 24.62 | 73.68 | 91.28 | 45.00 |
|
187 |
+
| llama-3-8b-instruct-lmdeploy | 52.38 | 42.53 | 21.33 | 18.46 | 58.95 | 81.28 | 26.25 |
|
188 |
+
| llama-3-70b-instruct-lmdeploy | 87.50 | 62.07 | 84.38 | 26.92 | 72.63 | 91.20 | 56.25 |
|
189 |
+
| mistral-7b-instruct-v0.1-hf | 38.10 | 18.39 | 30.50 | 6.15 | 31.58 | 38.72 | 18.75 |
|
190 |
+
| mistral-7b-instruct-v0.2-hf | 41.90 | 31.03 | 28.00 | 20.77 | 29.47 | 42.13 | 15.00 |
|
191 |
+
| mixtral-8x7b-instruct-v0.1-hf | 49.52 | 39.08 | 41.33 | 9.23 | 44.21 | 43.19 | 21.25 |
|
opencompass/configs/datasets/XLSum/XLSum_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
# Entry-point config for the XLSum summarization benchmark.
# Selects the default prompt variant by re-exporting the dataset list from
# the hash-suffixed config file via mmengine's read_base mechanism.
from mmengine.config import read_base

# NOTE: read_base() parses these relative imports to compose configs — the
# import statement itself is the configuration, so it must stay literal.
with read_base():
    from .XLSum_gen_2bb71c import XLSum_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/XLSum/XLSum_gen_2bb71c.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# XLSum (csebuetnlp/xlsum) zero-shot summarization config: generate a summary
# from the article text and score it with ROUGE.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import RougeEvaluator
from opencompass.datasets import XLSUMDataset, Xsum_postprocess

# Dataset fields: the model reads 'text'; 'summary' is the gold reference.
XLSum_reader_cfg = dict(input_columns=['text'], output_column='summary')

# Inference: zero-shot (ZeroRetriever → no in-context examples), free-form
# generation from a single document-plus-instruction prompt.
XLSum_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        # Implicit string concatenation joins the two lines into one prompt.
        template='Document:{text}\n'
        'Based on the previous text, provide a brief single summary:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# Evaluation: ROUGE against the reference summary. The XSum post-processor is
# reused here — presumably it applies the same first-line/whitespace cleanup
# to predictions; TODO(review) confirm it suits XLSum output as well.
XLSum_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_postprocessor=dict(type=Xsum_postprocess),
)

# Exported dataset list consumed by the OpenCompass runner.
XLSum_datasets = [
    dict(
        type=XLSUMDataset,
        path='csebuetnlp/xlsum',  # Hugging Face Hub dataset id
        reader_cfg=XLSum_reader_cfg,
        infer_cfg=XLSum_infer_cfg,
        eval_cfg=XLSum_eval_cfg)
]
|
opencompass/configs/datasets/bbh/README.md
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# BBH
|
2 |
+
|
3 |
+
```bash
|
4 |
+
python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
|
5 |
+
python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
|
6 |
+
```
|
7 |
+
|
8 |
+
## Base Models
|
9 |
+
|
10 |
+
| model | bbh |
|
11 |
+
|:------------------------:|------:|
|
12 |
+
| llama-7b-turbomind | 33.34 |
|
13 |
+
| llama-13b-turbomind | 37.99 |
|
14 |
+
| llama-30b-turbomind | 49.86 |
|
15 |
+
| llama-65b-turbomind | 58.26 |
|
16 |
+
| llama-2-7b-turbomind | 38.27 |
|
17 |
+
| llama-2-13b-turbomind | 45.68 |
|
18 |
+
| llama-2-70b-turbomind | 64.78 |
|
19 |
+
| llama-3-8b-turbomind | 59.69 |
|
20 |
+
| llama-3-70b-turbomind | 79.16 |
|
21 |
+
| internlm2-1.8b-turbomind | 36.03 |
|
22 |
+
| internlm2-7b-turbomind | 63.56 |
|
23 |
+
| internlm2-20b-turbomind | 71.29 |
|
24 |
+
| qwen-1.8b-turbomind | 22.53 |
|
25 |
+
| qwen-7b-turbomind | 45.89 |
|
26 |
+
| qwen-14b-turbomind | 56.75 |
|
27 |
+
| qwen-72b-turbomind | 63.35 |
|
28 |
+
| qwen1.5-0.5b-hf | 20.54 |
|
29 |
+
| qwen1.5-1.8b-hf | 27.01 |
|
30 |
+
| qwen1.5-4b-hf | 34.81 |
|
31 |
+
| qwen1.5-7b-hf | 39.87 |
|
32 |
+
| qwen1.5-14b-hf | 50.38 |
|
33 |
+
| qwen1.5-32b-hf | 67.47 |
|
34 |
+
| qwen1.5-72b-hf | 58.81 |
|
35 |
+
| qwen1.5-moe-a2-7b-hf | 39.46 |
|
36 |
+
| mistral-7b-v0.1-hf | 56.71 |
|
37 |
+
| mistral-7b-v0.2-hf | 57.32 |
|
38 |
+
| mixtral-8x7b-v0.1-hf | 68.46 |
|
39 |
+
| mixtral-8x22b-v0.1-hf | 79.48 |
|
40 |
+
| yi-6b-hf | 44.82 |
|
41 |
+
| yi-34b-hf | 66.37 |
|
42 |
+
| deepseek-7b-base-hf | 42.88 |
|
43 |
+
| deepseek-67b-base-hf | 71.86 |
|
44 |
+
|
45 |
+
### Details
|
46 |
+
|
47 |
+
| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|
48 |
+
|:------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
|
49 |
+
| llama-7b-turbomind | 23.60 | 46.00 | 44.80 | 36.40 | 30.14 | 0.00 | 46.07 | 21.60 | 15.20 |
|
50 |
+
| llama-13b-turbomind | 16.80 | 50.00 | 56.80 | 36.40 | 43.15 | 0.00 | 60.67 | 29.20 | 15.20 |
|
51 |
+
| llama-30b-turbomind | 33.60 | 60.00 | 76.40 | 29.20 | 57.53 | 0.00 | 59.55 | 62.40 | 17.20 |
|
52 |
+
| llama-65b-turbomind | 84.00 | 76.00 | 84.40 | 50.00 | 65.75 | 0.00 | 62.92 | 69.60 | 31.60 |
|
53 |
+
| llama-2-7b-turbomind | 12.00 | 46.80 | 60.00 | 34.00 | 32.19 | 0.00 | 49.44 | 32.80 | 18.40 |
|
54 |
+
| llama-2-13b-turbomind | 24.00 | 40.80 | 73.20 | 36.00 | 45.89 | 0.00 | 55.06 | 37.60 | 22.40 |
|
55 |
+
| llama-2-70b-turbomind | 75.60 | 66.80 | 88.80 | 73.60 | 69.86 | 0.00 | 73.60 | 60.80 | 57.60 |
|
56 |
+
| llama-3-8b-turbomind | 65.60 | 42.00 | 78.80 | 56.80 | 69.86 | 0.00 | 56.18 | 66.00 | 30.80 |
|
57 |
+
| llama-3-70b-turbomind | 100.00 | 82.80 | 91.60 | 100.00 | 86.30 | 0.00 | 81.46 | 77.20 | 94.40 |
|
58 |
+
| internlm2-1.8b-turbomind | 31.20 | 44.00 | 60.00 | 36.00 | 35.62 | 0.00 | 44.94 | 27.20 | 12.80 |
|
59 |
+
| internlm2-7b-turbomind | 94.80 | 75.60 | 86.40 | 53.60 | 69.18 | 0.00 | 59.55 | 68.00 | 46.00 |
|
60 |
+
| internlm2-20b-turbomind | 98.40 | 83.60 | 84.00 | 72.00 | 71.92 | 0.00 | 81.46 | 78.40 | 74.40 |
|
61 |
+
| qwen-1.8b-turbomind | 26.40 | 39.60 | 33.20 | 28.40 | 28.08 | 0.00 | 44.94 | 21.60 | 12.40 |
|
62 |
+
| qwen-7b-turbomind | 38.80 | 42.80 | 64.40 | 30.80 | 45.89 | 0.00 | 55.62 | 44.00 | 14.40 |
|
63 |
+
| qwen-14b-turbomind | 57.60 | 59.20 | 67.20 | 46.40 | 67.12 | 0.00 | 51.12 | 63.60 | 30.40 |
|
64 |
+
| qwen-72b-turbomind | 72.00 | 66.80 | 77.60 | 81.20 | 84.93 | 0.00 | 78.09 | 67.20 | 63.60 |
|
65 |
+
| qwen1.5-0.5b-hf | 15.20 | 37.20 | 20.40 | 30.40 | 18.49 | 8.40 | 44.94 | 11.20 | 14.00 |
|
66 |
+
| qwen1.5-1.8b-hf | 27.60 | 40.80 | 36.00 | 24.40 | 32.19 | 0.00 | 50.56 | 20.80 | 11.20 |
|
67 |
+
| qwen1.5-4b-hf | 10.40 | 44.40 | 47.20 | 36.80 | 44.52 | 24.80 | 46.63 | 20.80 | 14.80 |
|
68 |
+
| qwen1.5-7b-hf | 37.20 | 42.40 | 52.00 | 52.40 | 56.85 | 6.80 | 48.31 | 23.60 | 18.40 |
|
69 |
+
| qwen1.5-14b-hf | 38.80 | 62.80 | 73.60 | 24.80 | 69.86 | 26.80 | 66.29 | 52.80 | 2.00 |
|
70 |
+
| qwen1.5-32b-hf | 93.60 | 77.20 | 68.40 | 70.00 | 82.88 | 36.80 | 47.75 | 70.40 | 71.20 |
|
71 |
+
| qwen1.5-72b-hf | 75.60 | 66.00 | 78.80 | 72.80 | 80.82 | 0.00 | 75.84 | 64.80 | 44.40 |
|
72 |
+
| qwen1.5-moe-a2-7b-hf | 23.20 | 59.60 | 43.20 | 27.60 | 46.58 | 25.20 | 48.88 | 16.80 | 13.20 |
|
73 |
+
| mistral-7b-v0.1-hf | 73.60 | 53.60 | 76.40 | 45.20 | 56.85 | 28.00 | 64.04 | 66.00 | 21.60 |
|
74 |
+
| mistral-7b-v0.2-hf | 76.80 | 42.00 | 73.20 | 47.20 | 60.27 | 26.00 | 66.85 | 60.80 | 26.40 |
|
75 |
+
| mixtral-8x7b-v0.1-hf | 89.60 | 70.80 | 84.80 | 81.20 | 70.55 | 25.60 | 66.29 | 71.20 | 58.80 |
|
76 |
+
| mixtral-8x22b-v0.1-hf | 98.80 | 77.60 | 92.00 | 98.80 | 83.56 | 35.60 | 80.34 | 79.20 | 82.00 |
|
77 |
+
| yi-6b-hf | 32.80 | 46.40 | 64.40 | 34.40 | 47.26 | 28.80 | 60.11 | 45.60 | 14.00 |
|
78 |
+
| yi-34b-hf | 86.00 | 76.00 | 84.80 | 54.80 | 67.81 | 24.80 | 73.60 | 66.00 | 65.60 |
|
79 |
+
| deepseek-7b-base-hf | 27.60 | 42.00 | 64.40 | 31.20 | 40.41 | 33.60 | 52.25 | 46.00 | 13.20 |
|
80 |
+
| deepseek-67b-base-hf | 95.60 | 75.60 | 86.40 | 86.40 | 76.71 | 39.20 | 76.40 | 77.20 | 82.00 |
|
81 |
+
|
82 |
+
| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|
83 |
+
|:------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
|
84 |
+
| llama-7b-turbomind | 18.40 | 42.80 | 58.00 | 23.20 | 13.20 | 40.00 | 16.40 | 30.40 | 0.00 |
|
85 |
+
| llama-13b-turbomind | 16.00 | 48.80 | 53.60 | 30.40 | 16.40 | 61.60 | 11.20 | 44.80 | 0.80 |
|
86 |
+
| llama-30b-turbomind | 22.40 | 66.40 | 73.20 | 43.60 | 31.60 | 84.40 | 43.60 | 57.60 | 2.80 |
|
87 |
+
| llama-65b-turbomind | 41.60 | 79.20 | 74.40 | 48.40 | 39.20 | 91.20 | 40.40 | 67.20 | 20.00 |
|
88 |
+
| llama-2-7b-turbomind | 17.20 | 54.80 | 51.60 | 32.80 | 23.60 | 74.40 | 19.60 | 45.60 | 1.20 |
|
89 |
+
| llama-2-13b-turbomind | 23.20 | 63.60 | 52.40 | 46.00 | 42.00 | 68.00 | 21.60 | 62.00 | 2.00 |
|
90 |
+
| llama-2-70b-turbomind | 72.40 | 86.40 | 84.40 | 55.20 | 43.20 | 95.60 | 50.80 | 76.80 | 20.80 |
|
91 |
+
| llama-3-8b-turbomind | 40.80 | 76.40 | 93.20 | 45.20 | 36.80 | 88.80 | 53.60 | 72.80 | 30.80 |
|
92 |
+
| llama-3-70b-turbomind | 99.20 | 94.00 | 98.00 | 58.40 | 42.80 | 93.60 | 63.60 | 88.40 | 79.20 |
|
93 |
+
| internlm2-1.8b-turbomind | 16.80 | 47.60 | 63.60 | 21.60 | 12.00 | 69.20 | 16.80 | 45.20 | 5.60 |
|
94 |
+
| internlm2-7b-turbomind | 51.20 | 78.80 | 90.40 | 52.00 | 41.20 | 95.60 | 58.80 | 74.40 | 44.40 |
|
95 |
+
| internlm2-20b-turbomind | 81.20 | 95.60 | 83.60 | 62.40 | 48.00 | 94.80 | 57.60 | 75.60 | 72.80 |
|
96 |
+
| qwen-1.8b-turbomind | 14.80 | 35.60 | 51.20 | 22.40 | 15.20 | 31.20 | 12.40 | 22.00 | 3.20 |
|
97 |
+
| qwen-7b-turbomind | 20.80 | 54.80 | 76.00 | 37.60 | 27.60 | 74.80 | 41.20 | 57.60 | 23.60 |
|
98 |
+
| qwen-14b-turbomind | 35.60 | 81.20 | 78.40 | 45.20 | 40.80 | 80.00 | 44.80 | 70.40 | 65.60 |
|
99 |
+
| qwen-72b-turbomind | 66.40 | 89.20 | 90.40 | 60.00 | 50.80 | 81.60 | 56.40 | 88.00 | 70.40 |
|
100 |
+
| qwen1.5-0.5b-hf | 20.00 | 34.80 | 46.80 | 18.80 | 15.60 | 24.40 | 15.20 | 16.00 | 1.20 |
|
101 |
+
| qwen1.5-1.8b-hf | 18.00 | 32.80 | 66.00 | 18.80 | 11.20 | 24.80 | 13.60 | 27.60 | 4.80 |
|
102 |
+
| qwen1.5-4b-hf | 18.40 | 56.40 | 56.80 | 30.00 | 20.80 | 40.80 | 46.80 | 44.80 | 41.20 |
|
103 |
+
| qwen1.5-7b-hf | 32.40 | 58.40 | 67.20 | 36.00 | 28.00 | 62.80 | 49.20 | 60.40 | 48.00 |
|
104 |
+
| qwen1.5-14b-hf | 7.20 | 78.40 | 75.20 | 41.20 | 27.60 | 74.40 | 46.00 | 81.60 | 8.00 |
|
105 |
+
| qwen1.5-32b-hf | 71.60 | 88.40 | 97.60 | 58.80 | 46.40 | 68.00 | 51.60 | 88.40 | 66.80 |
|
106 |
+
| qwen1.5-72b-hf | 61.20 | 88.40 | 96.00 | 60.40 | 49.20 | 86.40 | 34.80 | 86.80 | 53.60 |
|
107 |
+
| qwen1.5-moe-a2-7b-hf | 22.80 | 49.20 | 68.00 | 28.40 | 22.40 | 58.40 | 40.80 | 42.00 | 33.60 |
|
108 |
+
| mistral-7b-v0.1-hf | 30.40 | 79.60 | 70.80 | 54.40 | 42.80 | 77.60 | 47.20 | 70.00 | 30.40 |
|
109 |
+
| mistral-7b-v0.2-hf | 32.80 | 74.00 | 77.60 | 48.00 | 40.40 | 84.00 | 49.20 | 76.00 | 35.20 |
|
110 |
+
| mixtral-8x7b-v0.1-hf | 66.80 | 86.00 | 94.80 | 50.40 | 40.40 | 86.40 | 53.20 | 82.80 | 60.80 |
|
111 |
+
| mixtral-8x22b-v0.1-hf | 87.60 | 95.20 | 99.60 | 70.00 | 54.00 | 95.20 | 58.40 | 95.20 | 82.00 |
|
112 |
+
| yi-6b-hf | 17.20 | 49.20 | 72.40 | 34.40 | 28.00 | 76.80 | 32.40 | 56.80 | 9.20 |
|
113 |
+
| yi-34b-hf | 67.20 | 85.60 | 79.60 | 49.20 | 39.60 | 86.80 | 56.00 | 81.20 | 33.20 |
|
114 |
+
| deepseek-7b-base-hf | 17.60 | 51.20 | 72.40 | 28.80 | 20.00 | 78.40 | 28.80 | 46.80 | 1.60 |
|
115 |
+
| deepseek-67b-base-hf | 82.40 | 90.00 | 78.80 | 60.40 | 44.80 | 88.80 | 56.80 | 86.40 | 38.00 |
|
116 |
+
|
117 |
+
| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|
118 |
+
|:------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
|
119 |
+
| llama-7b-turbomind | 45.20 | 1.60 | 8.40 | 81.60 | 66.00 | 47.20 | 46.00 | 40.64 | 57.20 |
|
120 |
+
| llama-13b-turbomind | 59.20 | 0.80 | 14.40 | 76.40 | 69.20 | 46.40 | 47.20 | 53.48 | 66.80 |
|
121 |
+
| llama-30b-turbomind | 64.80 | 2.40 | 17.20 | 93.60 | 78.40 | 71.20 | 43.20 | 55.61 | 98.40 |
|
122 |
+
| llama-65b-turbomind | 72.40 | 6.80 | 21.60 | 98.80 | 81.60 | 70.00 | 40.80 | 55.61 | 99.60 |
|
123 |
+
| llama-2-7b-turbomind | 54.40 | 1.20 | 10.80 | 88.80 | 68.40 | 49.20 | 48.40 | 52.41 | 53.20 |
|
124 |
+
| llama-2-13b-turbomind | 74.40 | 2.80 | 18.80 | 97.60 | 74.40 | 52.80 | 46.40 | 54.55 | 96.00 |
|
125 |
+
| llama-2-70b-turbomind | 82.40 | 13.60 | 30.40 | 98.40 | 81.60 | 83.20 | 43.60 | 63.64 | 100.00 |
|
126 |
+
| llama-3-8b-turbomind | 90.00 | 9.20 | 38.80 | 95.20 | 87.60 | 84.80 | 51.20 | 50.27 | 100.00 |
|
127 |
+
| llama-3-70b-turbomind | 96.80 | 48.40 | 48.80 | 99.60 | 92.40 | 99.60 | 62.40 | 58.29 | 100.00 |
|
128 |
+
| internlm2-1.8b-turbomind | 64.40 | 0.40 | 3.20 | 66.40 | 54.00 | 50.00 | 49.20 | 48.13 | 46.80 |
|
129 |
+
| internlm2-7b-turbomind | 78.80 | 2.40 | 35.20 | 95.60 | 85.60 | 75.60 | 48.00 | 63.10 | 92.00 |
|
130 |
+
| internlm2-20b-turbomind | 88.80 | 15.60 | 36.00 | 96.80 | 88.80 | 76.00 | 50.40 | 56.68 | 100.00 |
|
131 |
+
| qwen-1.8b-turbomind | 50.00 | 0.00 | 0.80 | 62.80 | 29.20 | 2.40 | 6.00 | 12.83 | 1.60 |
|
132 |
+
| qwen-7b-turbomind | 62.80 | 1.60 | 18.00 | 81.60 | 75.20 | 68.80 | 50.00 | 63.64 | 66.80 |
|
133 |
+
| qwen-14b-turbomind | 75.60 | 1.20 | 26.80 | 88.80 | 80.40 | 74.40 | 50.00 | 53.48 | 96.80 |
|
134 |
+
| qwen-72b-turbomind | 56.00 | 14.40 | 35.20 | 87.60 | 91.60 | 81.60 | 5.60 | 31.55 | 62.40 |
|
135 |
+
| qwen1.5-0.5b-hf | 25.60 | 0.00 | 0.40 | 41.60 | 51.60 | 16.80 | 4.40 | 1.07 | 20.00 |
|
136 |
+
| qwen1.5-1.8b-hf | 55.60 | 0.00 | 1.60 | 63.60 | 55.20 | 47.60 | 4.40 | 28.88 | 11.20 |
|
137 |
+
| qwen1.5-4b-hf | 61.60 | 0.40 | 8.80 | 0.80 | 76.00 | 54.40 | 0.80 | 28.34 | 62.40 |
|
138 |
+
| qwen1.5-7b-hf | 63.60 | 2.40 | 20.80 | 72.40 | 69.60 | 26.80 | 0.00 | 40.64 | 0.00 |
|
139 |
+
| qwen1.5-14b-hf | 82.40 | 1.20 | 27.60 | 78.40 | 87.20 | 48.00 | 54.00 | 24.06 | 100.00 |
|
140 |
+
| qwen1.5-32b-hf | 86.80 | 5.60 | 36.80 | 90.00 | 86.40 | 66.40 | 35.60 | 62.57 | 95.60 |
|
141 |
+
| qwen1.5-72b-hf | 48.40 | 13.20 | 34.40 | 87.60 | 8.00 | 67.60 | 13.60 | 39.57 | 99.60 |
|
142 |
+
| qwen1.5-moe-a2-7b-hf | 56.80 | 2.00 | 8.80 | 79.60 | 73.60 | 66.80 | 4.00 | 53.48 | 50.40 |
|
143 |
+
| mistral-7b-v0.1-hf | 73.60 | 4.00 | 26.40 | 97.20 | 82.00 | 67.60 | 43.20 | 48.66 | 100.00 |
|
144 |
+
| mistral-7b-v0.2-hf | 72.80 | 4.00 | 30.40 | 97.20 | 81.20 | 66.80 | 46.00 | 52.41 | 100.00 |
| mixtral-8x7b-v0.1-hf | 85.60 | 18.80 | 33.60 | 98.00 | 90.80 | 85.20 | 49.60 | 55.61 | 90.80 |
| mixtral-8x22b-v0.1-hf | 92.80 | 51.60 | 40.00 | 98.40 | 91.60 | 95.60 | 54.80 | 56.15 | 100.00 |
| yi-6b-hf | 66.40 | 1.20 | 16.00 | 92.80 | 59.60 | 53.20 | 53.20 | 52.41 | 65.20 |
| yi-34b-hf | 81.20 | 18.80 | 36.40 | 97.60 | 85.60 | 84.00 | 51.20 | 59.89 | 99.60 |
| deepseek-7b-base-hf | 59.20 | 3.20 | 6.40 | 92.00 | 73.20 | 49.60 | 50.80 | 52.41 | 74.80 |
| deepseek-67b-base-hf | 85.20 | 30.00 | 33.20 | 99.60 | 84.80 | 82.40 | 46.80 | 56.68 | 99.60 |

## Chat Models

| model | bbh |
|:-----------------------------:|------:|
| qwen1.5-0.5b-chat-hf | 24.12 |
| qwen1.5-1.8b-chat-hf | 26.82 |
| qwen1.5-4b-chat-hf | 43.15 |
| qwen1.5-7b-chat-hf | 38.12 |
| qwen1.5-14b-chat-hf | 55.38 |
| qwen1.5-32b-chat-hf | 69.28 |
| qwen1.5-72b-chat-hf | 72.97 |
| qwen1.5-110b-chat-hf | 71.04 |
| internlm2-chat-1.8b-hf | 37.69 |
| internlm2-chat-1.8b-sft-hf | 37.12 |
| internlm2-chat-7b-hf | 57.83 |
| internlm2-chat-7b-sft-hf | 57.19 |
| internlm2-chat-20b-hf | 68.24 |
| internlm2-chat-20b-sft-hf | 69.38 |
| llama-3-8b-instruct-hf | 52.85 |
| llama-3-70b-instruct-hf | 82.42 |
| llama-3-8b-instruct-lmdeploy | 53.54 |
| llama-3-70b-instruct-lmdeploy | 82.58 |
| mistral-7b-instruct-v0.1-hf | 32.88 |
| mistral-7b-instruct-v0.2-hf | 48.84 |
| mixtral-8x7b-instruct-v0.1-hf | 59.64 |

### Details

| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|:-----------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
| qwen1.5-0.5b-chat-hf | 25.60 | 42.00 | 20.00 | 31.20 | 15.07 | 14.40 | 46.07 | 24.80 | 13.20 |
| qwen1.5-1.8b-chat-hf | 28.80 | 36.00 | 30.40 | 35.20 | 19.18 | 7.60 | 46.63 | 24.00 | 9.60 |
| qwen1.5-4b-chat-hf | 8.00 | 56.00 | 64.80 | 28.40 | 48.63 | 19.60 | 60.67 | 34.00 | 14.40 |
| qwen1.5-7b-chat-hf | 39.60 | 37.60 | 62.40 | 36.80 | 60.96 | 30.80 | 54.49 | 38.00 | 20.00 |
| qwen1.5-14b-chat-hf | 61.60 | 63.60 | 70.00 | 54.00 | 74.66 | 33.60 | 67.42 | 61.20 | 35.60 |
| qwen1.5-32b-chat-hf | 94.40 | 77.60 | 78.00 | 66.00 | 93.84 | 46.00 | 82.58 | 73.60 | 61.60 |
| qwen1.5-72b-chat-hf | 70.40 | 72.40 | 84.40 | 67.20 | 89.73 | 52.00 | 79.21 | 86.40 | 68.80 |
| qwen1.5-110b-chat-hf | 74.80 | 71.20 | 82.80 | 74.80 | 89.04 | 48.00 | 90.45 | 87.60 | 73.60 |
| internlm2-chat-1.8b-hf | 35.60 | 52.40 | 48.80 | 29.60 | 39.73 | 24.40 | 51.69 | 27.20 | 13.20 |
| internlm2-chat-1.8b-sft-hf | 37.20 | 53.60 | 44.00 | 30.00 | 34.93 | 22.40 | 56.74 | 28.00 | 12.00 |
| internlm2-chat-7b-hf | 72.00 | 66.40 | 73.60 | 65.20 | 60.27 | 50.00 | 62.92 | 52.40 | 44.40 |
| internlm2-chat-7b-sft-hf | 67.20 | 66.80 | 58.00 | 63.20 | 48.63 | 45.60 | 64.04 | 59.60 | 42.80 |
| internlm2-chat-20b-hf | 80.40 | 76.00 | 77.60 | 88.80 | 78.08 | 36.40 | 71.91 | 71.60 | 77.20 |
| internlm2-chat-20b-sft-hf | 80.00 | 70.80 | 78.00 | 87.60 | 82.88 | 41.20 | 76.40 | 72.80 | 71.60 |
| llama-3-8b-instruct-hf | 70.40 | 42.80 | 28.40 | 81.20 | 13.01 | 49.20 | 44.94 | 73.20 | 42.40 |
| llama-3-70b-instruct-hf | 100.00 | 84.00 | 91.60 | 95.60 | 78.08 | 52.40 | 87.08 | 89.60 | 97.60 |
| llama-3-8b-instruct-lmdeploy | 73.20 | 45.60 | 34.00 | 79.60 | 31.51 | 48.40 | 47.75 | 76.80 | 47.60 |
| llama-3-70b-instruct-lmdeploy | 100.00 | 84.00 | 90.00 | 96.80 | 83.56 | 56.00 | 87.08 | 89.20 | 97.20 |
| mistral-7b-instruct-v0.1-hf | 32.00 | 22.40 | 52.40 | 35.20 | 30.82 | 23.20 | 38.76 | 46.00 | 18.40 |
| mistral-7b-instruct-v0.2-hf | 66.00 | 58.40 | 50.40 | 48.40 | 48.63 | 37.20 | 65.73 | 40.40 | 29.20 |
| mixtral-8x7b-instruct-v0.1-hf | 63.20 | 68.40 | 65.20 | 60.00 | 78.08 | 40.40 | 74.16 | 64.00 | 46.00 |

| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|:-----------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
| qwen1.5-0.5b-chat-hf | 20.40 | 34.40 | 51.60 | 21.20 | 13.20 | 26.00 | 20.80 | 17.20 | 1.20 |
| qwen1.5-1.8b-chat-hf | 18.00 | 34.80 | 48.40 | 21.20 | 16.40 | 34.80 | 24.00 | 28.80 | 4.40 |
| qwen1.5-4b-chat-hf | 19.20 | 56.80 | 65.20 | 36.40 | 35.60 | 51.60 | 40.40 | 55.20 | 29.20 |
| qwen1.5-7b-chat-hf | 31.60 | 58.80 | 53.20 | 35.60 | 27.20 | 56.00 | 44.80 | 62.00 | 50.00 |
| qwen1.5-14b-chat-hf | 43.20 | 75.20 | 52.80 | 52.40 | 50.80 | 76.40 | 48.80 | 83.60 | 65.20 |
| qwen1.5-32b-chat-hf | 68.40 | 84.00 | 81.20 | 57.20 | 46.00 | 78.80 | 54.40 | 86.00 | 86.00 |
| qwen1.5-72b-chat-hf | 76.80 | 94.40 | 85.20 | 62.80 | 54.00 | 78.40 | 63.60 | 86.40 | 82.80 |
| qwen1.5-110b-chat-hf | 79.20 | 91.60 | 88.80 | 61.20 | 50.00 | 82.40 | 59.60 | 88.80 | 78.00 |
| internlm2-chat-1.8b-hf | 20.00 | 48.40 | 56.00 | 24.40 | 26.80 | 65.20 | 18.00 | 39.60 | 7.60 |
| internlm2-chat-1.8b-sft-hf | 18.40 | 48.00 | 51.20 | 20.40 | 25.20 | 63.20 | 22.00 | 38.80 | 6.00 |
| internlm2-chat-7b-hf | 48.40 | 75.20 | 84.80 | 42.00 | 36.80 | 79.60 | 53.20 | 65.60 | 26.40 |
| internlm2-chat-7b-sft-hf | 44.00 | 72.40 | 85.60 | 41.60 | 37.20 | 82.40 | 55.60 | 52.80 | 32.00 |
| internlm2-chat-20b-hf | 88.00 | 88.80 | 88.80 | 52.80 | 50.40 | 85.20 | 56.80 | 79.60 | 40.00 |
| internlm2-chat-20b-sft-hf | 83.20 | 90.00 | 90.40 | 55.60 | 48.80 | 84.40 | 57.60 | 79.20 | 38.40 |
| llama-3-8b-instruct-hf | 49.60 | 85.60 | 76.00 | 54.00 | 29.20 | 57.60 | 46.00 | 44.80 | 52.00 |
| llama-3-70b-instruct-hf | 99.20 | 96.80 | 95.20 | 77.20 | 65.20 | 80.00 | 69.60 | 94.80 | 84.00 |
| llama-3-8b-instruct-lmdeploy | 57.20 | 78.00 | 75.60 | 36.00 | 13.20 | 59.20 | 53.60 | 54.80 | 52.80 |
| llama-3-70b-instruct-lmdeploy | 98.80 | 96.40 | 96.80 | 75.20 | 68.80 | 79.60 | 67.60 | 94.00 | 84.80 |
| mistral-7b-instruct-v0.1-hf | 26.00 | 46.00 | 60.00 | 38.00 | 24.00 | 59.20 | 1.20 | 6.00 | 12.40 |
| mistral-7b-instruct-v0.2-hf | 39.60 | 63.60 | 64.00 | 44.00 | 33.20 | 56.00 | 42.40 | 68.40 | 14.00 |
| mixtral-8x7b-instruct-v0.1-hf | 46.40 | 71.60 | 88.80 | 48.00 | 36.80 | 60.00 | 50.00 | 81.20 | 59.20 |

| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|:-----------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 45.60 | 0.00 | 1.20 | 17.20 | 50.40 | 16.40 | 11.60 | 42.78 | 27.60 |
| qwen1.5-1.8b-chat-hf | 58.40 | 0.00 | 2.00 | 34.00 | 44.80 | 30.40 | 11.60 | 24.60 | 50.00 |
| qwen1.5-4b-chat-hf | 64.00 | 3.20 | 6.80 | 80.40 | 77.60 | 48.80 | 41.20 | 55.61 | 63.20 |
| qwen1.5-7b-chat-hf | 54.40 | 0.40 | 8.00 | 55.60 | 47.60 | 31.20 | 0.00 | 2.14 | 30.00 |
| qwen1.5-14b-chat-hf | 74.40 | 6.40 | 26.40 | 72.40 | 76.40 | 61.60 | 0.80 | 25.67 | 81.20 |
| qwen1.5-32b-chat-hf | 90.00 | 10.40 | 28.40 | 82.40 | 92.80 | 76.80 | 32.40 | 41.71 | 100.00 |
| qwen1.5-72b-chat-hf | 81.20 | 18.40 | 37.60 | 95.20 | 92.80 | 76.00 | 50.40 | 63.64 | 100.00 |
| qwen1.5-110b-chat-hf | 91.60 | 18.00 | 39.60 | 82.80 | 80.80 | 75.20 | 22.40 | 35.83 | 100.00 |
| internlm2-chat-1.8b-hf | 63.20 | 0.00 | 6.00 | 58.00 | 56.80 | 48.80 | 54.80 | 52.94 | 48.40 |
| internlm2-chat-1.8b-sft-hf | 63.20 | 0.00 | 5.60 | 58.00 | 56.80 | 50.00 | 52.40 | 56.68 | 47.60 |
| internlm2-chat-7b-hf | 73.60 | 3.60 | 18.00 | 55.20 | 83.60 | 62.80 | 50.00 | 58.29 | 97.20 |
| internlm2-chat-7b-sft-hf | 71.60 | 4.40 | 20.00 | 82.00 | 84.00 | 60.00 | 51.60 | 52.94 | 98.00 |
| internlm2-chat-20b-hf | 82.40 | 8.00 | 36.00 | 55.60 | 84.40 | 78.00 | 50.40 | 59.36 | 100.00 |
| internlm2-chat-20b-sft-hf | 81.60 | 10.40 | 36.40 | 89.20 | 82.40 | 80.40 | 48.40 | 55.61 | 100.00 |
| llama-3-8b-instruct-hf | 82.80 | 8.80 | 37.20 | 94.40 | 78.80 | 89.60 | 45.20 | 24.06 | 25.60 |
| llama-3-70b-instruct-hf | 95.20 | 18.80 | 49.20 | 98.00 | 94.00 | 90.00 | 73.20 | 68.98 | 100.00 |
| llama-3-8b-instruct-lmdeploy | 83.60 | 10.00 | 40.40 | 96.00 | 77.20 | 89.20 | 43.60 | 37.43 | 3.20 |
| llama-3-70b-instruct-lmdeploy | 95.60 | 22.40 | 48.80 | 96.80 | 91.60 | 87.20 | 72.00 | 69.52 | 100.00 |
| mistral-7b-instruct-v0.1-hf | 70.80 | 0.80 | 5.20 | 68.80 | 69.60 | 51.60 | 3.20 | 12.30 | 33.60 |
| mistral-7b-instruct-v0.2-hf | 62.40 | 4.00 | 15.60 | 81.20 | 70.40 | 50.40 | 32.00 | 34.76 | 98.40 |
| mixtral-8x7b-instruct-v0.1-hf | 76.40 | 12.80 | 23.20 | 55.20 | 85.60 | 83.60 | 40.00 | 43.32 | 88.80 |
opencompass/configs/datasets/bbh/bbh_gen.py
ADDED
@@ -0,0 +1,4 @@
# Default BBH config: re-export the datasets from the 3-shot CoT variant
# (bbh_gen_5b92b0) so `from ...bbh_gen import bbh_datasets` picks it up.
from mmengine.config import read_base

with read_base():
    from .bbh_gen_5b92b0 import bbh_datasets  # noqa: F401, F403
opencompass/configs/datasets/bbh/bbh_gen_2879b0.py
ADDED
@@ -0,0 +1,56 @@
"""BBH few-shot config that splits each task's prompt library into explicit
HUMAN/BOT dialogue rounds (one round per worked example) instead of a single
concatenated prompt string."""
import os
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import BBHDataset, bbh_mcq_postprocess, BBHEvaluator, BBHEvaluator_mcq

with read_base():
    # (task_name, 'mcq' | 'free_form') pairs shared across BBH configs.
    from .bbh_subset_settings import settings

bbh_datasets = []
for name, test_type in settings:
    # Each task ships a few-shot prompt file: a task description followed by
    # "Q: ...\nA: ..." worked examples separated by blank lines.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{name}.txt'), 'r') as f:
        hint = f.read()

    # Split off the task description, then break the remainder into examples.
    task_prompt, body = hint.split('\n\nQ:', 1)
    sections = ('Q:' + body).split('\n\n')
    prompt_rounds = []
    for index, section in enumerate(sections):
        question, answer = section.split('\nA:')
        answer = 'A:' + answer
        # Prepend the task description only to the first example.
        if index == 0:
            desc = task_prompt.strip() + '\n'
        else:
            desc = ''
        prompt_rounds.append(dict(role='HUMAN', prompt=f'{desc}{question.strip()}'))
        prompt_rounds.append(dict(role='BOT', prompt=answer.strip()))
    # Final round carries the actual test question.
    prompt_rounds.append(dict(role='HUMAN', prompt='Q: {input}'))

    bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

    bbh_infer_cfg = dict(
        prompt_template=dict(type=PromptTemplate, template=dict(round=prompt_rounds)),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))

    # MCQ tasks get an option-letter postprocessor; free-form tasks use the
    # plain BBH evaluator.
    if test_type == 'mcq':
        bbh_eval_cfg = dict(
            evaluator=dict(type=BBHEvaluator_mcq),
            pred_role='BOT',
            pred_postprocessor=dict(type=bbh_mcq_postprocess),
            dataset_postprocessor=dict(type=bbh_mcq_postprocess))
    else:
        bbh_eval_cfg = dict(
            evaluator=dict(type=BBHEvaluator),
            pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=name,
            abbr='bbh-' + name,
            reader_cfg=bbh_reader_cfg.copy(),
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_4a31fa.py
ADDED
@@ -0,0 +1,99 @@
"""BBH 3-shot CoT config whose prompt additionally instructs the model to
prefix its final answer with "So the answer is" (eases answer extraction)."""
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Tasks whose answers are option letters (A/B/C/...).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# Tasks whose answers are free-form strings/numbers.
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    # Load the task's few-shot exemplars shipped alongside this config.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_5b92b0.py
ADDED
@@ -0,0 +1,99 @@
"""BBH 3-shot chain-of-thought config: each task prompt ends with
"Let's think step by step." and answers are scored by the BBH evaluators."""
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Tasks whose answers are option letters (A/B/C/...).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# Tasks whose answers are free-form strings/numbers.
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    # Load the task's few-shot exemplars shipped alongside this config.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_5bf00b.py
ADDED
@@ -0,0 +1,99 @@
"""BBH 3-shot direct-answer config: the prompt ends with a bare "A: " so the
model answers immediately, without a chain-of-thought cue."""
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Tasks whose answers are option letters (A/B/C/...).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# Tasks whose answers are free-form strings/numbers.
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    # Load the task's few-shot exemplars shipped alongside this config.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_98fba6.py
ADDED
@@ -0,0 +1,90 @@
"""BBH 3-shot CoT config for base (non-chat) models: a plain string template
instead of dialogue rounds, with generation stopped at the next 'Q:'."""
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Tasks whose answers are option letters (A/B/C/...).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# Tasks whose answers are free-form strings/numbers.
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    # Load the task's few-shot exemplars shipped alongside this config.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
        ),
        retriever=dict(type=ZeroRetriever),
        # Stop as soon as the model starts hallucinating a new question.
        inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))


for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_subset_settings.py
ADDED
@@ -0,0 +1,29 @@
# BBH task registry shared by the subset configs: (task_name, answer_type)
# where answer_type is 'mcq' (option-letter answers) or 'free_form'.
settings = [
    ('temporal_sequences', 'mcq'),
    ('disambiguation_qa', 'mcq'),
    ('date_understanding', 'mcq'),
    ('tracking_shuffled_objects_three_objects', 'mcq'),
    ('penguins_in_a_table', 'mcq'),
    ('geometric_shapes', 'mcq'),
    ('snarks', 'mcq'),
    ('ruin_names', 'mcq'),
    ('tracking_shuffled_objects_seven_objects', 'mcq'),
    ('tracking_shuffled_objects_five_objects', 'mcq'),
    ('logical_deduction_three_objects', 'mcq'),
    ('hyperbaton', 'mcq'),
    ('logical_deduction_five_objects', 'mcq'),
    ('logical_deduction_seven_objects', 'mcq'),
    ('movie_recommendation', 'mcq'),
    ('salient_translation_error_detection', 'mcq'),
    ('reasoning_about_colored_objects', 'mcq'),
    ('multistep_arithmetic_two', 'free_form'),
    ('navigate', 'free_form'),
    ('dyck_languages', 'free_form'),
    ('word_sorting', 'free_form'),
    ('sports_understanding', 'free_form'),
    ('boolean_expressions', 'free_form'),
    ('object_counting', 'free_form'),
    ('formal_fallacies', 'free_form'),
    ('causal_judgement', 'free_form'),
    ('web_of_lies', 'free_form'),
]
opencompass/configs/datasets/cmmlu/cmmlu_0shot_cot_gen_305931.py
ADDED
@@ -0,0 +1,130 @@
"""CMMLU zero-shot chain-of-thought config: Chinese CoT prompt with an
'答案: $选项' answer line extracted by a regex postprocessor."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern

# Maps each CMMLU subset name to its Chinese subject label, used to build the
# per-subject instruction prefix.
cmmlu_subject_mapping = {
    'agronomy': '农学',
    'anatomy': '解剖学',
    'ancient_chinese': '古汉语',
    'arts': '艺术学',
    'astronomy': '天文学',
    'business_ethics': '商业伦理',
    'chinese_civil_service_exam': '中国公务员考试',
    'chinese_driving_rule': '中国驾驶规则',
    'chinese_food_culture': '中国饮食文化',
    'chinese_foreign_policy': '中国外交政策',
    'chinese_history': '中国历史',
    'chinese_literature': '中国文学',
    'chinese_teacher_qualification': '中国教师资格',
    'clinical_knowledge': '临床知识',
    'college_actuarial_science': '大学精算学',
    'college_education': '大学教育学',
    'college_engineering_hydrology': '大学工程水文学',
    'college_law': '大学法律',
    'college_mathematics': '大学数学',
    'college_medical_statistics': '大学医学统计',
    'college_medicine': '大学医学',
    'computer_science': '计算机科学',
    'computer_security': '计算机安全',
    'conceptual_physics': '概念物理学',
    'construction_project_management': '建设工程管理',
    'economics': '经济学',
    'education': '教育学',
    'electrical_engineering': '电气工程',
    'elementary_chinese': '小学语文',
    'elementary_commonsense': '小学常识',
    'elementary_information_and_technology': '小学信息技术',
    'elementary_mathematics': '初等数学',
    'ethnology': '民族学',
    'food_science': '食品科学',
    'genetics': '遗传学',
    'global_facts': '全球事实',
    'high_school_biology': '高中生物',
    'high_school_chemistry': '高中化学',
    'high_school_geography': '高中地理',
    'high_school_mathematics': '高中数学',
    'high_school_physics': '高中物理学',
    'high_school_politics': '高中政治',
    'human_sexuality': '人类性行为',
    'international_law': '国际法学',
    'journalism': '新闻学',
    'jurisprudence': '法理学',
    'legal_and_moral_basis': '法律与道德基础',
    'logical': '逻辑学',
    'machine_learning': '机器学习',
    'management': '管理学',
    'marketing': '市场营销',
    'marxist_theory': '马克思主义理论',
    'modern_chinese': '现代汉语',
    'nutrition': '营养学',
    'philosophy': '哲学',
    'professional_accounting': '专业会计',
    'professional_law': '专业法学',
    'professional_medicine': '专业医学',
    'professional_psychology': '专业心理学',
    'public_relations': '公共关系',
    'security_study': '安全研究',
    'sociology': '社会学',
    'sports_science': '体育学',
    'traditional_chinese_medicine': '中医中药',
    'virology': '病毒学',
    'world_history': '世界历史',
    'world_religions': '世界宗教'
}

# CoT query: the model must end with an '答案: $选项' line (option in A-D).
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一. 请在回答之前一步步思考.

{question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

cmmlu_all_sets = list(cmmlu_subject_mapping.keys())

cmmlu_datasets = []
for _name in cmmlu_all_sets:
    _ch_name = cmmlu_subject_mapping[_name]
    prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
    cmmlu_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    cmmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(
            type=match_answer_pattern,
            # answer_pattern=r'(?i)答案\s*:\s*([A-D])'
            answer_pattern=r'(?i)答案\s*:\s*[\W]*([A-D])[\W]*',
        )
    )
    cmmlu_datasets.append(
        dict(
            type=CMMLUDataset,
            path='opencompass/cmmlu',
            name=_name,
            abbr=f'cmmlu-{_name}',
            reader_cfg=dict(
                input_columns=['question', 'A', 'B', 'C', 'D'],
                output_column='answer',
                train_split='dev',
                test_split='test'),
            infer_cfg=cmmlu_infer_cfg,
            eval_cfg=cmmlu_eval_cfg,
        ))

# Avoid leaking loop variables into the config namespace.
del _name, _ch_name
opencompass/configs/datasets/cmmlu/cmmlu_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, F403
|
opencompass/configs/datasets/cmmlu/cmmlu_gen_c13365.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
|
5 |
+
from opencompass.datasets import CMMLUDataset
|
6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
7 |
+
|
8 |
+
|
9 |
+
cmmlu_subject_mapping = {
|
10 |
+
'agronomy': '农学',
|
11 |
+
'anatomy': '解剖学',
|
12 |
+
'ancient_chinese': '古汉语',
|
13 |
+
'arts': '艺术学',
|
14 |
+
'astronomy': '天文学',
|
15 |
+
'business_ethics': '商业伦理',
|
16 |
+
'chinese_civil_service_exam': '中国公务员考试',
|
17 |
+
'chinese_driving_rule': '中国驾驶规则',
|
18 |
+
'chinese_food_culture': '中国饮食文化',
|
19 |
+
'chinese_foreign_policy': '中国外交政策',
|
20 |
+
'chinese_history': '中国历史',
|
21 |
+
'chinese_literature': '中国文学',
|
22 |
+
'chinese_teacher_qualification': '中国教师资格',
|
23 |
+
'clinical_knowledge': '临床知识',
|
24 |
+
'college_actuarial_science': '大学精算学',
|
25 |
+
'college_education': '大学教育学',
|
26 |
+
'college_engineering_hydrology': '大学工程水文学',
|
27 |
+
'college_law': '大学法律',
|
28 |
+
'college_mathematics': '大学数学',
|
29 |
+
'college_medical_statistics': '大学医学统计',
|
30 |
+
'college_medicine': '大学医学',
|
31 |
+
'computer_science': '计算机科学',
|
32 |
+
'computer_security': '计算机安全',
|
33 |
+
'conceptual_physics': '概念物理学',
|
34 |
+
'construction_project_management': '建设工程管理',
|
35 |
+
'economics': '经济学',
|
36 |
+
'education': '教育学',
|
37 |
+
'electrical_engineering': '电气工程',
|
38 |
+
'elementary_chinese': '小学语文',
|
39 |
+
'elementary_commonsense': '小学常识',
|
40 |
+
'elementary_information_and_technology': '小学信息技术',
|
41 |
+
'elementary_mathematics': '初等数学',
|
42 |
+
'ethnology': '民族学',
|
43 |
+
'food_science': '食品科学',
|
44 |
+
'genetics': '遗传学',
|
45 |
+
'global_facts': '全球事实',
|
46 |
+
'high_school_biology': '高中生物',
|
47 |
+
'high_school_chemistry': '高中化学',
|
48 |
+
'high_school_geography': '高中地理',
|
49 |
+
'high_school_mathematics': '高中数学',
|
50 |
+
'high_school_physics': '高中物理学',
|
51 |
+
'high_school_politics': '高中政治',
|
52 |
+
'human_sexuality': '人类性行为',
|
53 |
+
'international_law': '国际法学',
|
54 |
+
'journalism': '新闻学',
|
55 |
+
'jurisprudence': '法理学',
|
56 |
+
'legal_and_moral_basis': '法律与道德基础',
|
57 |
+
'logical': '逻辑学',
|
58 |
+
'machine_learning': '机器学习',
|
59 |
+
'management': '管理学',
|
60 |
+
'marketing': '市场营销',
|
61 |
+
'marxist_theory': '马克思主义理论',
|
62 |
+
'modern_chinese': '现代汉语',
|
63 |
+
'nutrition': '营养学',
|
64 |
+
'philosophy': '哲学',
|
65 |
+
'professional_accounting': '专业会计',
|
66 |
+
'professional_law': '专业法学',
|
67 |
+
'professional_medicine': '专业医学',
|
68 |
+
'professional_psychology': '专业心理学',
|
69 |
+
'public_relations': '公共关系',
|
70 |
+
'security_study': '安全研究',
|
71 |
+
'sociology': '社会学',
|
72 |
+
'sports_science': '体育学',
|
73 |
+
'traditional_chinese_medicine': '中医中药',
|
74 |
+
'virology': '病毒学',
|
75 |
+
'world_history': '世界历史',
|
76 |
+
'world_religions': '世界宗教'
|
77 |
+
}
|
78 |
+
|
79 |
+
|
80 |
+
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
|
81 |
+
|
82 |
+
cmmlu_datasets = []
|
83 |
+
for _name in cmmlu_all_sets:
|
84 |
+
_ch_name = cmmlu_subject_mapping[_name]
|
85 |
+
cmmlu_infer_cfg = dict(
|
86 |
+
ice_template=dict(
|
87 |
+
type=PromptTemplate,
|
88 |
+
template=dict(
|
89 |
+
begin='</E>',
|
90 |
+
round=[
|
91 |
+
dict(
|
92 |
+
role='HUMAN',
|
93 |
+
prompt=
|
94 |
+
f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
|
95 |
+
),
|
96 |
+
dict(role='BOT', prompt='答案是: {answer}'),
|
97 |
+
]),
|
98 |
+
ice_token='</E>',
|
99 |
+
),
|
100 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
101 |
+
inferencer=dict(type=GenInferencer),
|
102 |
+
)
|
103 |
+
|
104 |
+
cmmlu_eval_cfg = dict(
|
105 |
+
evaluator=dict(type=AccwithDetailsEvaluator),
|
106 |
+
pred_postprocessor=dict(type=first_capital_postprocess))
|
107 |
+
|
108 |
+
cmmlu_datasets.append(
|
109 |
+
dict(
|
110 |
+
type=CMMLUDataset,
|
111 |
+
path='opencompass/cmmlu',
|
112 |
+
name=_name,
|
113 |
+
abbr=f'cmmlu-{_name}',
|
114 |
+
reader_cfg=dict(
|
115 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
116 |
+
output_column='answer',
|
117 |
+
train_split='dev',
|
118 |
+
test_split='test'),
|
119 |
+
infer_cfg=cmmlu_infer_cfg,
|
120 |
+
eval_cfg=cmmlu_eval_cfg,
|
121 |
+
))
|
122 |
+
|
123 |
+
del _name, _ch_name
|
opencompass/configs/datasets/cmmlu/cmmlu_ppl.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .cmmlu_ppl_8b9c76 import cmmlu_datasets # noqa: F401, F403
|
opencompass/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
|
5 |
+
from opencompass.datasets import CMMLUDataset
|
6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
7 |
+
|
8 |
+
cmmlu_subject_mapping = {
|
9 |
+
'agronomy': '农学',
|
10 |
+
'anatomy': '解剖学',
|
11 |
+
'ancient_chinese': '古汉语',
|
12 |
+
'arts': '艺术学',
|
13 |
+
'astronomy': '天文学',
|
14 |
+
'business_ethics': '商业伦理',
|
15 |
+
'chinese_civil_service_exam': '中国公务员考试',
|
16 |
+
'chinese_driving_rule': '中国驾驶规则',
|
17 |
+
'chinese_food_culture': '中国饮食文化',
|
18 |
+
'chinese_foreign_policy': '中国外交政策',
|
19 |
+
'chinese_history': '中国历史',
|
20 |
+
'chinese_literature': '中国文学',
|
21 |
+
'chinese_teacher_qualification': '中国教师资格',
|
22 |
+
'clinical_knowledge': '临床知识',
|
23 |
+
'college_actuarial_science': '大学精算学',
|
24 |
+
'college_education': '大学教育学',
|
25 |
+
'college_engineering_hydrology': '大学工程水文学',
|
26 |
+
'college_law': '大学法律',
|
27 |
+
'college_mathematics': '大学数学',
|
28 |
+
'college_medical_statistics': '大学医学统计',
|
29 |
+
'college_medicine': '大学医学',
|
30 |
+
'computer_science': '计算机科学',
|
31 |
+
'computer_security': '计算机安全',
|
32 |
+
'conceptual_physics': '概念物理学',
|
33 |
+
'construction_project_management': '建设工程管理',
|
34 |
+
'economics': '经济学',
|
35 |
+
'education': '教育学',
|
36 |
+
'electrical_engineering': '电气工程',
|
37 |
+
'elementary_chinese': '小学语文',
|
38 |
+
'elementary_commonsense': '小学常识',
|
39 |
+
'elementary_information_and_technology': '小学信息技术',
|
40 |
+
'elementary_mathematics': '初等数学',
|
41 |
+
'ethnology': '民族学',
|
42 |
+
'food_science': '食品科学',
|
43 |
+
'genetics': '遗传学',
|
44 |
+
'global_facts': '全球事实',
|
45 |
+
'high_school_biology': '高中生物',
|
46 |
+
'high_school_chemistry': '高中化学',
|
47 |
+
'high_school_geography': '高中地理',
|
48 |
+
'high_school_mathematics': '高中数学',
|
49 |
+
'high_school_physics': '高中物理学',
|
50 |
+
'high_school_politics': '高中政治',
|
51 |
+
'human_sexuality': '人类性行为',
|
52 |
+
'international_law': '国际法学',
|
53 |
+
'journalism': '新闻学',
|
54 |
+
'jurisprudence': '法理学',
|
55 |
+
'legal_and_moral_basis': '法律与道德基础',
|
56 |
+
'logical': '逻辑学',
|
57 |
+
'machine_learning': '机器学习',
|
58 |
+
'management': '管理学',
|
59 |
+
'marketing': '市场营销',
|
60 |
+
'marxist_theory': '马克思主义理论',
|
61 |
+
'modern_chinese': '现代汉语',
|
62 |
+
'nutrition': '营养学',
|
63 |
+
'philosophy': '哲学',
|
64 |
+
'professional_accounting': '专业会计',
|
65 |
+
'professional_law': '专业法学',
|
66 |
+
'professional_medicine': '专业医学',
|
67 |
+
'professional_psychology': '专业心理学',
|
68 |
+
'public_relations': '公共关系',
|
69 |
+
'security_study': '安全研究',
|
70 |
+
'sociology': '社会学',
|
71 |
+
'sports_science': '体育学',
|
72 |
+
'traditional_chinese_medicine': '中医中药',
|
73 |
+
'virology': '病毒学',
|
74 |
+
'world_history': '世界历史',
|
75 |
+
'world_religions': '世界宗教'
|
76 |
+
}
|
77 |
+
|
78 |
+
|
79 |
+
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
|
80 |
+
|
81 |
+
cmmlu_datasets = []
|
82 |
+
for _name in cmmlu_all_sets:
|
83 |
+
_ch_name = cmmlu_subject_mapping[_name]
|
84 |
+
hint = f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。'
|
85 |
+
question_and_options = '题目:{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
|
86 |
+
cmmlu_infer_cfg = dict(
|
87 |
+
ice_template=dict(
|
88 |
+
type=PromptTemplate,
|
89 |
+
template={answer: f'{question_and_options}\n答案是: {answer}\n' for answer in ['A', 'B', 'C', 'D']},
|
90 |
+
),
|
91 |
+
prompt_template=dict(
|
92 |
+
type=PromptTemplate,
|
93 |
+
template={answer: f'{hint}\n</E>{question_and_options}\n答案是: {answer}' for answer in ['A', 'B', 'C', 'D']},
|
94 |
+
ice_token='</E>',
|
95 |
+
),
|
96 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
97 |
+
inferencer=dict(type=PPLInferencer),
|
98 |
+
)
|
99 |
+
|
100 |
+
cmmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
|
101 |
+
|
102 |
+
cmmlu_datasets.append(
|
103 |
+
dict(
|
104 |
+
type=CMMLUDataset,
|
105 |
+
path='opencompass/cmmlu',
|
106 |
+
name=_name,
|
107 |
+
abbr=f'cmmlu-{_name}',
|
108 |
+
reader_cfg=dict(
|
109 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
110 |
+
output_column='answer',
|
111 |
+
train_split='dev',
|
112 |
+
test_split='test'),
|
113 |
+
infer_cfg=cmmlu_infer_cfg,
|
114 |
+
eval_cfg=cmmlu_eval_cfg,
|
115 |
+
))
|
116 |
+
|
117 |
+
del _name, _ch_name
|
opencompass/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
5 |
+
from opencompass.datasets import CMMLUDataset
|
6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
7 |
+
|
8 |
+
cmmlu_subject_mapping = {
|
9 |
+
'agronomy': '农学',
|
10 |
+
'anatomy': '解剖学',
|
11 |
+
'ancient_chinese': '古汉语',
|
12 |
+
'arts': '艺术学',
|
13 |
+
'astronomy': '天文学',
|
14 |
+
'business_ethics': '商业伦理',
|
15 |
+
'chinese_civil_service_exam': '中国公务员考试',
|
16 |
+
'chinese_driving_rule': '中国驾驶规则',
|
17 |
+
'chinese_food_culture': '中国饮食文化',
|
18 |
+
'chinese_foreign_policy': '中国外交政策',
|
19 |
+
'chinese_history': '中国历史',
|
20 |
+
'chinese_literature': '中国文学',
|
21 |
+
'chinese_teacher_qualification': '中国教师资格',
|
22 |
+
'clinical_knowledge': '临床知识',
|
23 |
+
'college_actuarial_science': '大学精算学',
|
24 |
+
'college_education': '大学教育学',
|
25 |
+
'college_engineering_hydrology': '大学工程水文学',
|
26 |
+
'college_law': '大学法律',
|
27 |
+
'college_mathematics': '大学数学',
|
28 |
+
'college_medical_statistics': '大学医学统计',
|
29 |
+
'college_medicine': '大学医学',
|
30 |
+
'computer_science': '计算机科学',
|
31 |
+
'computer_security': '计算机安全',
|
32 |
+
'conceptual_physics': '概念物理学',
|
33 |
+
'construction_project_management': '建设工程管理',
|
34 |
+
'economics': '经济学',
|
35 |
+
'education': '教育学',
|
36 |
+
'electrical_engineering': '电气工程',
|
37 |
+
'elementary_chinese': '小学语文',
|
38 |
+
'elementary_commonsense': '小学常识',
|
39 |
+
'elementary_information_and_technology': '小学信息技术',
|
40 |
+
'elementary_mathematics': '初等数学',
|
41 |
+
'ethnology': '民族学',
|
42 |
+
'food_science': '食品科学',
|
43 |
+
'genetics': '遗传学',
|
44 |
+
'global_facts': '全球事实',
|
45 |
+
'high_school_biology': '高中生物',
|
46 |
+
'high_school_chemistry': '高中化学',
|
47 |
+
'high_school_geography': '高中地理',
|
48 |
+
'high_school_mathematics': '高中数学',
|
49 |
+
'high_school_physics': '高中物理学',
|
50 |
+
'high_school_politics': '高中政治',
|
51 |
+
'human_sexuality': '人类性行为',
|
52 |
+
'international_law': '国际法学',
|
53 |
+
'journalism': '新闻学',
|
54 |
+
'jurisprudence': '法理学',
|
55 |
+
'legal_and_moral_basis': '法律与道德基础',
|
56 |
+
'logical': '逻辑学',
|
57 |
+
'machine_learning': '机器学习',
|
58 |
+
'management': '管理学',
|
59 |
+
'marketing': '市场营销',
|
60 |
+
'marxist_theory': '马克思主义理论',
|
61 |
+
'modern_chinese': '现代汉语',
|
62 |
+
'nutrition': '营养学',
|
63 |
+
'philosophy': '哲学',
|
64 |
+
'professional_accounting': '专业会计',
|
65 |
+
'professional_law': '专业法学',
|
66 |
+
'professional_medicine': '专业医学',
|
67 |
+
'professional_psychology': '专业心理学',
|
68 |
+
'public_relations': '公共关系',
|
69 |
+
'security_study': '安全研究',
|
70 |
+
'sociology': '社会学',
|
71 |
+
'sports_science': '体育学',
|
72 |
+
'traditional_chinese_medicine': '中医中药',
|
73 |
+
'virology': '病毒学',
|
74 |
+
'world_history': '世界历史',
|
75 |
+
'world_religions': '世界宗教'
|
76 |
+
}
|
77 |
+
|
78 |
+
|
79 |
+
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
|
80 |
+
|
81 |
+
cmmlu_datasets = []
|
82 |
+
for _name in cmmlu_all_sets:
|
83 |
+
_ch_name = cmmlu_subject_mapping[_name]
|
84 |
+
cmmlu_infer_cfg = dict(
|
85 |
+
ice_template=dict(
|
86 |
+
type=PromptTemplate,
|
87 |
+
template={
|
88 |
+
answer: dict(
|
89 |
+
begin='</E>',
|
90 |
+
round=[
|
91 |
+
dict(
|
92 |
+
role='HUMAN',
|
93 |
+
prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
|
94 |
+
),
|
95 |
+
dict(role='BOT', prompt=f'答案是: {answer}'),
|
96 |
+
])
|
97 |
+
for answer in ['A', 'B', 'C', 'D']
|
98 |
+
},
|
99 |
+
ice_token='</E>',
|
100 |
+
),
|
101 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
102 |
+
inferencer=dict(type=PPLInferencer),
|
103 |
+
)
|
104 |
+
|
105 |
+
cmmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
106 |
+
|
107 |
+
cmmlu_datasets.append(
|
108 |
+
dict(
|
109 |
+
type=CMMLUDataset,
|
110 |
+
path='opencompass/cmmlu',
|
111 |
+
name=_name,
|
112 |
+
abbr=f'cmmlu-{_name}',
|
113 |
+
reader_cfg=dict(
|
114 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
115 |
+
output_column='answer',
|
116 |
+
train_split='dev',
|
117 |
+
test_split='test'),
|
118 |
+
infer_cfg=cmmlu_infer_cfg,
|
119 |
+
eval_cfg=cmmlu_eval_cfg,
|
120 |
+
))
|
121 |
+
|
122 |
+
del _name, _ch_name
|
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .commonsenseqacn_gen_d380d0 import commonsenseqacn_datasets # noqa: F401, F403
|
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen_d380d0.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
5 |
+
from opencompass.datasets import CommonsenseQADataset_CN
|
6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
7 |
+
|
8 |
+
commonsenseqacn_reader_cfg = dict(
|
9 |
+
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
|
10 |
+
output_column='answerKey',
|
11 |
+
test_split='validation',
|
12 |
+
)
|
13 |
+
|
14 |
+
_ice_template = dict(
|
15 |
+
type=PromptTemplate,
|
16 |
+
template=dict(
|
17 |
+
begin='</E>',
|
18 |
+
round=[
|
19 |
+
dict(
|
20 |
+
role='HUMAN',
|
21 |
+
prompt='{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\n答案:',
|
22 |
+
),
|
23 |
+
dict(role='BOT', prompt='{answerKey}'),
|
24 |
+
],
|
25 |
+
),
|
26 |
+
ice_token='</E>',
|
27 |
+
)
|
28 |
+
|
29 |
+
|
30 |
+
commonsenseqacn_infer_cfg = dict(
|
31 |
+
prompt_template=_ice_template,
|
32 |
+
retriever=dict(type=ZeroRetriever),
|
33 |
+
inferencer=dict(type=GenInferencer),
|
34 |
+
)
|
35 |
+
|
36 |
+
commonsenseqacn_eval_cfg = dict(
|
37 |
+
evaluator=dict(type=AccEvaluator),
|
38 |
+
pred_postprocessor=dict(type=first_capital_postprocess),
|
39 |
+
)
|
40 |
+
|
41 |
+
commonsenseqacn_datasets = [
|
42 |
+
dict(
|
43 |
+
abbr='commonsenseqa_cn',
|
44 |
+
type=CommonsenseQADataset_CN,
|
45 |
+
path='./data/commonsenseqa_cn/validation.jsonl',
|
46 |
+
reader_cfg=commonsenseqacn_reader_cfg,
|
47 |
+
infer_cfg=commonsenseqacn_infer_cfg,
|
48 |
+
eval_cfg=commonsenseqacn_eval_cfg,
|
49 |
+
)
|
50 |
+
]
|
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .commonsenseqacn_ppl_971f48 import commonsenseqacn_datasets # noqa: F401, F403
|
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl_971f48.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
5 |
+
from opencompass.datasets import CommonsenseQADataset_CN
|
6 |
+
|
7 |
+
commonsenseqacn_reader_cfg = dict(
|
8 |
+
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
|
9 |
+
output_column='answerKey',
|
10 |
+
test_split='validation',
|
11 |
+
)
|
12 |
+
|
13 |
+
_ice_template = dict(
|
14 |
+
type=PromptTemplate,
|
15 |
+
template={
|
16 |
+
ans: dict(
|
17 |
+
begin='</E>',
|
18 |
+
round=[
|
19 |
+
dict(role='HUMAN', prompt='问题: {question}\n答案: '),
|
20 |
+
dict(role='BOT', prompt=ans_token),
|
21 |
+
],
|
22 |
+
)
|
23 |
+
for ans, ans_token in [
|
24 |
+
['A', '{A}'],
|
25 |
+
['B', '{B}'],
|
26 |
+
['C', '{C}'],
|
27 |
+
['D', '{D}'],
|
28 |
+
['E', '{E}'],
|
29 |
+
]
|
30 |
+
},
|
31 |
+
ice_token='</E>',
|
32 |
+
)
|
33 |
+
|
34 |
+
|
35 |
+
commonsenseqacn_infer_cfg = dict(
|
36 |
+
prompt_template=_ice_template,
|
37 |
+
retriever=dict(type=ZeroRetriever),
|
38 |
+
inferencer=dict(type=PPLInferencer),
|
39 |
+
)
|
40 |
+
|
41 |
+
commonsenseqacn_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
42 |
+
|
43 |
+
commonsenseqacn_datasets = [
|
44 |
+
dict(
|
45 |
+
abbr='commonsenseqa_cn',
|
46 |
+
type=CommonsenseQADataset_CN,
|
47 |
+
path='./data/commonsenseqa_cn/validation.jsonl',
|
48 |
+
reader_cfg=commonsenseqacn_reader_cfg,
|
49 |
+
infer_cfg=commonsenseqacn_infer_cfg,
|
50 |
+
eval_cfg=commonsenseqacn_eval_cfg,
|
51 |
+
)
|
52 |
+
]
|
opencompass/configs/datasets/demo/demo_cmmlu_base_ppl.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets
|
5 |
+
|
6 |
+
for d in cmmlu_datasets:
|
7 |
+
d['abbr'] = 'demo_' + d['abbr']
|
8 |
+
d['reader_cfg']['test_range'] = '[0:4]'
|
opencompass/configs/datasets/demo/demo_cmmlu_chat_gen.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
|
5 |
+
|
6 |
+
for d in cmmlu_datasets:
|
7 |
+
d['abbr'] = 'demo_' + d['abbr']
|
8 |
+
d['reader_cfg']['test_range'] = '[0:4]'
|
opencompass/configs/datasets/demo/demo_gsm8k_base_gen.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets
|
5 |
+
|
6 |
+
gsm8k_datasets[0]['abbr'] = 'demo_' + gsm8k_datasets[0]['abbr']
|
7 |
+
gsm8k_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
|
opencompass/configs/datasets/demo/demo_gsm8k_chat_gen.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
5 |
+
|
6 |
+
gsm8k_datasets[0]['abbr'] = 'demo_' + gsm8k_datasets[0]['abbr']
|
7 |
+
gsm8k_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
|
opencompass/configs/datasets/demo/demo_math_base_gen.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..math.math_4shot_base_gen_db136b import math_datasets
|
5 |
+
|
6 |
+
math_datasets[0]['abbr'] = 'demo_' + math_datasets[0]['abbr']
|
7 |
+
math_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
|
opencompass/configs/datasets/demo/demo_math_chat_gen.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..math.math_0shot_gen_393424 import math_datasets
|
5 |
+
|
6 |
+
math_datasets[0]['abbr'] = 'demo_' + math_datasets[0]['abbr']
|
7 |
+
math_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
|
opencompass/configs/datasets/gpqa/README.md
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# GPQA
|
2 |
+
|
3 |
+
```bash
|
4 |
+
python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug
|
5 |
+
python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug
|
6 |
+
```
|
7 |
+
|
8 |
+
## Base Models
|
9 |
+
|
10 |
+
| model | GPQA_diamond |
|
11 |
+
|:------------------------:|---------------:|
|
12 |
+
| llama-7b-turbomind | 24.24 |
|
13 |
+
| llama-13b-turbomind | 25.25 |
|
14 |
+
| llama-30b-turbomind | 22.73 |
|
15 |
+
| llama-65b-turbomind | 21.72 |
|
16 |
+
| llama-2-7b-turbomind | 25.25 |
|
17 |
+
| llama-2-13b-turbomind | 23.74 |
|
18 |
+
| llama-2-70b-turbomind | 28.28 |
|
19 |
+
| llama-3-8b-turbomind | 31.82 |
|
20 |
+
| llama-3-70b-turbomind | 40.91 |
|
21 |
+
| internlm2-1.8b-turbomind | 24.24 |
|
22 |
+
| internlm2-7b-turbomind | 28.28 |
|
23 |
+
| internlm2-20b-turbomind | 31.31 |
|
24 |
+
| qwen-1.8b-turbomind | 28.79 |
|
25 |
+
| qwen-7b-turbomind | 24.75 |
|
26 |
+
| qwen-14b-turbomind | 27.78 |
|
27 |
+
| qwen-72b-turbomind | 31.31 |
|
28 |
+
| qwen1.5-0.5b-hf | 23.74 |
|
29 |
+
| qwen1.5-1.8b-hf | 28.79 |
|
30 |
+
| qwen1.5-4b-hf | 23.23 |
|
31 |
+
| qwen1.5-7b-hf | 20.71 |
|
32 |
+
| qwen1.5-14b-hf | 32.32 |
|
33 |
+
| qwen1.5-32b-hf | 30.81 |
|
34 |
+
| qwen1.5-72b-hf | 31.82 |
|
35 |
+
| qwen1.5-moe-a2-7b-hf | 28.79 |
|
36 |
+
| mistral-7b-v0.1-hf | 24.75 |
|
37 |
+
| mistral-7b-v0.2-hf | 23.74 |
|
38 |
+
| mixtral-8x7b-v0.1-hf | 28.79 |
|
39 |
+
| mixtral-8x22b-v0.1-hf | 36.36 |
|
40 |
+
| yi-6b-hf | 28.28 |
|
41 |
+
| yi-34b-hf | 35.86 |
|
42 |
+
| deepseek-7b-base-hf | 20.71 |
|
43 |
+
| deepseek-67b-base-hf | 25.25 |
|
44 |
+
|
45 |
+
## Chat Models
|
46 |
+
|
47 |
+
| model | GPQA_diamond |
|
48 |
+
|:-----------------------------:|---------------:|
|
49 |
+
| qwen1.5-0.5b-chat-hf | 19.70 |
|
50 |
+
| qwen1.5-1.8b-chat-hf | 29.80 |
|
51 |
+
| qwen1.5-4b-chat-hf | 25.25 |
|
52 |
+
| qwen1.5-7b-chat-hf | 31.82 |
|
53 |
+
| qwen1.5-14b-chat-hf | 30.30 |
|
54 |
+
| qwen1.5-32b-chat-hf | 31.31 |
|
55 |
+
| qwen1.5-72b-chat-hf | 32.83 |
|
56 |
+
| qwen1.5-110b-chat-hf | 35.86 |
|
57 |
+
| internlm2-chat-1.8b-hf | 25.76 |
|
58 |
+
| internlm2-chat-1.8b-sft-hf | 26.26 |
|
59 |
+
| internlm2-chat-7b-hf | 28.28 |
|
60 |
+
| internlm2-chat-7b-sft-hf | 27.27 |
|
61 |
+
| internlm2-chat-20b-hf | 30.30 |
|
62 |
+
| internlm2-chat-20b-sft-hf | 29.29 |
|
63 |
+
| llama-3-8b-instruct-hf | 25.76 |
|
64 |
+
| llama-3-70b-instruct-hf | 37.88 |
|
65 |
+
| llama-3-8b-instruct-lmdeploy | 25.76 |
|
66 |
+
| llama-3-70b-instruct-lmdeploy | 37.88 |
|
67 |
+
| mistral-7b-instruct-v0.1-hf | 30.30 |
|
68 |
+
| mistral-7b-instruct-v0.2-hf | 25.25 |
|
69 |
+
| mixtral-8x7b-instruct-v0.1-hf | 30.30 |
|
opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_4b5a83.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
|
5 |
+
from opencompass.datasets import GPQADataset, GPQAEvaluator
|
6 |
+
from opencompass.utils import first_option_postprocess
|
7 |
+
|
8 |
+
gpqa_reader_cfg = dict(
|
9 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
10 |
+
output_column='answer')
|
11 |
+
|
12 |
+
hint = f'For the multiple choice question below, please provide the correct answer option directly.'
|
13 |
+
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
|
14 |
+
gpqa_infer_cfg = dict(
|
15 |
+
ice_template=dict(
|
16 |
+
type=PromptTemplate,
|
17 |
+
template={
|
18 |
+
opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
|
19 |
+
),
|
20 |
+
prompt_template=dict(
|
21 |
+
type=PromptTemplate,
|
22 |
+
template={
|
23 |
+
opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
|
24 |
+
},
|
25 |
+
ice_token='</E>'
|
26 |
+
),
|
27 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
28 |
+
inferencer=dict(type=PPLInferencer))
|
29 |
+
|
30 |
+
gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
|
31 |
+
|
32 |
+
gpqa_datasets = []
|
33 |
+
gpqa_subsets = {
|
34 |
+
# 'extended': 'gpqa_extended.csv',
|
35 |
+
# 'main': 'gpqa_main.csv',
|
36 |
+
'diamond': 'gpqa_diamond.csv'
|
37 |
+
}
|
38 |
+
|
39 |
+
for split in list(gpqa_subsets.keys()):
|
40 |
+
gpqa_datasets.append(
|
41 |
+
dict(
|
42 |
+
abbr='GPQA_' + split,
|
43 |
+
type=GPQADataset,
|
44 |
+
path='./data/gpqa/',
|
45 |
+
name=gpqa_subsets[split],
|
46 |
+
reader_cfg=gpqa_reader_cfg,
|
47 |
+
infer_cfg=gpqa_infer_cfg,
|
48 |
+
eval_cfg=gpqa_eval_cfg)
|
49 |
+
)
|
opencompass/configs/datasets/gpqa/gpqa_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
|
opencompass/configs/datasets/gpqa/gpqa_gen_015262.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.datasets import GPQADataset, GPQAEvaluator
|
5 |
+
from opencompass.utils import first_option_postprocess
|
6 |
+
|
7 |
+
gpqa_reader_cfg = dict(
|
8 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
9 |
+
output_column='answer')
|
10 |
+
|
11 |
+
gpqa_infer_cfg = dict(
|
12 |
+
prompt_template=dict(
|
13 |
+
type=PromptTemplate,
|
14 |
+
template=dict(
|
15 |
+
round=[
|
16 |
+
dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
|
17 |
+
'(A){A}\n'
|
18 |
+
'(B){B}\n'
|
19 |
+
'(C){C}\n'
|
20 |
+
'(D){D}\n'
|
21 |
+
'Format your response as follows: "The correct answer is (insert answer here)"'),
|
22 |
+
], )),
|
23 |
+
retriever=dict(type=ZeroRetriever),
|
24 |
+
inferencer=dict(type=GenInferencer))
|
25 |
+
|
26 |
+
gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
|
27 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
28 |
+
|
29 |
+
gpqa_datasets = []
|
30 |
+
gpqa_subsets = {
|
31 |
+
'extended': 'gpqa_extended.csv',
|
32 |
+
'main': 'gpqa_main.csv',
|
33 |
+
'diamond': 'gpqa_diamond.csv'
|
34 |
+
}
|
35 |
+
|
36 |
+
for split in list(gpqa_subsets.keys()):
|
37 |
+
gpqa_datasets.append(
|
38 |
+
dict(
|
39 |
+
abbr='GPQA_' + split,
|
40 |
+
type=GPQADataset,
|
41 |
+
path='./data/gpqa/',
|
42 |
+
name=gpqa_subsets[split],
|
43 |
+
reader_cfg=gpqa_reader_cfg,
|
44 |
+
infer_cfg=gpqa_infer_cfg,
|
45 |
+
eval_cfg=gpqa_eval_cfg)
|
46 |
+
)
|
opencompass/configs/datasets/gpqa/gpqa_gen_4baadb.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.datasets import GPQADataset, GPQAEvaluator
|
5 |
+
from opencompass.utils import first_option_postprocess
|
6 |
+
|
7 |
+
gpqa_reader_cfg = dict(
|
8 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
9 |
+
output_column='answer')
|
10 |
+
|
11 |
+
gpqa_infer_cfg = dict(
|
12 |
+
prompt_template=dict(
|
13 |
+
type=PromptTemplate,
|
14 |
+
template=dict(
|
15 |
+
round=[
|
16 |
+
dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
|
17 |
+
'(A){A}\n'
|
18 |
+
'(B){B}\n'
|
19 |
+
'(C){C}\n'
|
20 |
+
'(D){D}\n'
|
21 |
+
'Format your response as follows: "The correct answer is (insert answer here)"'),
|
22 |
+
], )),
|
23 |
+
retriever=dict(type=ZeroRetriever),
|
24 |
+
inferencer=dict(type=GenInferencer))
|
25 |
+
|
26 |
+
gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
|
27 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
28 |
+
|
29 |
+
gpqa_datasets = []
|
30 |
+
gpqa_subsets = {
|
31 |
+
# 'extended': 'gpqa_extended.csv',
|
32 |
+
# 'main': 'gpqa_main.csv',
|
33 |
+
'diamond': 'gpqa_diamond.csv'
|
34 |
+
}
|
35 |
+
|
36 |
+
for split in list(gpqa_subsets.keys()):
|
37 |
+
gpqa_datasets.append(
|
38 |
+
dict(
|
39 |
+
abbr='GPQA_' + split,
|
40 |
+
type=GPQADataset,
|
41 |
+
path='./data/gpqa/',
|
42 |
+
name=gpqa_subsets[split],
|
43 |
+
reader_cfg=gpqa_reader_cfg,
|
44 |
+
infer_cfg=gpqa_infer_cfg,
|
45 |
+
eval_cfg=gpqa_eval_cfg)
|
46 |
+
)
|