tuandunghcmut committed on
Commit 7dfedba · verified · 1 Parent(s): cc8629b

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py +4 -0
  2. opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py +53 -0
  3. opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl.py +4 -0
  4. opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_4b16c0.py +65 -0
  5. opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_9ef540.py +43 -0
  6. opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_e53034.py +59 -0
  7. opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py +4 -0
  8. opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py +52 -0
  9. opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py +4 -0
  10. opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py +60 -0
  11. opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py +44 -0
  12. opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py +4 -0
  13. opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py +304 -0
  14. opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py +4 -0
  15. opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py +356 -0
  16. opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py +45 -0
  17. opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py +44 -0
  18. opencompass/configs/datasets/GaokaoBench/GaokaoBench_prompts.py +191 -0
  19. opencompass/configs/datasets/GaokaoBench/README.md +191 -0
  20. opencompass/configs/datasets/XLSum/XLSum_gen.py +4 -0
  21. opencompass/configs/datasets/XLSum/XLSum_gen_2bb71c.py +29 -0
  22. opencompass/configs/datasets/bbh/README.md +250 -0
  23. opencompass/configs/datasets/bbh/bbh_gen.py +4 -0
  24. opencompass/configs/datasets/bbh/bbh_gen_2879b0.py +56 -0
  25. opencompass/configs/datasets/bbh/bbh_gen_4a31fa.py +99 -0
  26. opencompass/configs/datasets/bbh/bbh_gen_5b92b0.py +99 -0
  27. opencompass/configs/datasets/bbh/bbh_gen_5bf00b.py +99 -0
  28. opencompass/configs/datasets/bbh/bbh_gen_98fba6.py +90 -0
  29. opencompass/configs/datasets/bbh/bbh_subset_settings.py +29 -0
  30. opencompass/configs/datasets/cmmlu/cmmlu_0shot_cot_gen_305931.py +130 -0
  31. opencompass/configs/datasets/cmmlu/cmmlu_gen.py +4 -0
  32. opencompass/configs/datasets/cmmlu/cmmlu_gen_c13365.py +123 -0
  33. opencompass/configs/datasets/cmmlu/cmmlu_ppl.py +4 -0
  34. opencompass/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py +117 -0
  35. opencompass/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py +122 -0
  36. opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py +4 -0
  37. opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen_d380d0.py +50 -0
  38. opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl.py +4 -0
  39. opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl_971f48.py +52 -0
  40. opencompass/configs/datasets/demo/demo_cmmlu_base_ppl.py +8 -0
  41. opencompass/configs/datasets/demo/demo_cmmlu_chat_gen.py +8 -0
  42. opencompass/configs/datasets/demo/demo_gsm8k_base_gen.py +7 -0
  43. opencompass/configs/datasets/demo/demo_gsm8k_chat_gen.py +7 -0
  44. opencompass/configs/datasets/demo/demo_math_base_gen.py +7 -0
  45. opencompass/configs/datasets/demo/demo_math_chat_gen.py +7 -0
  46. opencompass/configs/datasets/gpqa/README.md +69 -0
  47. opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_4b5a83.py +49 -0
  48. opencompass/configs/datasets/gpqa/gpqa_gen.py +4 -0
  49. opencompass/configs/datasets/gpqa/gpqa_gen_015262.py +46 -0
  50. opencompass/configs/datasets/gpqa/gpqa_gen_4baadb.py +46 -0
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .FewCLUE_bustm_gen_634f41 import bustm_datasets  # noqa: F401, F403
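These four-line stubs are pure indirection: they re-export the concrete config so that `FewCLUE_bustm_gen` always points at the currently recommended variant (here `_634f41`). A minimal sketch of how such a file is typically consumed, assuming an OpenCompass checkout (the path below is illustrative):

    # Sketch: materialize the config with mmengine; imports made inside
    # read_base() are resolved into plain dicts on the returned Config.
    from mmengine.config import Config

    cfg = Config.fromfile(
        'opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py')
    for ds in cfg['bustm_datasets']:
        print(ds['abbr'], ds['path'])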
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py ADDED
@@ -0,0 +1,53 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import AFQMCDatasetV2
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+bustm_reader_cfg = dict(
+    input_columns=['sentence1', 'sentence2'],
+    output_column='label',
+    test_split='train')
+
+bustm_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?\nA. 无关\nB. 相关\n请从“A”,“B”中进行选择。\n答:',
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+bustm_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=first_capital_postprocess),
+)
+
+bustm_datasets = [
+    dict(
+        abbr='bustm-dev',
+        type=AFQMCDatasetV2,  # bustm shares the same format as AFQMC
+        path='./data/FewCLUE/bustm/dev_few_all.json',
+        local_mode=True,
+        reader_cfg=bustm_reader_cfg,
+        infer_cfg=bustm_infer_cfg,
+        eval_cfg=bustm_eval_cfg,
+    ),
+    dict(
+        abbr='bustm-test',
+        type=AFQMCDatasetV2,  # bustm shares the same format as AFQMC
+        path='./data/FewCLUE/bustm/test_public.json',
+        local_mode=True,
+        reader_cfg=bustm_reader_cfg,
+        infer_cfg=bustm_infer_cfg,
+        eval_cfg=bustm_eval_cfg,
+    ),
+]
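In this gen-style config the model answers freely after '答:', so scoring hinges on the postprocessor: `first_capital_postprocess` reduces a verbose reply to the single option letter that AccEvaluator compares against the label. A hypothetical re-implementation of that helper (the real one lives in `opencompass.utils.text_postprocessors`; the behavior shown is assumed, not copied):

    # Hypothetical sketch of what first_capital_postprocess does: return
    # the first capital letter, so 'B。两句话相关…' is scored simply as 'B'.
    def first_capital_postprocess_sketch(text: str) -> str:
        for ch in text:
            if ch.isupper():
                return ch
        return ''

    assert first_capital_postprocess_sketch('答:B。两句话相关') == 'B'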
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .FewCLUE_bustm_ppl_e53034 import bustm_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_4b16c0.py ADDED
@@ -0,0 +1,65 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+bustm_reader_cfg = dict(
+    input_columns=['sentence1', 'sentence2'],
+    output_column='label',
+    test_split='train')
+
+bustm_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0:
+            dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt='请判断以下两句话说的是否是一个意思:')
+                ],
+                round=[
+                    dict(role='HUMAN', prompt='{sentence1},{sentence2}'),
+                    dict(role='BOT', prompt='两句话说的毫不相关。')
+                ]),
+            1:
+            dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt='请判断以下两句话说的是否是一个意思:')
+                ],
+                round=[
+                    dict(role='HUMAN', prompt='{sentence1},{sentence2}'),
+                    dict(role='BOT', prompt='两句话说的是一个意思。')
+                ]),
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+bustm_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='bustm-dev',
+        path='json',
+        data_files='./data/FewCLUE/bustm/dev_few_all.json',
+        split='train',
+        reader_cfg=bustm_reader_cfg,
+        infer_cfg=bustm_infer_cfg,
+        eval_cfg=bustm_eval_cfg),
+    dict(
+        type=HFDataset,
+        abbr='bustm-test',
+        path='json',
+        data_files='./data/FewCLUE/bustm/test_public.json',
+        split='train',
+        reader_cfg=bustm_reader_cfg,
+        infer_cfg=bustm_infer_cfg,
+        eval_cfg=bustm_eval_cfg)
+]
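The ppl-style variant swaps free generation for label scoring: the template dict is keyed by the integer labels from the `label` column, each key maps to a complete dialogue ending in that label's verbalization, and PPLInferencer keeps the label whose filled-in transcript the model finds most probable. A rough sketch of that selection rule (illustrative; `avg_nll` is a made-up stand-in for the model scoring inside PPLInferencer):

    # Sketch of perplexity-based label selection over per-label templates.
    def classify_by_ppl(templates: dict, example: dict, avg_nll) -> int:
        # avg_nll(prompt) -> average negative log-likelihood under the model
        scores = {
            label: avg_nll(template.format(**example))
            for label, template in templates.items()
        }
        return min(scores, key=scores.get)  # lowest NLL wins

    templates = {0: '{sentence1},{sentence2},两句话说的毫不相关。',
                 1: '{sentence1},{sentence2},两句话说的是一个意思。'}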
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_9ef540.py ADDED
@@ -0,0 +1,43 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+bustm_reader_cfg = dict(
+    input_columns=['sentence1', 'sentence2'],
+    output_column='label',
+    test_split='train')
+
+bustm_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0: '{sentence1}。\n{sentence2}。\n两句话说的毫不相关。',
+            1: '{sentence1}。\n{sentence2}。\n两句话说的一个意思。'
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+bustm_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='bustm-dev',
+        path='json',
+        data_files='./data/FewCLUE/bustm/dev_few_all.json',
+        split='train',
+        reader_cfg=bustm_reader_cfg,
+        infer_cfg=bustm_infer_cfg,
+        eval_cfg=bustm_eval_cfg),
+    dict(
+        type=HFDataset,
+        abbr='bustm-test',
+        path='json',
+        data_files='./data/FewCLUE/bustm/test_public.json',
+        split='train',
+        reader_cfg=bustm_reader_cfg,
+        infer_cfg=bustm_infer_cfg,
+        eval_cfg=bustm_eval_cfg)
+]
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_e53034.py ADDED
@@ -0,0 +1,59 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+bustm_reader_cfg = dict(
+    input_columns=['sentence1', 'sentence2'],
+    output_column='label',
+    test_split='train')
+
+bustm_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0:
+            dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?'
+                ),
+                dict(role='BOT', prompt='两句话说的毫不相关。')
+            ]),
+            1:
+            dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?'
+                ),
+                dict(role='BOT', prompt='两句话说的是一个意思。')
+            ]),
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+bustm_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='bustm-dev',
+        path='json',
+        data_files='./data/FewCLUE/bustm/dev_few_all.json',
+        split='train',
+        reader_cfg=bustm_reader_cfg,
+        infer_cfg=bustm_infer_cfg,
+        eval_cfg=bustm_eval_cfg),
+    dict(
+        type=HFDataset,
+        abbr='bustm-test',
+        path='json',
+        data_files='./data/FewCLUE/bustm/test_public.json',
+        split='train',
+        reader_cfg=bustm_reader_cfg,
+        infer_cfg=bustm_infer_cfg,
+        eval_cfg=bustm_eval_cfg)
+]
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py ADDED
@@ -0,0 +1,52 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import CMNLIDatasetV2
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+ocnli_fc_reader_cfg = dict(
+    input_columns=['sentence1', 'sentence2'],
+    output_column='label',
+    test_split='train')
+
+ocnli_fc_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                '阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}\nA. 对\nB. 错\nC. 可能\n请从“A”,“B”,“C”中进行选择。\n答:'
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+ocnli_fc_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=first_capital_postprocess),
+)
+
+ocnli_fc_datasets = [
+    dict(
+        abbr='ocnli_fc-dev',
+        type=CMNLIDatasetV2,  # ocnli_fc shares the same format as cmnli
+        path='./data/FewCLUE/ocnli/dev_few_all.json',
+        local_mode=True,
+        reader_cfg=ocnli_fc_reader_cfg,
+        infer_cfg=ocnli_fc_infer_cfg,
+        eval_cfg=ocnli_fc_eval_cfg,
+    ),
+    dict(
+        abbr='ocnli_fc-test',
+        type=CMNLIDatasetV2,  # ocnli_fc shares the same format as cmnli
+        path='./data/FewCLUE/ocnli/test_public.json',
+        local_mode=True,
+        reader_cfg=ocnli_fc_reader_cfg,
+        infer_cfg=ocnli_fc_infer_cfg,
+        eval_cfg=ocnli_fc_eval_cfg,
+    ),
+]
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py ADDED
@@ -0,0 +1,60 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+ocnli_fc_reader_cfg = dict(
+    input_columns=['sentence1', 'sentence2'],
+    output_column='label',
+    test_split='train')
+
+ocnli_fc_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            'contradiction':
+            dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
+                dict(role='BOT', prompt='错')
+            ]),
+            'entailment':
+            dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
+                dict(role='BOT', prompt='对')
+            ]),
+            'neutral':
+            dict(round=[
+                dict(
+                    role='HUMAN', prompt='如果{sentence1}为真,那么{sentence2}也为真吗?'),
+                dict(role='BOT', prompt='可能')
+            ]),
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ocnli_fc_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='ocnli_fc-dev',
+        path='json',
+        split='train',
+        data_files='./data/FewCLUE/ocnli/dev_few_all.json',
+        reader_cfg=ocnli_fc_reader_cfg,
+        infer_cfg=ocnli_fc_infer_cfg,
+        eval_cfg=ocnli_fc_eval_cfg),
+    dict(
+        type=HFDataset,
+        abbr='ocnli_fc-test',
+        path='json',
+        split='train',
+        data_files='./data/FewCLUE/ocnli/test_public.json',
+        reader_cfg=ocnli_fc_reader_cfg,
+        infer_cfg=ocnli_fc_infer_cfg,
+        eval_cfg=ocnli_fc_eval_cfg)
+]
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py ADDED
@@ -0,0 +1,44 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+ocnli_fc_reader_cfg = dict(
+    input_columns=['sentence1', 'sentence2'],
+    output_column='label',
+    test_split='train')
+
+ocnli_fc_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            'contradiction':
+            '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:错',
+            'entailment': '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:对',
+            'neutral': '如果{sentence1}为真,那么{sentence2}也为真吗?可能'
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ocnli_fc_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='ocnli_fc-dev',
+        path='json',
+        split='train',
+        data_files='./data/FewCLUE/ocnli/dev_few_all.json',
+        reader_cfg=ocnli_fc_reader_cfg,
+        infer_cfg=ocnli_fc_infer_cfg,
+        eval_cfg=ocnli_fc_eval_cfg),
+    dict(
+        type=HFDataset,
+        abbr='ocnli_fc-test',
+        path='json',
+        split='train',
+        data_files='./data/FewCLUE/ocnli/test_public.json',
+        reader_cfg=ocnli_fc_reader_cfg,
+        infer_cfg=ocnli_fc_infer_cfg,
+        eval_cfg=ocnli_fc_eval_cfg)
+]
opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .GaokaoBench_gen_5cfe9e import GaokaoBench_datasets  # noqa: F401, F403
opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py ADDED
@@ -0,0 +1,304 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GaokaoBenchDataset
+
+
+_MCQ_prompts = [
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_Math_II_MCQs',
+        'prefix_prompt':
+        '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_Math_I_MCQs',
+        'prefix_prompt':
+        '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2022_History_MCQs',
+        'prefix_prompt':
+        '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2022_Biology_MCQs',
+        'prefix_prompt':
+        '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2022_Political_Science_MCQs',
+        'prefix_prompt':
+        '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'multi_choice',
+        'keyword':
+        '2010-2022_Physics_MCQs',
+        'prefix_prompt':
+        '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2022_Chemistry_MCQs',
+        'prefix_prompt':
+        '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2013_English_MCQs',
+        'prefix_prompt':
+        '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_Chinese_Modern_Lit',
+        'prefix_prompt':
+        '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_English_Fill_in_Blanks',
+        'prefix_prompt':
+        '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'five_out_of_seven',
+        'keyword':
+        '2012-2022_English_Cloze_Test',
+        'prefix_prompt':
+        '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_Geography_MCQs',
+        'prefix_prompt':
+        '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_English_Reading_Comp',
+        'prefix_prompt':
+        '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_Chinese_Lang_and_Usage_MCQs',
+        'prefix_prompt':
+        '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:'
+    },
+]
+_FBQ_prompts = [{
+    'type': 'cloze',
+    'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
+    'prefix_prompt':
+    '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
+    'comment': ''
+}, {
+    'type': 'cloze',
+    'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
+    'prefix_prompt':
+    '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
+    'comment': ''
+}, {
+    'type': 'cloze',
+    'keyword':
+    '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
+    'prefix_prompt':
+    '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+    'comment': ''
+}, {
+    'type': 'cloze',
+    'keyword': '2014-2022_English_Language_Cloze_Passage',
+    'prefix_prompt':
+    '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+    'comment': ''
+}]
+_OEQ_prompts = [
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Geography_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chemistry_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Math_I_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_History_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Biology_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Math_II_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Physics_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Political_Science_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'correction',
+        'keyword': '2012-2022_English_Language_Error_Correction',
+        'prefix_prompt':
+        '请解答下面的英语短文改错题,仔细阅读题目并充分结合你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
+        # "prefix_prompt": [
+        #     "请解答下面的英语短文改错题,仔细阅读题目并充分结合你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
+        #     "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
+        # ],
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
+        'prefix_prompt':
+        '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
+        'prefix_prompt':
+        '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
+        'prefix_prompt':
+        '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
+        'prefix_prompt':
+        '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword':
+        '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    }
+]
+
+GaokaoBench_datasets = []
+for _folder, _prompts in [
+    ('Multiple-choice_Questions', _MCQ_prompts),
+    ('Fill-in-the-blank_Questions', _FBQ_prompts),
+    ('Open-ended_Questions', _OEQ_prompts),
+]:
+    for _p in _prompts:
+        _reader_cfg = {
+            'input_columns': ['question'],
+            'output_column': 'answer',
+        }
+        _infer_cfg = {
+            'ice_template': {
+                'type': PromptTemplate,
+                'template': {
+                    'round': [{
+                        'role': 'HUMAN',
+                        'prompt': _p['prefix_prompt'] + '{question}'
+                    }]
+                },
+                'ice_token': '</E>'
+            },
+            'retriever': {
+                'type': ZeroRetriever
+            },
+            'inferencer': {
+                'type': GenInferencer,
+                'max_out_len': 1024,
+            }
+        }
+        _eval_cfg = {
+            'evaluator': {
+                'type': 'GaokaoBenchEvaluator' + '_' + _p['type'],
+            },
+            'pred_role': 'BOT',
+        }
+        _base_path = 'opencompass/GAOKAO-BENCH'
+        _dataset = {
+            'type': GaokaoBenchDataset,
+            'abbr': 'GaokaoBench_' + _p['keyword'],
+            'path': _base_path,
+            'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
+            'name': _p['keyword'],
+            'reader_cfg': _reader_cfg,
+            'infer_cfg': _infer_cfg,
+            'eval_cfg': _eval_cfg,
+        }
+
+        GaokaoBench_datasets.append(_dataset)
+
+_temporary_variables = [k for k in globals() if k.startswith('_')]
+for _t in _temporary_variables:
+    del globals()[_t]
+del _temporary_variables, _t
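Two idioms in this file are easy to miss: the evaluator is chosen by string concatenation ('GaokaoBenchEvaluator_' + type), so each subset is paired with a type-specific checker, and the trailing loop deletes every `_`-prefixed module-level name so that only `GaokaoBench_datasets` is exported into the config namespace. A quick inspection sketch (path illustrative; assumes an OpenCompass checkout):

    from mmengine.config import Config

    cfg = Config.fromfile(
        'opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py')
    for ds in cfg['GaokaoBench_datasets'][:3]:
        # e.g. GaokaoBench_2010-2022_Math_II_MCQs -> GaokaoBenchEvaluator_single_choice
        print(ds['abbr'], '->', ds['eval_cfg']['evaluator']['type'])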
opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .GaokaoBench_mixed_9af5ee import GaokaoBench_datasets  # noqa: F401, F403
opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py ADDED
@@ -0,0 +1,356 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+from opencompass.datasets import GaokaoBenchDataset
+_MCQ_prompts = [
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_Math_II_MCQs',
+        'prefix_prompt':
+        '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_Math_I_MCQs',
+        'prefix_prompt':
+        '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2022_History_MCQs',
+        'prefix_prompt':
+        '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2022_Biology_MCQs',
+        'prefix_prompt':
+        '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2022_Political_Science_MCQs',
+        'prefix_prompt':
+        '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'multi_choice',
+        'keyword':
+        '2010-2022_Physics_MCQs',
+        'prefix_prompt':
+        '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2022_Chemistry_MCQs',
+        'prefix_prompt':
+        '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'single_choice',
+        'keyword':
+        '2010-2013_English_MCQs',
+        'prefix_prompt':
+        '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_Chinese_Modern_Lit',
+        'prefix_prompt':
+        '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_English_Fill_in_Blanks',
+        'prefix_prompt':
+        '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'five_out_of_seven',
+        'keyword':
+        '2012-2022_English_Cloze_Test',
+        'prefix_prompt':
+        '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_Geography_MCQs',
+        'prefix_prompt':
+        '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_English_Reading_Comp',
+        'prefix_prompt':
+        '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
+    },
+    {
+        'type':
+        'multi_question_choice',
+        'keyword':
+        '2010-2022_Chinese_Lang_and_Usage_MCQs',
+        'prefix_prompt':
+        '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:'
+    },
+]
+_FBQ_prompts = [{
+    'type': 'cloze',
+    'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
+    'prefix_prompt':
+    '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
+    'comment': ''
+}, {
+    'type': 'cloze',
+    'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
+    'prefix_prompt':
+    '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
+    'comment': ''
+}, {
+    'type': 'cloze',
+    'keyword':
+    '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
+    'prefix_prompt':
+    '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+    'comment': ''
+}, {
+    'type': 'cloze',
+    'keyword': '2014-2022_English_Language_Cloze_Passage',
+    'prefix_prompt':
+    '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+    'comment': ''
+}]
+_OEQ_prompts = [
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Geography_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chemistry_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Math_I_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_History_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Biology_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Math_II_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Physics_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Political_Science_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'correction',
+        'keyword': '2012-2022_English_Language_Error_Correction',
+        'prefix_prompt':
+        '请解答下面的英语短文改错题,仔细阅读题目并充分结合你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
+        # "prefix_prompt": [
+        #     "请解答下面的英语短文改错题,仔细阅读题目并充分结合你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
+        #     "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
+        # ],
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
+        'prefix_prompt':
+        '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
+        'prefix_prompt':
+        '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
+        'prefix_prompt':
+        '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
+        'prefix_prompt':
+        '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    },
+    {
+        'type': 'subjective',
+        'keyword':
+        '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
+        'prefix_prompt':
+        '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
+        'comment': ''
+    }
+]
+
+GaokaoBench_datasets = []
+for _folder, _prompts in [
+    ('Multiple-choice_Questions', _MCQ_prompts),
+    ('Fill-in-the-blank_Questions', _FBQ_prompts),
+    ('Open-ended_Questions', _OEQ_prompts),
+]:
+    for _p in _prompts:
+        if _p['type'] == 'single_choice':
+            continue
+        _reader_cfg = {
+            'input_columns': ['question'],
+            'output_column': 'answer',
+        }
+        _infer_cfg = {
+            'ice_template': {
+                'type': PromptTemplate,
+                'template': {
+                    'round': [{
+                        'role': 'HUMAN',
+                        'prompt': _p['prefix_prompt'] + '{question}'
+                    }]
+                },
+                'ice_token': '</E>'
+            },
+            'retriever': {
+                'type': ZeroRetriever
+            },
+            'inferencer': {
+                'type': GenInferencer,
+                'max_out_len': 1024,
+            }
+        }
+        _eval_cfg = {
+            'evaluator': {
+                'type': 'GaokaoBenchEvaluator' + '_' + _p['type'],
+            },
+            'pred_role': 'BOT',
+        }
+        _base_path = './data/GAOKAO-BENCH/data'
+        _dataset = {
+            'type': GaokaoBenchDataset,
+            'abbr': 'GaokaoBench_' + _p['keyword'],
+            'path': _base_path,
+            'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
+            'name': _p['keyword'],
+            'reader_cfg': _reader_cfg,
+            'infer_cfg': _infer_cfg,
+            'eval_cfg': _eval_cfg,
+        }
+
+        GaokaoBench_datasets.append(_dataset)
+
+_folder = 'Multiple-choice_Questions'
+for _p in _MCQ_prompts:
+    if _p['type'] != 'single_choice':
+        continue
+    _reader_cfg = {
+        'input_columns': ['question'],
+        'output_column': 'answer',
+    }
+    _infer_cfg = {
+        'ice_template': {
+            'type': PromptTemplate,
+            'template': {
+                answer: {
+                    'round': [{
+                        'role': 'HUMAN',
+                        'prompt': _p['prefix_prompt'] + '{question}'
+                    }, {
+                        'role': 'BOT',
+                        'prompt': f'【答案】{answer} <eoa>'
+                    }]
+                }
+                for answer in ['A', 'B', 'C', 'D']
+            },
+            'ice_token': '</E>'
+        },
+        'retriever': {
+            'type': ZeroRetriever
+        },
+        'inferencer': {
+            'type': PPLInferencer
+        }
+    }
+    _eval_cfg = {
+        'evaluator': {
+            'type': 'GaokaoBenchEvaluator' + '_' + _p['type'],
+        },
+        'pred_role': 'BOT',
+    }
+    _base_path = 'opencompass/GAOKAO-BENCH'
+    _dataset = {
+        'type': GaokaoBenchDataset,
+        'abbr': 'GaokaoBench_' + _p['keyword'],
+        'path': _base_path,
+        'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
+        'name': _p['keyword'],
+        'reader_cfg': _reader_cfg,
+        'infer_cfg': _infer_cfg,
+        'eval_cfg': _eval_cfg,
+    }
+
+    GaokaoBench_datasets.append(_dataset)
+
+_temporary_variables = [k for k in globals() if k.startswith('_')]
+for _t in _temporary_variables:
+    del globals()[_t]
+del _temporary_variables, _t
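Unlike the pure gen config, this mixed variant re-adds the single_choice subsets with a PPL setup: the dict comprehension near the end builds one chat template per option, so every question is scored four times (once per candidate answer) and PPLInferencer keeps the best-scoring option. Expanded by hand for one option, the comprehension yields entries of this shape (a sketch; `prefix` stands in for `_p['prefix_prompt']`):

    # What the {answer: {...} for answer in ['A', 'B', 'C', 'D']}
    # comprehension produces for answer == 'A' (illustrative values).
    prefix = '请你做一道数学选择题……题目如下:'
    template_A = {
        'round': [
            {'role': 'HUMAN', 'prompt': prefix + '{question}'},
            {'role': 'BOT', 'prompt': '【答案】A <eoa>'},
        ]
    }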
opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py ADDED
@@ -0,0 +1,45 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GaokaoBenchDataset
+from mmengine.config import read_base
+
+with read_base():
+    from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
+
+GaokaoBench_datasets = []
+for folder, prompts in [
+    ('Multiple-choice_Questions', MCQ_prompts),
+    ('Fill-in-the-blank_Questions', FBQ_prompts),
+]:
+    for p in prompts:
+        reader_cfg = {
+            'input_columns': ['question'],
+            'output_column': 'answer',
+        }
+        infer_cfg = {
+            'ice_template': {
+                'type': PromptTemplate,
+                'template': {'round': [{'role': 'HUMAN', 'prompt': p['prefix_prompt'] + '{question}'}]},
+                'ice_token': '</E>',
+            },
+            'retriever': {'type': ZeroRetriever},
+            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
+        }
+        eval_cfg = {
+            'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
+            'pred_role': 'BOT',
+        }
+        _base_path = 'opencompass/GAOKAO-BENCH'
+        dataset = {
+            'type': GaokaoBenchDataset,
+            'abbr': 'GaokaoBench_' + p['keyword'],
+            'path': _base_path,
+            'filename': '/' + folder + '/' + p['keyword'] + '.json',
+            'name': p['keyword'],
+            'reader_cfg': reader_cfg,
+            'infer_cfg': infer_cfg,
+            'eval_cfg': eval_cfg,
+        }
+        GaokaoBench_datasets.append(dataset)
opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py ADDED
@@ -0,0 +1,44 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GaokaoBenchDataset
+from mmengine.config import read_base
+
+with read_base():
+    from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
+
+GaokaoBench_datasets = []
+for folder, prompts in [
+    ('Multiple-choice_Questions', MCQ_prompts),
+    ('Fill-in-the-blank_Questions', FBQ_prompts),
+]:
+    for p in prompts:
+        reader_cfg = {
+            'input_columns': ['question'],
+            'output_column': 'answer',
+        }
+        infer_cfg = {
+            'prompt_template': {
+                'type': PromptTemplate,
+                'template': p['prefix_prompt'] + '{question}',
+            },
+            'retriever': {'type': ZeroRetriever},
+            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
+        }
+        eval_cfg = {
+            'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
+            'pred_role': 'BOT',
+        }
+        _base_path = 'opencompass/GAOKAO-BENCH'
+        dataset = {
+            'type': GaokaoBenchDataset,
+            'abbr': 'GaokaoBench_' + p['keyword'],
+            'path': _base_path,
+            'filename': '/' + folder + '/' + p['keyword'] + '.json',
+            'name': p['keyword'],
+            'reader_cfg': reader_cfg,
+            'infer_cfg': infer_cfg,
+            'eval_cfg': eval_cfg,
+        }
+        GaokaoBench_datasets.append(dataset)
opencompass/configs/datasets/GaokaoBench/GaokaoBench_prompts.py ADDED
@@ -0,0 +1,191 @@
+
+MCQ_prompts = [
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_Math_II_MCQs',
+        'prefix_prompt': '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_Math_I_MCQs',
+        'prefix_prompt': '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_History_MCQs',
+        'prefix_prompt': '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+    },
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_Biology_MCQs',
+        'prefix_prompt': '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+    },
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_Political_Science_MCQs',
+        'prefix_prompt': '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+    },
+    {
+        'type': 'multi_choice',
+        'keyword': '2010-2022_Physics_MCQs',
+        'prefix_prompt': '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n',
+    },
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2022_Chemistry_MCQs',
+        'prefix_prompt': '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+    },
+    {
+        'type': 'single_choice',
+        'keyword': '2010-2013_English_MCQs',
+        'prefix_prompt': '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
+    },
+    {
+        'type': 'multi_question_choice',
+        'keyword': '2010-2022_Chinese_Modern_Lit',
+        'prefix_prompt': '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
+    },
+    {
+        'type': 'multi_question_choice',
+        'keyword': '2010-2022_English_Fill_in_Blanks',
+        'prefix_prompt': '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
+    },
+    {
+        'type': 'five_out_of_seven',
+        'keyword': '2012-2022_English_Cloze_Test',
+        'prefix_prompt': '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n',
+    },
+    {
+        'type': 'multi_question_choice',
+        'keyword': '2010-2022_Geography_MCQs',
+        'prefix_prompt': '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
+    },
+    {
+        'type': 'multi_question_choice',
+        'keyword': '2010-2022_English_Reading_Comp',
+        'prefix_prompt': '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
+    },
+    {
+        'type': 'multi_question_choice',
+        'keyword': '2010-2022_Chinese_Lang_and_Usage_MCQs',
+        'prefix_prompt': '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:',
+    },
+]
+FBQ_prompts = [
+    {
+        'type': 'cloze',
+        'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
+        'prefix_prompt': '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'cloze',
+        'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
+        'prefix_prompt': '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'cloze',
+        'keyword': '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
+        'prefix_prompt': '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'cloze',
+        'keyword': '2014-2022_English_Language_Cloze_Passage',
+        'prefix_prompt': '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+]
+OEQ_prompts = [
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Geography_Open-ended_Questions',
+        'prefix_prompt': '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chemistry_Open-ended_Questions',
+        'prefix_prompt': '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Math_I_Open-ended_Questions',
+        'prefix_prompt': '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_History_Open-ended_Questions',
+        'prefix_prompt': '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Biology_Open-ended_Questions',
+        'prefix_prompt': '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Math_II_Open-ended_Questions',
+        'prefix_prompt': '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Physics_Open-ended_Questions',
+        'prefix_prompt': '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Political_Science_Open-ended_Questions',
+        'prefix_prompt': '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'correction',
+        'keyword': '2012-2022_English_Language_Error_Correction',
+        'prefix_prompt': '请解答下面的英语短文改错题,仔细阅读题目并充分结合你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
+        # "prefix_prompt": [
+        #     "请解答下面的英语短文改错题,仔细阅读题目并充分结合你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
+        #     "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
+        # ],
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
+        'prefix_prompt': '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
+        'prefix_prompt': '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
+        'prefix_prompt': '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
+        'prefix_prompt': '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
+        'comment': '',
+    },
+    {
+        'type': 'subjective',
+        'keyword': '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
188
+ 'prefix_prompt': '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
189
+ 'comment': '',
190
+ },
191
+ ]
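
The three tables above (`MCQ_prompts`, `FBQ_prompts`, `OEQ_prompts`) are keyed by dataset `keyword`, and each `prefix_prompt` is prepended to the exam question before inference. A minimal sketch of that lookup follows; `build_gaokao_prompt` is a hypothetical helper for illustration only, not part of the config, since GaokaoBench's own pipeline performs the equivalent step internally:

```python
# Hypothetical helper, for illustration only: find the prompt entry whose
# 'keyword' matches the dataset split, then prepend its prefix to the question.
def build_gaokao_prompt(prompts, keyword, question):
    entry = next(p for p in prompts if p['keyword'] == keyword)
    return entry['prefix_prompt'] + question

# Usage sketch with the OEQ table above (the question text is invented):
# text = build_gaokao_prompt(
#     OEQ_prompts,
#     '2010-2022_Geography_Open-ended_Questions',
#     '阅读图文材料,完成下列要求。…')
```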
opencompass/configs/datasets/GaokaoBench/README.md ADDED
@@ -0,0 +1,191 @@
+ # GaokaoBench
+
+ ```bash
+ python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
+ python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
+ ```
+
+ ## Base Models
+
+ | model | GaokaoBench |
+ |:------------------------:|--------------:|
+ | llama-7b-turbomind | 14.55 |
+ | llama-13b-turbomind | 16.20 |
+ | llama-30b-turbomind | 16.14 |
+ | llama-65b-turbomind | 13.31 |
+ | llama-2-7b-turbomind | 15.02 |
+ | llama-2-13b-turbomind | 14.86 |
+ | llama-2-70b-turbomind | 16.36 |
+ | llama-3-8b-turbomind | 20.88 |
+ | llama-3-70b-turbomind | 19.98 |
+ | internlm2-1.8b-turbomind | 23.78 |
+ | internlm2-7b-turbomind | 41.41 |
+ | internlm2-20b-turbomind | 58.99 |
+ | qwen-1.8b-turbomind | 22.11 |
+ | qwen-7b-turbomind | 35.32 |
+ | qwen-14b-turbomind | 54.07 |
+ | qwen-72b-turbomind | 77.56 |
+ | qwen1.5-0.5b-hf | 30.67 |
+ | qwen1.5-1.8b-hf | 35.66 |
+ | qwen1.5-4b-hf | 54.31 |
+ | qwen1.5-7b-hf | 65.99 |
+ | qwen1.5-14b-hf | 66.60 |
+ | qwen1.5-32b-hf | 79.01 |
+ | qwen1.5-72b-hf | 80.26 |
+ | qwen1.5-moe-a2-7b-hf | 52.79 |
+ | mistral-7b-v0.1-hf | 14.35 |
+ | mistral-7b-v0.2-hf | 11.10 |
+ | mixtral-8x7b-v0.1-hf | 8.40 |
+ | mixtral-8x22b-v0.1-hf | 16.23 |
+ | yi-6b-hf | 31.70 |
+ | yi-34b-hf | 30.51 |
+ | deepseek-7b-base-hf | 17.02 |
+ | deepseek-67b-base-hf | 10.14 |
+
+ ### Details
+
+ | model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
+ |:------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
+ | llama-7b-turbomind | 14.22 | 13.55 | 12.54 | 18.67 | 19.06 | 2.34 | 17.74 |
+ | llama-13b-turbomind | 18.81 | 15.89 | 21.25 | 22.67 | 15.62 | 1.56 | 25.81 |
+ | llama-30b-turbomind | 20.64 | 19.16 | 27.18 | 16.67 | 16.56 | 2.34 | 12.10 |
+ | llama-65b-turbomind | 21.10 | 15.89 | 11.50 | 20.00 | 5.94 | 1.56 | 21.77 |
+ | llama-2-7b-turbomind | 16.97 | 16.36 | 20.91 | 22.00 | 18.75 | 2.34 | 11.29 |
+ | llama-2-13b-turbomind | 14.68 | 11.68 | 26.13 | 16.00 | 17.81 | 2.34 | 20.97 |
+ | llama-2-70b-turbomind | 18.81 | 12.15 | 26.13 | 16.00 | 20.31 | 4.69 | 16.13 |
+ | llama-3-8b-turbomind | 4.13 | 7.94 | 37.63 | 24.67 | 26.25 | 5.47 | 21.77 |
+ | llama-3-70b-turbomind | 4.59 | 3.12 | 20.83 | 10.94 | 18.00 | 6.25 | 15.62 |
+ | internlm2-1.8b-turbomind | 20.64 | 22.90 | 39.72 | 30.00 | 25.94 | 10.94 | 31.45 |
+ | internlm2-7b-turbomind | 33.94 | 35.51 | 38.33 | 59.33 | 61.56 | 2.34 | 11.29 |
+ | internlm2-20b-turbomind | 59.17 | 51.40 | 65.16 | 74.00 | 82.19 | 28.91 | 54.03 |
+ | qwen-1.8b-turbomind | 29.36 | 30.84 | 19.51 | 26.00 | 22.19 | 5.47 | 27.42 |
+ | qwen-7b-turbomind | 22.48 | 28.04 | 45.64 | 43.33 | 62.19 | 3.91 | 33.87 |
+ | qwen-14b-turbomind | 54.13 | 56.25 | 82.93 | 72.00 | 85.00 | 4.69 | 65.62 |
+ | qwen-72b-turbomind | 73.12 | 64.49 | 91.67 | 90.62 | 58.75 | 44.53 | 79.03 |
+ | qwen1.5-0.5b-hf | 26.61 | 32.71 | 32.40 | 34.67 | 53.44 | 10.94 | 28.23 |
+ | qwen1.5-1.8b-hf | 36.24 | 33.18 | 56.45 | 36.00 | 49.38 | 6.25 | 33.06 |
+ | qwen1.5-4b-hf | 45.41 | 37.85 | 68.29 | 62.00 | 87.81 | 5.47 | 47.58 |
+ | qwen1.5-7b-hf | 56.42 | 53.74 | 85.02 | 69.33 | 86.88 | 28.12 | 70.16 |
+ | qwen1.5-14b-hf | 69.27 | 63.08 | 54.01 | 79.33 | 76.56 | 40.62 | 79.84 |
+ | qwen1.5-32b-hf | 71.10 | 61.68 | 92.68 | 93.33 | 95.94 | 45.31 | 83.06 |
+ | qwen1.5-72b-hf | 71.15 | 68.22 | 94.44 | 96.67 | 95.00 | 38.28 | 75.00 |
+ | qwen1.5-moe-a2-7b-hf | 35.32 | 29.44 | 68.64 | 44.67 | 75.00 | 17.97 | 59.68 |
+ | mistral-7b-v0.1-hf | 13.76 | 12.15 | 9.76 | 8.00 | 5.94 | 0.00 | 17.74 |
+ | mistral-7b-v0.2-hf | 6.88 | 5.61 | 10.45 | 12.00 | 4.06 | 0.78 | 14.52 |
+ | mixtral-8x7b-v0.1-hf | 3.67 | 1.87 | 0.35 | 0.00 | 0.00 | 0.78 | 0.81 |
+ | mixtral-8x22b-v0.1-hf | 16.51 | 15.89 | 1.39 | 3.33 | 9.69 | 0.00 | 13.71 |
+ | yi-6b-hf | 6.25 | 3.12 | 40.74 | 43.75 | 35.94 | 8.59 | 31.25 |
+ | yi-34b-hf | 12.50 | 4.17 | 31.11 | 5.00 | 20.62 | 2.34 | 0.89 |
+ | deepseek-7b-base-hf | 14.22 | 13.08 | 25.78 | 20.67 | 20.31 | 5.47 | 18.55 |
+ | deepseek-67b-base-hf | 3.67 | 4.21 | 8.36 | 7.33 | 4.69 | 1.56 | 4.84 |
+
+ | model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
+ |:------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
+ | llama-7b-turbomind | 19.05 | 0.00 | 15.00 | 16.15 | 22.11 | 10.43 | 15.00 |
+ | llama-13b-turbomind | 22.86 | 0.00 | 8.50 | 8.46 | 24.21 | 9.36 | 20.00 |
+ | llama-30b-turbomind | 28.57 | 0.00 | 6.33 | 13.85 | 23.16 | 12.98 | 12.50 |
+ | llama-65b-turbomind | 21.90 | 0.00 | 8.00 | 13.85 | 16.84 | 12.34 | 10.00 |
+ | llama-2-7b-turbomind | 20.95 | 0.00 | 6.17 | 12.31 | 22.11 | 11.28 | 11.25 |
+ | llama-2-13b-turbomind | 16.19 | 0.00 | 9.83 | 13.08 | 22.11 | 7.66 | 10.00 |
+ | llama-2-70b-turbomind | 31.43 | 0.00 | 4.17 | 13.08 | 25.26 | 20.43 | 7.50 |
+ | llama-3-8b-turbomind | 1.90 | 1.15 | 42.00 | 7.69 | 29.47 | 17.66 | 17.50 |
+ | llama-3-70b-turbomind | 18.75 | 3.45 | 53.67 | 76.15 | 18.60 | 36.76 | 8.75 |
+ | internlm2-1.8b-turbomind | 33.33 | 3.45 | 15.67 | 13.85 | 32.63 | 10.43 | 25.00 |
+ | internlm2-7b-turbomind | 61.90 | 20.69 | 57.33 | 20.77 | 61.05 | 40.21 | 47.50 |
+ | internlm2-20b-turbomind | 72.38 | 37.93 | 62.33 | 19.23 | 74.74 | 38.51 | 48.75 |
+ | qwen-1.8b-turbomind | 47.62 | 9.20 | 13.50 | 12.31 | 25.26 | 16.38 | 21.25 |
+ | qwen-7b-turbomind | 42.86 | 12.64 | 35.83 | 26.15 | 51.58 | 17.87 | 30.00 |
+ | qwen-14b-turbomind | 89.58 | 3.45 | 5.00 | 23.85 | 93.02 | 21.10 | 40.62 |
+ | qwen-72b-turbomind | 71.43 | 81.25 | 88.17 | 96.25 | 95.79 | 79.57 | 90.00 |
+ | qwen1.5-0.5b-hf | 40.95 | 22.99 | 21.67 | 21.54 | 38.95 | 17.02 | 22.50 |
+ | qwen1.5-1.8b-hf | 85.71 | 29.89 | 22.17 | 30.00 | 34.74 | 20.43 | 27.50 |
+ | qwen1.5-4b-hf | 88.57 | 35.63 | 41.00 | 67.69 | 64.21 | 41.28 | 68.75 |
+ | qwen1.5-7b-hf | 93.33 | 14.94 | 59.33 | 70.00 | 61.05 | 67.87 | 61.25 |
+ | qwen1.5-14b-hf | 94.29 | 16.09 | 59.67 | 76.92 | 90.53 | 59.57 | 77.50 |
+ | qwen1.5-32b-hf | 94.29 | 43.68 | 82.83 | 38.46 | 97.89 | 75.96 | 67.50 |
+ | qwen1.5-72b-hf | 99.05 | 28.74 | 85.62 | 77.69 | 94.74 | 72.77 | 87.50 |
+ | qwen1.5-moe-a2-7b-hf | 65.71 | 36.78 | 51.67 | 75.38 | 72.63 | 61.28 | 33.75 |
+ | mistral-7b-v0.1-hf | 17.14 | 8.05 | 28.33 | 6.92 | 24.21 | 30.43 | 12.50 |
+ | mistral-7b-v0.2-hf | 7.62 | 9.20 | 23.17 | 6.15 | 25.26 | 19.15 | 7.50 |
+ | mixtral-8x7b-v0.1-hf | 0.00 | 4.60 | 33.83 | 10.77 | 37.89 | 25.96 | 3.75 |
+ | mixtral-8x22b-v0.1-hf | 7.62 | 4.17 | 51.33 | 14.62 | 53.68 | 21.91 | 10.00 |
+ | yi-6b-hf | 17.14 | 52.87 | 50.83 | 36.25 | 36.84 | 48.09 | 36.25 |
+ | yi-34b-hf | 0.00 | 59.77 | 76.67 | 86.92 | 67.44 | 61.06 | 81.25 |
+ | deepseek-7b-base-hf | 20.95 | 2.30 | 17.83 | 12.31 | 25.26 | 12.55 | 8.75 |
+ | deepseek-67b-base-hf | 1.90 | 9.20 | 27.33 | 30.00 | 40.00 | 13.19 | 3.75 |
+
+ ## Chat Models
+
+ | model | GaokaoBench |
+ |:-----------------------------:|--------------:|
+ | qwen1.5-0.5b-chat-hf | 21.51 |
+ | qwen1.5-1.8b-chat-hf | 46.19 |
+ | qwen1.5-4b-chat-hf | 59.11 |
+ | qwen1.5-7b-chat-hf | 70.55 |
+ | qwen1.5-14b-chat-hf | 80.39 |
+ | qwen1.5-32b-chat-hf | 86.15 |
+ | qwen1.5-72b-chat-hf | 88.58 |
+ | qwen1.5-110b-chat-hf | 89.59 |
+ | internlm2-chat-1.8b-hf | 29.73 |
+ | internlm2-chat-1.8b-sft-hf | 28.79 |
+ | internlm2-chat-7b-hf | 54.54 |
+ | internlm2-chat-7b-sft-hf | 55.39 |
+ | internlm2-chat-20b-hf | 57.95 |
+ | internlm2-chat-20b-sft-hf | 57.62 |
+ | llama-3-8b-instruct-hf | 45.48 |
+ | llama-3-70b-instruct-hf | 65.91 |
+ | llama-3-8b-instruct-lmdeploy | 44.48 |
+ | llama-3-70b-instruct-lmdeploy | 67.06 |
+ | mistral-7b-instruct-v0.1-hf | 26.21 |
+ | mistral-7b-instruct-v0.2-hf | 32.17 |
+ | mixtral-8x7b-instruct-v0.1-hf | 42.46 |
+
+ ### Details
+
+ | model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
+ |:-----------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
+ | qwen1.5-0.5b-chat-hf | 25.23 | 25.70 | 39.02 | 24.67 | 25.00 | 0.78 | 25.00 |
+ | qwen1.5-1.8b-chat-hf | 30.28 | 26.64 | 61.32 | 55.33 | 77.81 | 11.72 | 40.32 |
+ | qwen1.5-4b-chat-hf | 38.53 | 35.05 | 70.73 | 70.00 | 83.44 | 25.00 | 41.13 |
+ | qwen1.5-7b-chat-hf | 49.54 | 39.72 | 81.88 | 82.67 | 90.62 | 46.88 | 61.29 |
+ | qwen1.5-14b-chat-hf | 64.68 | 54.21 | 87.80 | 90.67 | 94.69 | 44.53 | 69.35 |
+ | qwen1.5-32b-chat-hf | 70.92 | 66.14 | 98.02 | 97.74 | 96.07 | 57.81 | 72.92 |
+ | qwen1.5-72b-chat-hf | 76.61 | 68.22 | 95.47 | 96.00 | 97.19 | 64.06 | 86.29 |
+ | qwen1.5-110b-chat-hf | 80.36 | 66.67 | 100.00 | 100.00 | 96.25 | 65.62 | 75.00 |
+ | internlm2-chat-1.8b-hf | 28.44 | 28.50 | 46.69 | 39.33 | 44.38 | 10.16 | 26.61 |
+ | internlm2-chat-1.8b-sft-hf | 23.85 | 20.09 | 55.75 | 40.67 | 53.12 | 14.84 | 30.65 |
+ | internlm2-chat-7b-hf | 45.87 | 42.52 | 77.70 | 75.33 | 76.56 | 16.41 | 38.71 |
+ | internlm2-chat-7b-sft-hf | 49.08 | 39.72 | 80.84 | 68.67 | 81.25 | 29.69 | 42.74 |
+ | internlm2-chat-20b-hf | 53.21 | 46.73 | 80.49 | 74.00 | 85.00 | 31.25 | 37.10 |
+ | internlm2-chat-20b-sft-hf | 51.83 | 47.20 | 86.06 | 78.00 | 88.12 | 35.16 | 45.16 |
+ | llama-3-8b-instruct-hf | 37.16 | 31.31 | 60.98 | 48.67 | 51.25 | 11.72 | 39.52 |
+ | llama-3-70b-instruct-hf | 58.26 | 52.34 | 63.76 | 75.33 | 75.31 | 36.72 | 53.23 |
+ | llama-3-8b-instruct-lmdeploy | 37.61 | 35.51 | 55.05 | 53.33 | 52.19 | 7.81 | 34.68 |
+ | llama-3-70b-instruct-lmdeploy | 75.00 | 55.56 | 61.11 | 73.68 | 70.00 | 40.62 | 43.75 |
+ | mistral-7b-instruct-v0.1-hf | 23.39 | 21.03 | 35.19 | 18.00 | 26.56 | 5.47 | 30.65 |
+ | mistral-7b-instruct-v0.2-hf | 31.19 | 19.63 | 38.33 | 40.00 | 35.94 | 20.31 | 34.68 |
+ | mixtral-8x7b-instruct-v0.1-hf | 41.28 | 37.85 | 52.26 | 47.33 | 50.00 | 25.78 | 43.55 |
+
+ | model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
+ |:-----------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
+ | qwen1.5-0.5b-chat-hf | 32.38 | 10.34 | 0.00 | 2.31 | 27.37 | 15.11 | 18.75 |
+ | qwen1.5-1.8b-chat-hf | 69.52 | 42.53 | 56.33 | 2.31 | 61.05 | 32.98 | 35.00 |
+ | qwen1.5-4b-chat-hf | 70.48 | 58.62 | 82.33 | 16.15 | 68.42 | 68.51 | 47.50 |
+ | qwen1.5-7b-chat-hf | 83.81 | 71.26 | 85.17 | 57.69 | 81.05 | 78.94 | 66.25 |
+ | qwen1.5-14b-chat-hf | 93.33 | 78.16 | 97.17 | 71.54 | 91.58 | 94.26 | 81.25 |
+ | qwen1.5-32b-chat-hf | 100.00 | 81.61 | 95.83 | 90.00 | 97.89 | 92.43 | 92.86 |
+ | qwen1.5-72b-chat-hf | 98.10 | 83.91 | 98.00 | 90.77 | 94.74 | 96.38 | 96.25 |
+ | qwen1.5-110b-chat-hf | 100.00 | 91.95 | 98.50 | 97.69 | 95.35 | 98.44 | 100.00 |
+ | internlm2-chat-1.8b-hf | 38.10 | 6.90 | 0.67 | 1.54 | 56.84 | 23.19 | 30.00 |
+ | internlm2-chat-1.8b-sft-hf | 50.48 | 0.00 | 0.00 | 0.00 | 27.37 | 11.91 | 32.50 |
+ | internlm2-chat-7b-hf | 60.95 | 67.82 | 7.00 | 7.69 | 70.53 | 79.79 | 38.75 |
+ | internlm2-chat-7b-sft-hf | 60.00 | 71.26 | 6.50 | 0.77 | 68.42 | 77.02 | 42.50 |
+ | internlm2-chat-20b-hf | 60.95 | 43.68 | 34.83 | 4.62 | 71.58 | 62.55 | 43.75 |
+ | internlm2-chat-20b-sft-hf | 75.24 | 47.13 | 1.00 | 2.31 | 80.00 | 65.96 | 37.50 |
+ | llama-3-8b-instruct-hf | 50.48 | 36.78 | 30.83 | 21.54 | 57.89 | 81.70 | 28.75 |
+ | llama-3-70b-instruct-hf | 73.33 | 59.77 | 82.83 | 24.62 | 73.68 | 91.28 | 45.00 |
+ | llama-3-8b-instruct-lmdeploy | 52.38 | 42.53 | 21.33 | 18.46 | 58.95 | 81.28 | 26.25 |
+ | llama-3-70b-instruct-lmdeploy | 87.50 | 62.07 | 84.38 | 26.92 | 72.63 | 91.20 | 56.25 |
+ | mistral-7b-instruct-v0.1-hf | 38.10 | 18.39 | 30.50 | 6.15 | 31.58 | 38.72 | 18.75 |
+ | mistral-7b-instruct-v0.2-hf | 41.90 | 31.03 | 28.00 | 20.77 | 29.47 | 42.13 | 15.00 |
+ | mixtral-8x7b-instruct-v0.1-hf | 49.52 | 39.08 | 41.33 | 9.23 | 44.21 | 43.19 | 21.25 |
opencompass/configs/datasets/XLSum/XLSum_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .XLSum_gen_2bb71c import XLSum_datasets  # noqa: F401, F403
opencompass/configs/datasets/XLSum/XLSum_gen_2bb71c.py ADDED
@@ -0,0 +1,29 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import RougeEvaluator
+ from opencompass.datasets import XLSUMDataset, Xsum_postprocess
+
+ XLSum_reader_cfg = dict(input_columns=['text'], output_column='summary')
+
+ XLSum_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template='Document:{text}\n'
+         'Based on the previous text, provide a brief single summary:'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer))
+
+ XLSum_eval_cfg = dict(
+     evaluator=dict(type=RougeEvaluator),
+     pred_postprocessor=dict(type=Xsum_postprocess),
+ )
+
+ XLSum_datasets = [
+     dict(
+         type=XLSUMDataset,
+         path='csebuetnlp/xlsum',
+         reader_cfg=XLSum_reader_cfg,
+         infer_cfg=XLSum_infer_cfg,
+         eval_cfg=XLSum_eval_cfg)
+ ]
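
Fragments like `XLSum_gen.py` are composed through mmengine's pure-Python config mechanism: `read_base()` pulls `XLSum_datasets` into whichever top-level config imports the fragment. A minimal sketch of loading the fragment directly, assuming `opencompass` and `mmengine` are installed and the path below points at a checkout of this configs tree:

```python
from mmengine.config import Config

# Config.fromfile resolves the read_base() import inside XLSum_gen.py, so the
# loaded config exposes XLSum_datasets as an ordinary list of dataset dicts.
cfg = Config.fromfile('opencompass/configs/datasets/XLSum/XLSum_gen.py')
for ds in cfg['XLSum_datasets']:
    print(ds['path'])  # csebuetnlp/xlsum
```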
opencompass/configs/datasets/bbh/README.md ADDED
@@ -0,0 +1,250 @@
+ # BBH
+
+ ```bash
+ python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
+ python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
+ ```
+
+ ## Base Models
+
+ | model | bbh |
+ |:------------------------:|------:|
+ | llama-7b-turbomind | 33.34 |
+ | llama-13b-turbomind | 37.99 |
+ | llama-30b-turbomind | 49.86 |
+ | llama-65b-turbomind | 58.26 |
+ | llama-2-7b-turbomind | 38.27 |
+ | llama-2-13b-turbomind | 45.68 |
+ | llama-2-70b-turbomind | 64.78 |
+ | llama-3-8b-turbomind | 59.69 |
+ | llama-3-70b-turbomind | 79.16 |
+ | internlm2-1.8b-turbomind | 36.03 |
+ | internlm2-7b-turbomind | 63.56 |
+ | internlm2-20b-turbomind | 71.29 |
+ | qwen-1.8b-turbomind | 22.53 |
+ | qwen-7b-turbomind | 45.89 |
+ | qwen-14b-turbomind | 56.75 |
+ | qwen-72b-turbomind | 63.35 |
+ | qwen1.5-0.5b-hf | 20.54 |
+ | qwen1.5-1.8b-hf | 27.01 |
+ | qwen1.5-4b-hf | 34.81 |
+ | qwen1.5-7b-hf | 39.87 |
+ | qwen1.5-14b-hf | 50.38 |
+ | qwen1.5-32b-hf | 67.47 |
+ | qwen1.5-72b-hf | 58.81 |
+ | qwen1.5-moe-a2-7b-hf | 39.46 |
+ | mistral-7b-v0.1-hf | 56.71 |
+ | mistral-7b-v0.2-hf | 57.32 |
+ | mixtral-8x7b-v0.1-hf | 68.46 |
+ | mixtral-8x22b-v0.1-hf | 79.48 |
+ | yi-6b-hf | 44.82 |
+ | yi-34b-hf | 66.37 |
+ | deepseek-7b-base-hf | 42.88 |
+ | deepseek-67b-base-hf | 71.86 |
+
+ ### Details
+
+ | model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
+ |:------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
+ | llama-7b-turbomind | 23.60 | 46.00 | 44.80 | 36.40 | 30.14 | 0.00 | 46.07 | 21.60 | 15.20 |
+ | llama-13b-turbomind | 16.80 | 50.00 | 56.80 | 36.40 | 43.15 | 0.00 | 60.67 | 29.20 | 15.20 |
+ | llama-30b-turbomind | 33.60 | 60.00 | 76.40 | 29.20 | 57.53 | 0.00 | 59.55 | 62.40 | 17.20 |
+ | llama-65b-turbomind | 84.00 | 76.00 | 84.40 | 50.00 | 65.75 | 0.00 | 62.92 | 69.60 | 31.60 |
+ | llama-2-7b-turbomind | 12.00 | 46.80 | 60.00 | 34.00 | 32.19 | 0.00 | 49.44 | 32.80 | 18.40 |
+ | llama-2-13b-turbomind | 24.00 | 40.80 | 73.20 | 36.00 | 45.89 | 0.00 | 55.06 | 37.60 | 22.40 |
+ | llama-2-70b-turbomind | 75.60 | 66.80 | 88.80 | 73.60 | 69.86 | 0.00 | 73.60 | 60.80 | 57.60 |
+ | llama-3-8b-turbomind | 65.60 | 42.00 | 78.80 | 56.80 | 69.86 | 0.00 | 56.18 | 66.00 | 30.80 |
+ | llama-3-70b-turbomind | 100.00 | 82.80 | 91.60 | 100.00 | 86.30 | 0.00 | 81.46 | 77.20 | 94.40 |
+ | internlm2-1.8b-turbomind | 31.20 | 44.00 | 60.00 | 36.00 | 35.62 | 0.00 | 44.94 | 27.20 | 12.80 |
+ | internlm2-7b-turbomind | 94.80 | 75.60 | 86.40 | 53.60 | 69.18 | 0.00 | 59.55 | 68.00 | 46.00 |
+ | internlm2-20b-turbomind | 98.40 | 83.60 | 84.00 | 72.00 | 71.92 | 0.00 | 81.46 | 78.40 | 74.40 |
+ | qwen-1.8b-turbomind | 26.40 | 39.60 | 33.20 | 28.40 | 28.08 | 0.00 | 44.94 | 21.60 | 12.40 |
+ | qwen-7b-turbomind | 38.80 | 42.80 | 64.40 | 30.80 | 45.89 | 0.00 | 55.62 | 44.00 | 14.40 |
+ | qwen-14b-turbomind | 57.60 | 59.20 | 67.20 | 46.40 | 67.12 | 0.00 | 51.12 | 63.60 | 30.40 |
+ | qwen-72b-turbomind | 72.00 | 66.80 | 77.60 | 81.20 | 84.93 | 0.00 | 78.09 | 67.20 | 63.60 |
+ | qwen1.5-0.5b-hf | 15.20 | 37.20 | 20.40 | 30.40 | 18.49 | 8.40 | 44.94 | 11.20 | 14.00 |
+ | qwen1.5-1.8b-hf | 27.60 | 40.80 | 36.00 | 24.40 | 32.19 | 0.00 | 50.56 | 20.80 | 11.20 |
+ | qwen1.5-4b-hf | 10.40 | 44.40 | 47.20 | 36.80 | 44.52 | 24.80 | 46.63 | 20.80 | 14.80 |
+ | qwen1.5-7b-hf | 37.20 | 42.40 | 52.00 | 52.40 | 56.85 | 6.80 | 48.31 | 23.60 | 18.40 |
+ | qwen1.5-14b-hf | 38.80 | 62.80 | 73.60 | 24.80 | 69.86 | 26.80 | 66.29 | 52.80 | 2.00 |
+ | qwen1.5-32b-hf | 93.60 | 77.20 | 68.40 | 70.00 | 82.88 | 36.80 | 47.75 | 70.40 | 71.20 |
+ | qwen1.5-72b-hf | 75.60 | 66.00 | 78.80 | 72.80 | 80.82 | 0.00 | 75.84 | 64.80 | 44.40 |
+ | qwen1.5-moe-a2-7b-hf | 23.20 | 59.60 | 43.20 | 27.60 | 46.58 | 25.20 | 48.88 | 16.80 | 13.20 |
+ | mistral-7b-v0.1-hf | 73.60 | 53.60 | 76.40 | 45.20 | 56.85 | 28.00 | 64.04 | 66.00 | 21.60 |
+ | mistral-7b-v0.2-hf | 76.80 | 42.00 | 73.20 | 47.20 | 60.27 | 26.00 | 66.85 | 60.80 | 26.40 |
+ | mixtral-8x7b-v0.1-hf | 89.60 | 70.80 | 84.80 | 81.20 | 70.55 | 25.60 | 66.29 | 71.20 | 58.80 |
+ | mixtral-8x22b-v0.1-hf | 98.80 | 77.60 | 92.00 | 98.80 | 83.56 | 35.60 | 80.34 | 79.20 | 82.00 |
+ | yi-6b-hf | 32.80 | 46.40 | 64.40 | 34.40 | 47.26 | 28.80 | 60.11 | 45.60 | 14.00 |
+ | yi-34b-hf | 86.00 | 76.00 | 84.80 | 54.80 | 67.81 | 24.80 | 73.60 | 66.00 | 65.60 |
+ | deepseek-7b-base-hf | 27.60 | 42.00 | 64.40 | 31.20 | 40.41 | 33.60 | 52.25 | 46.00 | 13.20 |
+ | deepseek-67b-base-hf | 95.60 | 75.60 | 86.40 | 86.40 | 76.71 | 39.20 | 76.40 | 77.20 | 82.00 |
+
+ | model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
+ |:------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
+ | llama-7b-turbomind | 18.40 | 42.80 | 58.00 | 23.20 | 13.20 | 40.00 | 16.40 | 30.40 | 0.00 |
+ | llama-13b-turbomind | 16.00 | 48.80 | 53.60 | 30.40 | 16.40 | 61.60 | 11.20 | 44.80 | 0.80 |
+ | llama-30b-turbomind | 22.40 | 66.40 | 73.20 | 43.60 | 31.60 | 84.40 | 43.60 | 57.60 | 2.80 |
+ | llama-65b-turbomind | 41.60 | 79.20 | 74.40 | 48.40 | 39.20 | 91.20 | 40.40 | 67.20 | 20.00 |
+ | llama-2-7b-turbomind | 17.20 | 54.80 | 51.60 | 32.80 | 23.60 | 74.40 | 19.60 | 45.60 | 1.20 |
+ | llama-2-13b-turbomind | 23.20 | 63.60 | 52.40 | 46.00 | 42.00 | 68.00 | 21.60 | 62.00 | 2.00 |
+ | llama-2-70b-turbomind | 72.40 | 86.40 | 84.40 | 55.20 | 43.20 | 95.60 | 50.80 | 76.80 | 20.80 |
+ | llama-3-8b-turbomind | 40.80 | 76.40 | 93.20 | 45.20 | 36.80 | 88.80 | 53.60 | 72.80 | 30.80 |
+ | llama-3-70b-turbomind | 99.20 | 94.00 | 98.00 | 58.40 | 42.80 | 93.60 | 63.60 | 88.40 | 79.20 |
+ | internlm2-1.8b-turbomind | 16.80 | 47.60 | 63.60 | 21.60 | 12.00 | 69.20 | 16.80 | 45.20 | 5.60 |
+ | internlm2-7b-turbomind | 51.20 | 78.80 | 90.40 | 52.00 | 41.20 | 95.60 | 58.80 | 74.40 | 44.40 |
+ | internlm2-20b-turbomind | 81.20 | 95.60 | 83.60 | 62.40 | 48.00 | 94.80 | 57.60 | 75.60 | 72.80 |
+ | qwen-1.8b-turbomind | 14.80 | 35.60 | 51.20 | 22.40 | 15.20 | 31.20 | 12.40 | 22.00 | 3.20 |
+ | qwen-7b-turbomind | 20.80 | 54.80 | 76.00 | 37.60 | 27.60 | 74.80 | 41.20 | 57.60 | 23.60 |
+ | qwen-14b-turbomind | 35.60 | 81.20 | 78.40 | 45.20 | 40.80 | 80.00 | 44.80 | 70.40 | 65.60 |
+ | qwen-72b-turbomind | 66.40 | 89.20 | 90.40 | 60.00 | 50.80 | 81.60 | 56.40 | 88.00 | 70.40 |
+ | qwen1.5-0.5b-hf | 20.00 | 34.80 | 46.80 | 18.80 | 15.60 | 24.40 | 15.20 | 16.00 | 1.20 |
+ | qwen1.5-1.8b-hf | 18.00 | 32.80 | 66.00 | 18.80 | 11.20 | 24.80 | 13.60 | 27.60 | 4.80 |
+ | qwen1.5-4b-hf | 18.40 | 56.40 | 56.80 | 30.00 | 20.80 | 40.80 | 46.80 | 44.80 | 41.20 |
+ | qwen1.5-7b-hf | 32.40 | 58.40 | 67.20 | 36.00 | 28.00 | 62.80 | 49.20 | 60.40 | 48.00 |
+ | qwen1.5-14b-hf | 7.20 | 78.40 | 75.20 | 41.20 | 27.60 | 74.40 | 46.00 | 81.60 | 8.00 |
+ | qwen1.5-32b-hf | 71.60 | 88.40 | 97.60 | 58.80 | 46.40 | 68.00 | 51.60 | 88.40 | 66.80 |
+ | qwen1.5-72b-hf | 61.20 | 88.40 | 96.00 | 60.40 | 49.20 | 86.40 | 34.80 | 86.80 | 53.60 |
+ | qwen1.5-moe-a2-7b-hf | 22.80 | 49.20 | 68.00 | 28.40 | 22.40 | 58.40 | 40.80 | 42.00 | 33.60 |
+ | mistral-7b-v0.1-hf | 30.40 | 79.60 | 70.80 | 54.40 | 42.80 | 77.60 | 47.20 | 70.00 | 30.40 |
+ | mistral-7b-v0.2-hf | 32.80 | 74.00 | 77.60 | 48.00 | 40.40 | 84.00 | 49.20 | 76.00 | 35.20 |
+ | mixtral-8x7b-v0.1-hf | 66.80 | 86.00 | 94.80 | 50.40 | 40.40 | 86.40 | 53.20 | 82.80 | 60.80 |
+ | mixtral-8x22b-v0.1-hf | 87.60 | 95.20 | 99.60 | 70.00 | 54.00 | 95.20 | 58.40 | 95.20 | 82.00 |
+ | yi-6b-hf | 17.20 | 49.20 | 72.40 | 34.40 | 28.00 | 76.80 | 32.40 | 56.80 | 9.20 |
+ | yi-34b-hf | 67.20 | 85.60 | 79.60 | 49.20 | 39.60 | 86.80 | 56.00 | 81.20 | 33.20 |
+ | deepseek-7b-base-hf | 17.60 | 51.20 | 72.40 | 28.80 | 20.00 | 78.40 | 28.80 | 46.80 | 1.60 |
+ | deepseek-67b-base-hf | 82.40 | 90.00 | 78.80 | 60.40 | 44.80 | 88.80 | 56.80 | 86.40 | 38.00 |
+
+ | model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
+ |:------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
+ | llama-7b-turbomind | 45.20 | 1.60 | 8.40 | 81.60 | 66.00 | 47.20 | 46.00 | 40.64 | 57.20 |
+ | llama-13b-turbomind | 59.20 | 0.80 | 14.40 | 76.40 | 69.20 | 46.40 | 47.20 | 53.48 | 66.80 |
+ | llama-30b-turbomind | 64.80 | 2.40 | 17.20 | 93.60 | 78.40 | 71.20 | 43.20 | 55.61 | 98.40 |
+ | llama-65b-turbomind | 72.40 | 6.80 | 21.60 | 98.80 | 81.60 | 70.00 | 40.80 | 55.61 | 99.60 |
+ | llama-2-7b-turbomind | 54.40 | 1.20 | 10.80 | 88.80 | 68.40 | 49.20 | 48.40 | 52.41 | 53.20 |
+ | llama-2-13b-turbomind | 74.40 | 2.80 | 18.80 | 97.60 | 74.40 | 52.80 | 46.40 | 54.55 | 96.00 |
+ | llama-2-70b-turbomind | 82.40 | 13.60 | 30.40 | 98.40 | 81.60 | 83.20 | 43.60 | 63.64 | 100.00 |
+ | llama-3-8b-turbomind | 90.00 | 9.20 | 38.80 | 95.20 | 87.60 | 84.80 | 51.20 | 50.27 | 100.00 |
+ | llama-3-70b-turbomind | 96.80 | 48.40 | 48.80 | 99.60 | 92.40 | 99.60 | 62.40 | 58.29 | 100.00 |
+ | internlm2-1.8b-turbomind | 64.40 | 0.40 | 3.20 | 66.40 | 54.00 | 50.00 | 49.20 | 48.13 | 46.80 |
+ | internlm2-7b-turbomind | 78.80 | 2.40 | 35.20 | 95.60 | 85.60 | 75.60 | 48.00 | 63.10 | 92.00 |
+ | internlm2-20b-turbomind | 88.80 | 15.60 | 36.00 | 96.80 | 88.80 | 76.00 | 50.40 | 56.68 | 100.00 |
+ | qwen-1.8b-turbomind | 50.00 | 0.00 | 0.80 | 62.80 | 29.20 | 2.40 | 6.00 | 12.83 | 1.60 |
+ | qwen-7b-turbomind | 62.80 | 1.60 | 18.00 | 81.60 | 75.20 | 68.80 | 50.00 | 63.64 | 66.80 |
+ | qwen-14b-turbomind | 75.60 | 1.20 | 26.80 | 88.80 | 80.40 | 74.40 | 50.00 | 53.48 | 96.80 |
+ | qwen-72b-turbomind | 56.00 | 14.40 | 35.20 | 87.60 | 91.60 | 81.60 | 5.60 | 31.55 | 62.40 |
+ | qwen1.5-0.5b-hf | 25.60 | 0.00 | 0.40 | 41.60 | 51.60 | 16.80 | 4.40 | 1.07 | 20.00 |
+ | qwen1.5-1.8b-hf | 55.60 | 0.00 | 1.60 | 63.60 | 55.20 | 47.60 | 4.40 | 28.88 | 11.20 |
+ | qwen1.5-4b-hf | 61.60 | 0.40 | 8.80 | 0.80 | 76.00 | 54.40 | 0.80 | 28.34 | 62.40 |
+ | qwen1.5-7b-hf | 63.60 | 2.40 | 20.80 | 72.40 | 69.60 | 26.80 | 0.00 | 40.64 | 0.00 |
+ | qwen1.5-14b-hf | 82.40 | 1.20 | 27.60 | 78.40 | 87.20 | 48.00 | 54.00 | 24.06 | 100.00 |
+ | qwen1.5-32b-hf | 86.80 | 5.60 | 36.80 | 90.00 | 86.40 | 66.40 | 35.60 | 62.57 | 95.60 |
+ | qwen1.5-72b-hf | 48.40 | 13.20 | 34.40 | 87.60 | 8.00 | 67.60 | 13.60 | 39.57 | 99.60 |
+ | qwen1.5-moe-a2-7b-hf | 56.80 | 2.00 | 8.80 | 79.60 | 73.60 | 66.80 | 4.00 | 53.48 | 50.40 |
+ | mistral-7b-v0.1-hf | 73.60 | 4.00 | 26.40 | 97.20 | 82.00 | 67.60 | 43.20 | 48.66 | 100.00 |
+ | mistral-7b-v0.2-hf | 72.80 | 4.00 | 30.40 | 97.20 | 81.20 | 66.80 | 46.00 | 52.41 | 100.00 |
+ | mixtral-8x7b-v0.1-hf | 85.60 | 18.80 | 33.60 | 98.00 | 90.80 | 85.20 | 49.60 | 55.61 | 90.80 |
+ | mixtral-8x22b-v0.1-hf | 92.80 | 51.60 | 40.00 | 98.40 | 91.60 | 95.60 | 54.80 | 56.15 | 100.00 |
+ | yi-6b-hf | 66.40 | 1.20 | 16.00 | 92.80 | 59.60 | 53.20 | 53.20 | 52.41 | 65.20 |
+ | yi-34b-hf | 81.20 | 18.80 | 36.40 | 97.60 | 85.60 | 84.00 | 51.20 | 59.89 | 99.60 |
+ | deepseek-7b-base-hf | 59.20 | 3.20 | 6.40 | 92.00 | 73.20 | 49.60 | 50.80 | 52.41 | 74.80 |
+ | deepseek-67b-base-hf | 85.20 | 30.00 | 33.20 | 99.60 | 84.80 | 82.40 | 46.80 | 56.68 | 99.60 |
+
+ ## Chat Models
+
+ | model | bbh |
+ |:-----------------------------:|------:|
+ | qwen1.5-0.5b-chat-hf | 24.12 |
+ | qwen1.5-1.8b-chat-hf | 26.82 |
+ | qwen1.5-4b-chat-hf | 43.15 |
+ | qwen1.5-7b-chat-hf | 38.12 |
+ | qwen1.5-14b-chat-hf | 55.38 |
+ | qwen1.5-32b-chat-hf | 69.28 |
+ | qwen1.5-72b-chat-hf | 72.97 |
+ | qwen1.5-110b-chat-hf | 71.04 |
+ | internlm2-chat-1.8b-hf | 37.69 |
+ | internlm2-chat-1.8b-sft-hf | 37.12 |
+ | internlm2-chat-7b-hf | 57.83 |
+ | internlm2-chat-7b-sft-hf | 57.19 |
+ | internlm2-chat-20b-hf | 68.24 |
+ | internlm2-chat-20b-sft-hf | 69.38 |
+ | llama-3-8b-instruct-hf | 52.85 |
+ | llama-3-70b-instruct-hf | 82.42 |
+ | llama-3-8b-instruct-lmdeploy | 53.54 |
+ | llama-3-70b-instruct-lmdeploy | 82.58 |
+ | mistral-7b-instruct-v0.1-hf | 32.88 |
+ | mistral-7b-instruct-v0.2-hf | 48.84 |
+ | mixtral-8x7b-instruct-v0.1-hf | 59.64 |
+
+ ### Details
+
+ | model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
+ |:-----------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
+ | qwen1.5-0.5b-chat-hf | 25.60 | 42.00 | 20.00 | 31.20 | 15.07 | 14.40 | 46.07 | 24.80 | 13.20 |
+ | qwen1.5-1.8b-chat-hf | 28.80 | 36.00 | 30.40 | 35.20 | 19.18 | 7.60 | 46.63 | 24.00 | 9.60 |
+ | qwen1.5-4b-chat-hf | 8.00 | 56.00 | 64.80 | 28.40 | 48.63 | 19.60 | 60.67 | 34.00 | 14.40 |
+ | qwen1.5-7b-chat-hf | 39.60 | 37.60 | 62.40 | 36.80 | 60.96 | 30.80 | 54.49 | 38.00 | 20.00 |
+ | qwen1.5-14b-chat-hf | 61.60 | 63.60 | 70.00 | 54.00 | 74.66 | 33.60 | 67.42 | 61.20 | 35.60 |
+ | qwen1.5-32b-chat-hf | 94.40 | 77.60 | 78.00 | 66.00 | 93.84 | 46.00 | 82.58 | 73.60 | 61.60 |
+ | qwen1.5-72b-chat-hf | 70.40 | 72.40 | 84.40 | 67.20 | 89.73 | 52.00 | 79.21 | 86.40 | 68.80 |
+ | qwen1.5-110b-chat-hf | 74.80 | 71.20 | 82.80 | 74.80 | 89.04 | 48.00 | 90.45 | 87.60 | 73.60 |
+ | internlm2-chat-1.8b-hf | 35.60 | 52.40 | 48.80 | 29.60 | 39.73 | 24.40 | 51.69 | 27.20 | 13.20 |
+ | internlm2-chat-1.8b-sft-hf | 37.20 | 53.60 | 44.00 | 30.00 | 34.93 | 22.40 | 56.74 | 28.00 | 12.00 |
+ | internlm2-chat-7b-hf | 72.00 | 66.40 | 73.60 | 65.20 | 60.27 | 50.00 | 62.92 | 52.40 | 44.40 |
+ | internlm2-chat-7b-sft-hf | 67.20 | 66.80 | 58.00 | 63.20 | 48.63 | 45.60 | 64.04 | 59.60 | 42.80 |
+ | internlm2-chat-20b-hf | 80.40 | 76.00 | 77.60 | 88.80 | 78.08 | 36.40 | 71.91 | 71.60 | 77.20 |
+ | internlm2-chat-20b-sft-hf | 80.00 | 70.80 | 78.00 | 87.60 | 82.88 | 41.20 | 76.40 | 72.80 | 71.60 |
+ | llama-3-8b-instruct-hf | 70.40 | 42.80 | 28.40 | 81.20 | 13.01 | 49.20 | 44.94 | 73.20 | 42.40 |
+ | llama-3-70b-instruct-hf | 100.00 | 84.00 | 91.60 | 95.60 | 78.08 | 52.40 | 87.08 | 89.60 | 97.60 |
+ | llama-3-8b-instruct-lmdeploy | 73.20 | 45.60 | 34.00 | 79.60 | 31.51 | 48.40 | 47.75 | 76.80 | 47.60 |
+ | llama-3-70b-instruct-lmdeploy | 100.00 | 84.00 | 90.00 | 96.80 | 83.56 | 56.00 | 87.08 | 89.20 | 97.20 |
+ | mistral-7b-instruct-v0.1-hf | 32.00 | 22.40 | 52.40 | 35.20 | 30.82 | 23.20 | 38.76 | 46.00 | 18.40 |
+ | mistral-7b-instruct-v0.2-hf | 66.00 | 58.40 | 50.40 | 48.40 | 48.63 | 37.20 | 65.73 | 40.40 | 29.20 |
+ | mixtral-8x7b-instruct-v0.1-hf | 63.20 | 68.40 | 65.20 | 60.00 | 78.08 | 40.40 | 74.16 | 64.00 | 46.00 |
+
+ | model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
+ |:-----------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
+ | qwen1.5-0.5b-chat-hf | 20.40 | 34.40 | 51.60 | 21.20 | 13.20 | 26.00 | 20.80 | 17.20 | 1.20 |
+ | qwen1.5-1.8b-chat-hf | 18.00 | 34.80 | 48.40 | 21.20 | 16.40 | 34.80 | 24.00 | 28.80 | 4.40 |
+ | qwen1.5-4b-chat-hf | 19.20 | 56.80 | 65.20 | 36.40 | 35.60 | 51.60 | 40.40 | 55.20 | 29.20 |
+ | qwen1.5-7b-chat-hf | 31.60 | 58.80 | 53.20 | 35.60 | 27.20 | 56.00 | 44.80 | 62.00 | 50.00 |
+ | qwen1.5-14b-chat-hf | 43.20 | 75.20 | 52.80 | 52.40 | 50.80 | 76.40 | 48.80 | 83.60 | 65.20 |
+ | qwen1.5-32b-chat-hf | 68.40 | 84.00 | 81.20 | 57.20 | 46.00 | 78.80 | 54.40 | 86.00 | 86.00 |
+ | qwen1.5-72b-chat-hf | 76.80 | 94.40 | 85.20 | 62.80 | 54.00 | 78.40 | 63.60 | 86.40 | 82.80 |
+ | qwen1.5-110b-chat-hf | 79.20 | 91.60 | 88.80 | 61.20 | 50.00 | 82.40 | 59.60 | 88.80 | 78.00 |
+ | internlm2-chat-1.8b-hf | 20.00 | 48.40 | 56.00 | 24.40 | 26.80 | 65.20 | 18.00 | 39.60 | 7.60 |
+ | internlm2-chat-1.8b-sft-hf | 18.40 | 48.00 | 51.20 | 20.40 | 25.20 | 63.20 | 22.00 | 38.80 | 6.00 |
+ | internlm2-chat-7b-hf | 48.40 | 75.20 | 84.80 | 42.00 | 36.80 | 79.60 | 53.20 | 65.60 | 26.40 |
+ | internlm2-chat-7b-sft-hf | 44.00 | 72.40 | 85.60 | 41.60 | 37.20 | 82.40 | 55.60 | 52.80 | 32.00 |
+ | internlm2-chat-20b-hf | 88.00 | 88.80 | 88.80 | 52.80 | 50.40 | 85.20 | 56.80 | 79.60 | 40.00 |
+ | internlm2-chat-20b-sft-hf | 83.20 | 90.00 | 90.40 | 55.60 | 48.80 | 84.40 | 57.60 | 79.20 | 38.40 |
+ | llama-3-8b-instruct-hf | 49.60 | 85.60 | 76.00 | 54.00 | 29.20 | 57.60 | 46.00 | 44.80 | 52.00 |
+ | llama-3-70b-instruct-hf | 99.20 | 96.80 | 95.20 | 77.20 | 65.20 | 80.00 | 69.60 | 94.80 | 84.00 |
+ | llama-3-8b-instruct-lmdeploy | 57.20 | 78.00 | 75.60 | 36.00 | 13.20 | 59.20 | 53.60 | 54.80 | 52.80 |
+ | llama-3-70b-instruct-lmdeploy | 98.80 | 96.40 | 96.80 | 75.20 | 68.80 | 79.60 | 67.60 | 94.00 | 84.80 |
+ | mistral-7b-instruct-v0.1-hf | 26.00 | 46.00 | 60.00 | 38.00 | 24.00 | 59.20 | 1.20 | 6.00 | 12.40 |
+ | mistral-7b-instruct-v0.2-hf | 39.60 | 63.60 | 64.00 | 44.00 | 33.20 | 56.00 | 42.40 | 68.40 | 14.00 |
+ | mixtral-8x7b-instruct-v0.1-hf | 46.40 | 71.60 | 88.80 | 48.00 | 36.80 | 60.00 | 50.00 | 81.20 | 59.20 |
+
+ | model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
+ |:-----------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
+ | qwen1.5-0.5b-chat-hf | 45.60 | 0.00 | 1.20 | 17.20 | 50.40 | 16.40 | 11.60 | 42.78 | 27.60 |
+ | qwen1.5-1.8b-chat-hf | 58.40 | 0.00 | 2.00 | 34.00 | 44.80 | 30.40 | 11.60 | 24.60 | 50.00 |
+ | qwen1.5-4b-chat-hf | 64.00 | 3.20 | 6.80 | 80.40 | 77.60 | 48.80 | 41.20 | 55.61 | 63.20 |
+ | qwen1.5-7b-chat-hf | 54.40 | 0.40 | 8.00 | 55.60 | 47.60 | 31.20 | 0.00 | 2.14 | 30.00 |
+ | qwen1.5-14b-chat-hf | 74.40 | 6.40 | 26.40 | 72.40 | 76.40 | 61.60 | 0.80 | 25.67 | 81.20 |
+ | qwen1.5-32b-chat-hf | 90.00 | 10.40 | 28.40 | 82.40 | 92.80 | 76.80 | 32.40 | 41.71 | 100.00 |
+ | qwen1.5-72b-chat-hf | 81.20 | 18.40 | 37.60 | 95.20 | 92.80 | 76.00 | 50.40 | 63.64 | 100.00 |
+ | qwen1.5-110b-chat-hf | 91.60 | 18.00 | 39.60 | 82.80 | 80.80 | 75.20 | 22.40 | 35.83 | 100.00 |
+ | internlm2-chat-1.8b-hf | 63.20 | 0.00 | 6.00 | 58.00 | 56.80 | 48.80 | 54.80 | 52.94 | 48.40 |
+ | internlm2-chat-1.8b-sft-hf | 63.20 | 0.00 | 5.60 | 58.00 | 56.80 | 50.00 | 52.40 | 56.68 | 47.60 |
+ | internlm2-chat-7b-hf | 73.60 | 3.60 | 18.00 | 55.20 | 83.60 | 62.80 | 50.00 | 58.29 | 97.20 |
+ | internlm2-chat-7b-sft-hf | 71.60 | 4.40 | 20.00 | 82.00 | 84.00 | 60.00 | 51.60 | 52.94 | 98.00 |
+ | internlm2-chat-20b-hf | 82.40 | 8.00 | 36.00 | 55.60 | 84.40 | 78.00 | 50.40 | 59.36 | 100.00 |
+ | internlm2-chat-20b-sft-hf | 81.60 | 10.40 | 36.40 | 89.20 | 82.40 | 80.40 | 48.40 | 55.61 | 100.00 |
+ | llama-3-8b-instruct-hf | 82.80 | 8.80 | 37.20 | 94.40 | 78.80 | 89.60 | 45.20 | 24.06 | 25.60 |
+ | llama-3-70b-instruct-hf | 95.20 | 18.80 | 49.20 | 98.00 | 94.00 | 90.00 | 73.20 | 68.98 | 100.00 |
+ | llama-3-8b-instruct-lmdeploy | 83.60 | 10.00 | 40.40 | 96.00 | 77.20 | 89.20 | 43.60 | 37.43 | 3.20 |
+ | llama-3-70b-instruct-lmdeploy | 95.60 | 22.40 | 48.80 | 96.80 | 91.60 | 87.20 | 72.00 | 69.52 | 100.00 |
+ | mistral-7b-instruct-v0.1-hf | 70.80 | 0.80 | 5.20 | 68.80 | 69.60 | 51.60 | 3.20 | 12.30 | 33.60 |
+ | mistral-7b-instruct-v0.2-hf | 62.40 | 4.00 | 15.60 | 81.20 | 70.40 | 50.40 | 32.00 | 34.76 | 98.40 |
+ | mixtral-8x7b-instruct-v0.1-hf | 76.40 | 12.80 | 23.20 | 55.20 | 85.60 | 83.60 | 40.00 | 43.32 | 88.80 |
opencompass/configs/datasets/bbh/bbh_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .bbh_gen_5b92b0 import bbh_datasets  # noqa: F401, F403
opencompass/configs/datasets/bbh/bbh_gen_2879b0.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ from mmengine.config import read_base
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import BBHDataset, bbh_mcq_postprocess, BBHEvaluator, BBHEvaluator_mcq
+
+ with read_base():
+     from .bbh_subset_settings import settings
+
+ bbh_datasets = []
+ for name, test_type in settings:
+     # Each lib_prompt/<task>.txt file holds the task description followed by
+     # few-shot Q/A demonstrations; split it into chat-style rounds below.
+     with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{name}.txt'), 'r') as f:
+         hint = f.read()
+
+     task_prompt, body = hint.split('\n\nQ:', 1)
+     sections = ('Q:' + body).split('\n\n')
+     prompt_rounds = []
+     for index, section in enumerate(sections):
+         question, answer = section.split('\nA:')
+         answer = 'A:' + answer
+         if index == 0:
+             desc = task_prompt.strip() + '\n'
+         else:
+             desc = ''
+         prompt_rounds.append(dict(role='HUMAN', prompt=f'{desc}{question.strip()}'))
+         prompt_rounds.append(dict(role='BOT', prompt=answer.strip()))
+     prompt_rounds.append(dict(role='HUMAN', prompt='Q: {input}'))
+
+     bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+     bbh_infer_cfg = dict(
+         prompt_template=dict(type=PromptTemplate, template=dict(round=prompt_rounds)),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=512))
+
+     if test_type == 'mcq':
+         bbh_eval_cfg = dict(
+             evaluator=dict(type=BBHEvaluator_mcq),
+             pred_role='BOT',
+             pred_postprocessor=dict(type=bbh_mcq_postprocess),
+             dataset_postprocessor=dict(type=bbh_mcq_postprocess))
+     else:
+         bbh_eval_cfg = dict(
+             evaluator=dict(type=BBHEvaluator),
+             pred_role='BOT')
+
+     bbh_datasets.append(
+         dict(
+             type=BBHDataset,
+             path='opencompass/bbh',
+             name=name,
+             abbr='bbh-' + name,
+             reader_cfg=bbh_reader_cfg.copy(),
+             infer_cfg=bbh_infer_cfg.copy(),
+             eval_cfg=bbh_eval_cfg.copy()))
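
The loop above reconstructs chat-style few-shot rounds from the flat `lib_prompt/<task>.txt` files: everything before the first `\n\nQ:` becomes the task description, and each `Q:`/`A:` pair becomes a HUMAN/BOT exchange, ending with an unanswered `Q: {input}` slot. A toy walk-through of the same parsing; the hint text here is invented for illustration, real hints ship under `lib_prompt/`:

```python
# Invented two-shot hint in the lib_prompt format.
hint = ('Evaluate the result of a boolean expression.\n\n'
        'Q: not ( True ) is\nA: False.\n\n'
        'Q: True and not False is\nA: True.')

task_prompt, body = hint.split('\n\nQ:', 1)
sections = ('Q:' + body).split('\n\n')
rounds = []
for index, section in enumerate(sections):
    question, answer = section.split('\nA:')
    answer = 'A:' + answer
    # The task description is attached only to the first demonstration.
    desc = task_prompt.strip() + '\n' if index == 0 else ''
    rounds.append(dict(role='HUMAN', prompt=f'{desc}{question.strip()}'))
    rounds.append(dict(role='BOT', prompt=answer.strip()))
rounds.append(dict(role='HUMAN', prompt='Q: {input}'))
# rounds now alternates HUMAN/BOT demos and ends with the query placeholder.
```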
opencompass/configs/datasets/bbh/bbh_gen_4a31fa.py ADDED
@@ -0,0 +1,99 @@
+ import os
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
+
+ bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+ bbh_multiple_choice_sets = [
+     'temporal_sequences',
+     'disambiguation_qa',
+     'date_understanding',
+     'tracking_shuffled_objects_three_objects',
+     'penguins_in_a_table',
+     'geometric_shapes',
+     'snarks',
+     'ruin_names',
+     'tracking_shuffled_objects_seven_objects',
+     'tracking_shuffled_objects_five_objects',
+     'logical_deduction_three_objects',
+     'hyperbaton',
+     'logical_deduction_five_objects',
+     'logical_deduction_seven_objects',
+     'movie_recommendation',
+     'salient_translation_error_detection',
+     'reasoning_about_colored_objects',
+ ]
+ bbh_free_form_sets = [
+     'multistep_arithmetic_two',
+     'navigate',
+     'dyck_languages',
+     'word_sorting',
+     'sports_understanding',
+     'boolean_expressions',
+     'object_counting',
+     'formal_fallacies',
+     'causal_judgement',
+     'web_of_lies',
+ ]
+
+ bbh_datasets = []
+ for _name in bbh_multiple_choice_sets:
+     with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+         _hint = f.read()
+     bbh_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
+                 )
+             ])),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=512))
+     bbh_eval_cfg = dict(
+         evaluator=dict(type=BBHEvaluator_mcq),
+         pred_role='BOT',
+         pred_postprocessor=dict(type=bbh_mcq_postprocess),
+         dataset_postprocessor=dict(type=bbh_mcq_postprocess))
+
+     bbh_datasets.append(
+         dict(
+             type=BBHDataset,
+             path='opencompass/bbh',
+             name=_name,
+             abbr='bbh-' + _name,
+             reader_cfg=bbh_reader_cfg,
+             infer_cfg=bbh_infer_cfg.copy(),
+             eval_cfg=bbh_eval_cfg.copy()))
+
+ for _name in bbh_free_form_sets:
+     with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+         _hint = f.read()
+     bbh_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
+                 )
+             ])),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=512))
+     bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
+
+     bbh_datasets.append(
+         dict(
+             type=BBHDataset,
+             path='opencompass/bbh',
+             name=_name,
+             abbr='bbh-' + _name,
+             reader_cfg=bbh_reader_cfg,
+             infer_cfg=bbh_infer_cfg.copy(),
+             eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_5b92b0.py ADDED
@@ -0,0 +1,99 @@
+ import os
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
+
+ bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+ bbh_multiple_choice_sets = [
+     'temporal_sequences',
+     'disambiguation_qa',
+     'date_understanding',
+     'tracking_shuffled_objects_three_objects',
+     'penguins_in_a_table',
+     'geometric_shapes',
+     'snarks',
+     'ruin_names',
+     'tracking_shuffled_objects_seven_objects',
+     'tracking_shuffled_objects_five_objects',
+     'logical_deduction_three_objects',
+     'hyperbaton',
+     'logical_deduction_five_objects',
+     'logical_deduction_seven_objects',
+     'movie_recommendation',
+     'salient_translation_error_detection',
+     'reasoning_about_colored_objects',
+ ]
+ bbh_free_form_sets = [
+     'multistep_arithmetic_two',
+     'navigate',
+     'dyck_languages',
+     'word_sorting',
+     'sports_understanding',
+     'boolean_expressions',
+     'object_counting',
+     'formal_fallacies',
+     'causal_judgement',
+     'web_of_lies',
+ ]
+
+ bbh_datasets = []
+ for _name in bbh_multiple_choice_sets:
+     with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+         _hint = f.read()
+     bbh_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
+                 )
+             ])),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=512))
+     bbh_eval_cfg = dict(
+         evaluator=dict(type=BBHEvaluator_mcq),
+         pred_role='BOT',
+         pred_postprocessor=dict(type=bbh_mcq_postprocess),
+         dataset_postprocessor=dict(type=bbh_mcq_postprocess))
+
+     bbh_datasets.append(
+         dict(
+             type=BBHDataset,
+             path='opencompass/bbh',
+             name=_name,
+             abbr='bbh-' + _name,
+             reader_cfg=bbh_reader_cfg,
+             infer_cfg=bbh_infer_cfg.copy(),
+             eval_cfg=bbh_eval_cfg.copy()))
+
+ for _name in bbh_free_form_sets:
+     with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+         _hint = f.read()
+     bbh_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
+                 )
+             ])),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=512))
+     bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
+
+     bbh_datasets.append(
+         dict(
+             type=BBHDataset,
+             path='opencompass/bbh',
+             name=_name,
+             abbr='bbh-' + _name,
+             reader_cfg=bbh_reader_cfg,
+             infer_cfg=bbh_infer_cfg.copy(),
+             eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_5bf00b.py ADDED
@@ -0,0 +1,99 @@
+ import os
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
+
+ bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+ bbh_multiple_choice_sets = [
+     'temporal_sequences',
+     'disambiguation_qa',
+     'date_understanding',
+     'tracking_shuffled_objects_three_objects',
+     'penguins_in_a_table',
+     'geometric_shapes',
+     'snarks',
+     'ruin_names',
+     'tracking_shuffled_objects_seven_objects',
+     'tracking_shuffled_objects_five_objects',
+     'logical_deduction_three_objects',
+     'hyperbaton',
+     'logical_deduction_five_objects',
+     'logical_deduction_seven_objects',
+     'movie_recommendation',
+     'salient_translation_error_detection',
+     'reasoning_about_colored_objects',
+ ]
+ bbh_free_form_sets = [
+     'multistep_arithmetic_two',
+     'navigate',
+     'dyck_languages',
+     'word_sorting',
+     'sports_understanding',
+     'boolean_expressions',
+     'object_counting',
+     'formal_fallacies',
+     'causal_judgement',
+     'web_of_lies',
+ ]
+
+ bbh_datasets = []
+ for _name in bbh_multiple_choice_sets:
+     with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+         _hint = f.read()
+     bbh_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=f'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
+                 )
+             ])),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=512))
+     bbh_eval_cfg = dict(
+         evaluator=dict(type=BBHEvaluator_mcq),
+         pred_role='BOT',
+         pred_postprocessor=dict(type=bbh_mcq_postprocess),
+         dataset_postprocessor=dict(type=bbh_mcq_postprocess))
+
+     bbh_datasets.append(
+         dict(
+             type=BBHDataset,
+             path='opencompass/bbh',
+             name=_name,
+             abbr='bbh-' + _name,
+             reader_cfg=bbh_reader_cfg,
+             infer_cfg=bbh_infer_cfg.copy(),
+             eval_cfg=bbh_eval_cfg.copy()))
+
+ for _name in bbh_free_form_sets:
+     with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+         _hint = f.read()
+     bbh_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=f'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
+                 )
+             ])),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=512))
+     bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
+
+     bbh_datasets.append(
+         dict(
+             type=BBHDataset,
+             path='opencompass/bbh',
+             name=_name,
+             abbr='bbh-' + _name,
+             reader_cfg=bbh_reader_cfg,
+             infer_cfg=bbh_infer_cfg.copy(),
+             eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_98fba6.py ADDED
@@ -0,0 +1,90 @@
+import os
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq
+
+bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+bbh_multiple_choice_sets = [
+    'temporal_sequences',
+    'disambiguation_qa',
+    'date_understanding',
+    'tracking_shuffled_objects_three_objects',
+    'penguins_in_a_table',
+    'geometric_shapes',
+    'snarks',
+    'ruin_names',
+    'tracking_shuffled_objects_seven_objects',
+    'tracking_shuffled_objects_five_objects',
+    'logical_deduction_three_objects',
+    'hyperbaton',
+    'logical_deduction_five_objects',
+    'logical_deduction_seven_objects',
+    'movie_recommendation',
+    'salient_translation_error_detection',
+    'reasoning_about_colored_objects',
+]
+bbh_free_form_sets = [
+    'multistep_arithmetic_two',
+    'navigate',
+    'dyck_languages',
+    'word_sorting',
+    'sports_understanding',
+    'boolean_expressions',
+    'object_counting',
+    'formal_fallacies',
+    'causal_judgement',
+    'web_of_lies',
+]
+
+bbh_datasets = []
+for _name in bbh_multiple_choice_sets:
+    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+        _hint = f.read()
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
+    bbh_eval_cfg = dict(
+        evaluator=dict(type=BBHEvaluator_mcq),
+        pred_role='BOT',
+        pred_postprocessor=dict(type=bbh_mcq_postprocess),
+        dataset_postprocessor=dict(type=bbh_mcq_postprocess))
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path='opencompass/bbh',
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
+
+
+for _name in bbh_free_form_sets:
+    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
+        _hint = f.read()
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
+    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path='opencompass/bbh',
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
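Two details of this variant are easy to miss: the template is a single string rather than a chat `round`, and `stopping_criteria=['Q:']` cuts a base model's generation off before it drifts into producing the next few-shot question. The doubled braces keep `{input}` as a literal placeholder after the f-string is evaluated; a minimal, self-contained sketch (the hint text is illustrative):

```python
# a minimal sketch of the f-string escaping used in the template above:
# {{input}} survives as the literal placeholder '{input}' for later filling
_hint = 'Q: example question\nA: example answer'
template = f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
print(template.endswith("Q: {input}\nA: Let's think step by step."))  # True
```

To run just this file, the CLI convention shown in the GPQA README further down applies, e.g. `python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug` (the model alias is illustrative).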
opencompass/configs/datasets/bbh/bbh_subset_settings.py ADDED
@@ -0,0 +1,29 @@
+settings = [
+    ('temporal_sequences', 'mcq'),
+    ('disambiguation_qa', 'mcq'),
+    ('date_understanding', 'mcq'),
+    ('tracking_shuffled_objects_three_objects', 'mcq'),
+    ('penguins_in_a_table', 'mcq'),
+    ('geometric_shapes', 'mcq'),
+    ('snarks', 'mcq'),
+    ('ruin_names', 'mcq'),
+    ('tracking_shuffled_objects_seven_objects', 'mcq'),
+    ('tracking_shuffled_objects_five_objects', 'mcq'),
+    ('logical_deduction_three_objects', 'mcq'),
+    ('hyperbaton', 'mcq'),
+    ('logical_deduction_five_objects', 'mcq'),
+    ('logical_deduction_seven_objects', 'mcq'),
+    ('movie_recommendation', 'mcq'),
+    ('salient_translation_error_detection', 'mcq'),
+    ('reasoning_about_colored_objects', 'mcq'),
+    ('multistep_arithmetic_two', 'free_form'),
+    ('navigate', 'free_form'),
+    ('dyck_languages', 'free_form'),
+    ('word_sorting', 'free_form'),
+    ('sports_understanding', 'free_form'),
+    ('boolean_expressions', 'free_form'),
+    ('object_counting', 'free_form'),
+    ('formal_fallacies', 'free_form'),
+    ('causal_judgement', 'free_form'),
+    ('web_of_lies', 'free_form'),
+]
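`settings` here is plain data, `(task, answer-format)` pairs with no dataset objects attached, so downstream configs can branch on the second element. A minimal sketch of consuming it, assuming the file is importable from the current directory:

```python
# a minimal sketch: split the BBH subsets by answer format,
# as a consumer of this settings list would
from bbh_subset_settings import settings

mcq_tasks = [task for task, kind in settings if kind == 'mcq']
free_form_tasks = [task for task, kind in settings if kind == 'free_form']
print(len(mcq_tasks), len(free_form_tasks))  # 17 multiple-choice, 10 free-form
```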
opencompass/configs/datasets/cmmlu/cmmlu_0shot_cot_gen_305931.py ADDED
@@ -0,0 +1,130 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import CMMLUDataset
+from opencompass.utils.text_postprocessors import match_answer_pattern
+
+cmmlu_subject_mapping = {
+    'agronomy': '农学',
+    'anatomy': '解剖学',
+    'ancient_chinese': '古汉语',
+    'arts': '艺术学',
+    'astronomy': '天文学',
+    'business_ethics': '商业伦理',
+    'chinese_civil_service_exam': '中国公务员考试',
+    'chinese_driving_rule': '中国驾驶规则',
+    'chinese_food_culture': '中国饮食文化',
+    'chinese_foreign_policy': '中国外交政策',
+    'chinese_history': '中国历史',
+    'chinese_literature': '中国文学',
+    'chinese_teacher_qualification': '中国教师资格',
+    'clinical_knowledge': '临床知识',
+    'college_actuarial_science': '大学精算学',
+    'college_education': '大学教育学',
+    'college_engineering_hydrology': '大学工程水文学',
+    'college_law': '大学法律',
+    'college_mathematics': '大学数学',
+    'college_medical_statistics': '大学医学统计',
+    'college_medicine': '大学医学',
+    'computer_science': '计算机科学',
+    'computer_security': '计算机安全',
+    'conceptual_physics': '概念物理学',
+    'construction_project_management': '建设工程管理',
+    'economics': '经济学',
+    'education': '教育学',
+    'electrical_engineering': '电气工程',
+    'elementary_chinese': '小学语文',
+    'elementary_commonsense': '小学常识',
+    'elementary_information_and_technology': '小学信息技术',
+    'elementary_mathematics': '初等数学',
+    'ethnology': '民族学',
+    'food_science': '食品科学',
+    'genetics': '遗传学',
+    'global_facts': '全球事实',
+    'high_school_biology': '高中生物',
+    'high_school_chemistry': '高中化学',
+    'high_school_geography': '高中地理',
+    'high_school_mathematics': '高中数学',
+    'high_school_physics': '高中物理学',
+    'high_school_politics': '高中政治',
+    'human_sexuality': '人类性行为',
+    'international_law': '国际法学',
+    'journalism': '新闻学',
+    'jurisprudence': '法理学',
+    'legal_and_moral_basis': '法律与道德基础',
+    'logical': '逻辑学',
+    'machine_learning': '机器学习',
+    'management': '管理学',
+    'marketing': '市场营销',
+    'marxist_theory': '马克思主义理论',
+    'modern_chinese': '现代汉语',
+    'nutrition': '营养学',
+    'philosophy': '哲学',
+    'professional_accounting': '专业会计',
+    'professional_law': '专业法学',
+    'professional_medicine': '专业医学',
+    'professional_psychology': '专业心理学',
+    'public_relations': '公共关系',
+    'security_study': '安全研究',
+    'sociology': '社会学',
+    'sports_science': '体育学',
+    'traditional_chinese_medicine': '中医中药',
+    'virology': '病毒学',
+    'world_history': '世界历史',
+    'world_religions': '世界宗教'
+}
+
+QUERY_TEMPLATE = """
+你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一. 请在回答之前一步步思考.
+
+{question}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
+
+cmmlu_datasets = []
+for _name in cmmlu_all_sets:
+    _ch_name = cmmlu_subject_mapping[_name]
+    prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
+    cmmlu_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
+                ],
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+    cmmlu_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(
+            type=match_answer_pattern,
+            # answer_pattern=r'(?i)答案\s*:\s*([A-D])'
+            answer_pattern=r'(?i)答案\s*:\s*[\W]*([A-D])[\W]*',
+        )
+    )
+    cmmlu_datasets.append(
+        dict(
+            type=CMMLUDataset,
+            path='opencompass/cmmlu',
+            name=_name,
+            abbr=f'cmmlu-{_name}',
+            reader_cfg=dict(
+                input_columns=['question', 'A', 'B', 'C', 'D'],
+                output_column='answer',
+                train_split='dev',
+                test_split='test'),
+            infer_cfg=cmmlu_infer_cfg,
+            eval_cfg=cmmlu_eval_cfg,
+        ))
+
+del _name, _ch_name
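The zero-shot CoT config leaves answer extraction entirely to the `answer_pattern` regex, which tolerates whitespace and punctuation around the option letter. A minimal, self-contained sketch of what that regex matches (the sample completion is illustrative):

```python
# a minimal sketch of the extraction done via the answer_pattern above;
# the model completion below is illustrative
import re

answer_pattern = r'(?i)答案\s*:\s*[\W]*([A-D])[\W]*'
completion = '让我们一步步思考……所以正确选项是 C。\n答案: C'
m = re.search(answer_pattern, completion)
print(m.group(1))  # -> 'C'
```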
opencompass/configs/datasets/cmmlu/cmmlu_gen.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .cmmlu_gen_c13365 import cmmlu_datasets  # noqa: F401, F403
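This alias file only re-exports the default `gen` variant. A minimal sketch of pulling it into a user config via `read_base` (the import root is an assumption and depends on where your config file lives relative to these configs):

```python
# a minimal sketch of a user config; the import root is assumed
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.cmmlu.cmmlu_gen import cmmlu_datasets

datasets = [*cmmlu_datasets]  # one entry per CMMLU subject
```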
opencompass/configs/datasets/cmmlu/cmmlu_gen_c13365.py ADDED
@@ -0,0 +1,123 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import CMMLUDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+
+cmmlu_subject_mapping = {
+    'agronomy': '农学',
+    'anatomy': '解剖学',
+    'ancient_chinese': '古汉语',
+    'arts': '艺术学',
+    'astronomy': '天文学',
+    'business_ethics': '商业伦理',
+    'chinese_civil_service_exam': '中国公务员考试',
+    'chinese_driving_rule': '中国驾驶规则',
+    'chinese_food_culture': '中国饮食文化',
+    'chinese_foreign_policy': '中国外交政策',
+    'chinese_history': '中国历史',
+    'chinese_literature': '中国文学',
+    'chinese_teacher_qualification': '中国教师资格',
+    'clinical_knowledge': '临床知识',
+    'college_actuarial_science': '大学精算学',
+    'college_education': '大学教育学',
+    'college_engineering_hydrology': '大学工程水文学',
+    'college_law': '大学法律',
+    'college_mathematics': '大学数学',
+    'college_medical_statistics': '大学医学统计',
+    'college_medicine': '大学医学',
+    'computer_science': '计算机科学',
+    'computer_security': '计算机安全',
+    'conceptual_physics': '概念物理学',
+    'construction_project_management': '建设工程管理',
+    'economics': '经济学',
+    'education': '教育学',
+    'electrical_engineering': '电气工程',
+    'elementary_chinese': '小学语文',
+    'elementary_commonsense': '小学常识',
+    'elementary_information_and_technology': '小学信息技术',
+    'elementary_mathematics': '初等数学',
+    'ethnology': '民族学',
+    'food_science': '食品科学',
+    'genetics': '遗传学',
+    'global_facts': '全球事实',
+    'high_school_biology': '高中生物',
+    'high_school_chemistry': '高中化学',
+    'high_school_geography': '高中地理',
+    'high_school_mathematics': '高中数学',
+    'high_school_physics': '高中物理学',
+    'high_school_politics': '高中政治',
+    'human_sexuality': '人类性行为',
+    'international_law': '国际法学',
+    'journalism': '新闻学',
+    'jurisprudence': '法理学',
+    'legal_and_moral_basis': '法律与道德基础',
+    'logical': '逻辑学',
+    'machine_learning': '机器学习',
+    'management': '管理学',
+    'marketing': '市场营销',
+    'marxist_theory': '马克思主义理论',
+    'modern_chinese': '现代汉语',
+    'nutrition': '营养学',
+    'philosophy': '哲学',
+    'professional_accounting': '专业会计',
+    'professional_law': '专业法学',
+    'professional_medicine': '专业医学',
+    'professional_psychology': '专业心理学',
+    'public_relations': '公共关系',
+    'security_study': '安全研究',
+    'sociology': '社会学',
+    'sports_science': '体育学',
+    'traditional_chinese_medicine': '中医中药',
+    'virology': '病毒学',
+    'world_history': '世界历史',
+    'world_religions': '世界宗教'
+}
+
+
+cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
+
+cmmlu_datasets = []
+for _name in cmmlu_all_sets:
+    _ch_name = cmmlu_subject_mapping[_name]
+    cmmlu_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
+                    ),
+                    dict(role='BOT', prompt='答案是: {answer}'),
+                ]),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    cmmlu_eval_cfg = dict(
+        evaluator=dict(type=AccwithDetailsEvaluator),
+        pred_postprocessor=dict(type=first_capital_postprocess))
+
+    cmmlu_datasets.append(
+        dict(
+            type=CMMLUDataset,
+            path='opencompass/cmmlu',
+            name=_name,
+            abbr=f'cmmlu-{_name}',
+            reader_cfg=dict(
+                input_columns=['question', 'A', 'B', 'C', 'D'],
+                output_column='answer',
+                train_split='dev',
+                test_split='test'),
+            infer_cfg=cmmlu_infer_cfg,
+            eval_cfg=cmmlu_eval_cfg,
+        ))
+
+del _name, _ch_name
opencompass/configs/datasets/cmmlu/cmmlu_ppl.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .cmmlu_ppl_8b9c76 import cmmlu_datasets  # noqa: F401, F403
opencompass/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py ADDED
@@ -0,0 +1,117 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import CMMLUDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+cmmlu_subject_mapping = {
+    'agronomy': '农学',
+    'anatomy': '解剖学',
+    'ancient_chinese': '古汉语',
+    'arts': '艺术学',
+    'astronomy': '天文学',
+    'business_ethics': '商业伦理',
+    'chinese_civil_service_exam': '中国公务员考试',
+    'chinese_driving_rule': '中国驾驶规则',
+    'chinese_food_culture': '中国饮食文化',
+    'chinese_foreign_policy': '中国外交政策',
+    'chinese_history': '中国历史',
+    'chinese_literature': '中国文学',
+    'chinese_teacher_qualification': '中国教师资格',
+    'clinical_knowledge': '临床知识',
+    'college_actuarial_science': '大学精算学',
+    'college_education': '大学教育学',
+    'college_engineering_hydrology': '大学工程水文学',
+    'college_law': '大学法律',
+    'college_mathematics': '大学数学',
+    'college_medical_statistics': '大学医学统计',
+    'college_medicine': '大学医学',
+    'computer_science': '计算机科学',
+    'computer_security': '计算机安全',
+    'conceptual_physics': '概念物理学',
+    'construction_project_management': '建设工程管理',
+    'economics': '经济学',
+    'education': '教育学',
+    'electrical_engineering': '电气工程',
+    'elementary_chinese': '小学语文',
+    'elementary_commonsense': '小学常识',
+    'elementary_information_and_technology': '小学信息技术',
+    'elementary_mathematics': '初等数学',
+    'ethnology': '民族学',
+    'food_science': '食品科学',
+    'genetics': '遗传学',
+    'global_facts': '全球事实',
+    'high_school_biology': '高中生物',
+    'high_school_chemistry': '高中化学',
+    'high_school_geography': '高中地理',
+    'high_school_mathematics': '高中数学',
+    'high_school_physics': '高中物理学',
+    'high_school_politics': '高中政治',
+    'human_sexuality': '人类性行为',
+    'international_law': '国际法学',
+    'journalism': '新闻学',
+    'jurisprudence': '法理学',
+    'legal_and_moral_basis': '法律与道德基础',
+    'logical': '逻辑学',
+    'machine_learning': '机器学习',
+    'management': '管理学',
+    'marketing': '市场营销',
+    'marxist_theory': '马克思主义理论',
+    'modern_chinese': '现代汉语',
+    'nutrition': '营养学',
+    'philosophy': '哲学',
+    'professional_accounting': '专业会计',
+    'professional_law': '专业法学',
+    'professional_medicine': '专业医学',
+    'professional_psychology': '专业心理学',
+    'public_relations': '公共关系',
+    'security_study': '安全研究',
+    'sociology': '社会学',
+    'sports_science': '体育学',
+    'traditional_chinese_medicine': '中医中药',
+    'virology': '病毒学',
+    'world_history': '世界历史',
+    'world_religions': '世界宗教'
+}
+
+
+cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
+
+cmmlu_datasets = []
+for _name in cmmlu_all_sets:
+    _ch_name = cmmlu_subject_mapping[_name]
+    hint = f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。'
+    question_and_options = '题目:{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
+    cmmlu_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template={answer: f'{question_and_options}\n答案是: {answer}\n' for answer in ['A', 'B', 'C', 'D']},
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template={answer: f'{hint}\n</E>{question_and_options}\n答案是: {answer}' for answer in ['A', 'B', 'C', 'D']},
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=PPLInferencer),
+    )
+
+    cmmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
+
+    cmmlu_datasets.append(
+        dict(
+            type=CMMLUDataset,
+            path='opencompass/cmmlu',
+            name=_name,
+            abbr=f'cmmlu-{_name}',
+            reader_cfg=dict(
+                input_columns=['question', 'A', 'B', 'C', 'D'],
+                output_column='answer',
+                train_split='dev',
+                test_split='test'),
+            infer_cfg=cmmlu_infer_cfg,
+            eval_cfg=cmmlu_eval_cfg,
+        ))
+
+del _name, _ch_name
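Unlike the gen variants, this PPL config never asks the model to generate: for each question it renders four complete strings, one per option, and `PPLInferencer` picks the option whose rendered sequence gets the lowest perplexity. A minimal sketch of the candidate strings the dict comprehension above produces (the subject hint shown is one of the mapped subjects):

```python
# a minimal sketch of the per-option PPL templates built above;
# '</E>' is later replaced by the 5 fixed in-context examples
hint = '以下是关于天文学的单项选择题,请直接给出正确答案的选项。'
question_and_options = '题目:{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
templates = {
    answer: f'{hint}\n</E>{question_and_options}\n答案是: {answer}'
    for answer in ['A', 'B', 'C', 'D']
}
print(templates['B'])  # same question text, but the sequence ends with '答案是: B'
```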
opencompass/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py ADDED
@@ -0,0 +1,122 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import CMMLUDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+cmmlu_subject_mapping = {
+    'agronomy': '农学',
+    'anatomy': '解剖学',
+    'ancient_chinese': '古汉语',
+    'arts': '艺术学',
+    'astronomy': '天文学',
+    'business_ethics': '商业伦理',
+    'chinese_civil_service_exam': '中国公务员考试',
+    'chinese_driving_rule': '中国驾驶规则',
+    'chinese_food_culture': '中国饮食文化',
+    'chinese_foreign_policy': '中国外交政策',
+    'chinese_history': '中国历史',
+    'chinese_literature': '中国文学',
+    'chinese_teacher_qualification': '中国教师资格',
+    'clinical_knowledge': '临床知识',
+    'college_actuarial_science': '大学精算学',
+    'college_education': '大学教育学',
+    'college_engineering_hydrology': '大学工程水文学',
+    'college_law': '大学法律',
+    'college_mathematics': '大学数学',
+    'college_medical_statistics': '大学医学统计',
+    'college_medicine': '大学医学',
+    'computer_science': '计算机科学',
+    'computer_security': '计算机安全',
+    'conceptual_physics': '概念物理学',
+    'construction_project_management': '建设工程管理',
+    'economics': '经济学',
+    'education': '教育学',
+    'electrical_engineering': '电气工程',
+    'elementary_chinese': '小学语文',
+    'elementary_commonsense': '小学常识',
+    'elementary_information_and_technology': '小学信息技术',
+    'elementary_mathematics': '初等数学',
+    'ethnology': '民族学',
+    'food_science': '食品科学',
+    'genetics': '遗传学',
+    'global_facts': '全球事实',
+    'high_school_biology': '高中生物',
+    'high_school_chemistry': '高中化学',
+    'high_school_geography': '高中地理',
+    'high_school_mathematics': '高中数学',
+    'high_school_physics': '高中物理学',
+    'high_school_politics': '高中政治',
+    'human_sexuality': '人类性行为',
+    'international_law': '国际法学',
+    'journalism': '新闻学',
+    'jurisprudence': '法理学',
+    'legal_and_moral_basis': '法律与道德基础',
+    'logical': '逻辑学',
+    'machine_learning': '机器学习',
+    'management': '管理学',
+    'marketing': '市场营销',
+    'marxist_theory': '马克思主义理论',
+    'modern_chinese': '现代汉语',
+    'nutrition': '营养学',
+    'philosophy': '哲学',
+    'professional_accounting': '专业会计',
+    'professional_law': '专业法学',
+    'professional_medicine': '专业医学',
+    'professional_psychology': '专业心理学',
+    'public_relations': '公共关系',
+    'security_study': '安全研究',
+    'sociology': '社会学',
+    'sports_science': '体育学',
+    'traditional_chinese_medicine': '中医中药',
+    'virology': '病毒学',
+    'world_history': '世界历史',
+    'world_religions': '世界宗教'
+}
+
+
+cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
+
+cmmlu_datasets = []
+for _name in cmmlu_all_sets:
+    _ch_name = cmmlu_subject_mapping[_name]
+    cmmlu_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template={
+                answer: dict(
+                    begin='</E>',
+                    round=[
+                        dict(
+                            role='HUMAN',
+                            prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
+                        ),
+                        dict(role='BOT', prompt=f'答案是: {answer}'),
+                    ])
+                for answer in ['A', 'B', 'C', 'D']
+            },
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=PPLInferencer),
+    )
+
+    cmmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+    cmmlu_datasets.append(
+        dict(
+            type=CMMLUDataset,
+            path='opencompass/cmmlu',
+            name=_name,
+            abbr=f'cmmlu-{_name}',
+            reader_cfg=dict(
+                input_columns=['question', 'A', 'B', 'C', 'D'],
+                output_column='answer',
+                train_split='dev',
+                test_split='test'),
+            infer_cfg=cmmlu_infer_cfg,
+            eval_cfg=cmmlu_eval_cfg,
+        ))
+
+del _name, _ch_name
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .commonsenseqacn_gen_d380d0 import commonsenseqacn_datasets  # noqa: F401, F403
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen_d380d0.py ADDED
@@ -0,0 +1,50 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import CommonsenseQADataset_CN
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+commonsenseqacn_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
+    output_column='answerKey',
+    test_split='validation',
+)
+
+_ice_template = dict(
+    type=PromptTemplate,
+    template=dict(
+        begin='</E>',
+        round=[
+            dict(
+                role='HUMAN',
+                prompt='{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\n答案:',
+            ),
+            dict(role='BOT', prompt='{answerKey}'),
+        ],
+    ),
+    ice_token='</E>',
+)
+
+
+commonsenseqacn_infer_cfg = dict(
+    prompt_template=_ice_template,
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+commonsenseqacn_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_postprocessor=dict(type=first_capital_postprocess),
+)
+
+commonsenseqacn_datasets = [
+    dict(
+        abbr='commonsenseqa_cn',
+        type=CommonsenseQADataset_CN,
+        path='./data/commonsenseqa_cn/validation.jsonl',
+        reader_cfg=commonsenseqacn_reader_cfg,
+        infer_cfg=commonsenseqacn_infer_cfg,
+        eval_cfg=commonsenseqacn_eval_cfg,
+    )
+]
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .commonsenseqacn_ppl_971f48 import commonsenseqacn_datasets  # noqa: F401, F403
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl_971f48.py ADDED
@@ -0,0 +1,52 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import CommonsenseQADataset_CN
+
+commonsenseqacn_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
+    output_column='answerKey',
+    test_split='validation',
+)
+
+_ice_template = dict(
+    type=PromptTemplate,
+    template={
+        ans: dict(
+            begin='</E>',
+            round=[
+                dict(role='HUMAN', prompt='问题: {question}\n答案: '),
+                dict(role='BOT', prompt=ans_token),
+            ],
+        )
+        for ans, ans_token in [
+            ['A', '{A}'],
+            ['B', '{B}'],
+            ['C', '{C}'],
+            ['D', '{D}'],
+            ['E', '{E}'],
+        ]
+    },
+    ice_token='</E>',
+)
+
+
+commonsenseqacn_infer_cfg = dict(
+    prompt_template=_ice_template,
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer),
+)
+
+commonsenseqacn_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+commonsenseqacn_datasets = [
+    dict(
+        abbr='commonsenseqa_cn',
+        type=CommonsenseQADataset_CN,
+        path='./data/commonsenseqa_cn/validation.jsonl',
+        reader_cfg=commonsenseqacn_reader_cfg,
+        infer_cfg=commonsenseqacn_infer_cfg,
+        eval_cfg=commonsenseqacn_eval_cfg,
+    )
+]
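Here the scored continuation is the option's text (`{A}` through `{E}`), not a letter, so perplexity reflects how naturally the full answer reads after the question. A minimal sketch of the five candidate dialogues the comprehension above builds:

```python
# a minimal sketch of the candidate mapping used above:
# each key scores the corresponding option text as the BOT turn
candidates = {
    ans: dict(round=[
        dict(role='HUMAN', prompt='问题: {question}\n答案: '),
        dict(role='BOT', prompt=ans_token),
    ])
    for ans, ans_token in [['A', '{A}'], ['B', '{B}'],
                           ['C', '{C}'], ['D', '{D}'], ['E', '{E}']]
}
print(candidates['C']['round'][1]['prompt'])  # -> '{C}'
```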
opencompass/configs/datasets/demo/demo_cmmlu_base_ppl.py ADDED
@@ -0,0 +1,8 @@
+from mmengine import read_base
+
+with read_base():
+    from ..cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets
+
+for d in cmmlu_datasets:
+    d['abbr'] = 'demo_' + d['abbr']
+    d['reader_cfg']['test_range'] = '[0:4]'
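The demo configs shrink a full benchmark to a smoke test: `test_range` is a slice string applied to each subset's test split, so each subject contributes only its first 4 items. A minimal sketch of what the loop above produces for one entry (the subject name is illustrative):

```python
# a minimal sketch of what the demo loop produces for one dataset entry;
# the subject name is illustrative
d = dict(abbr='cmmlu-astronomy', reader_cfg=dict(test_range=None))
d['abbr'] = 'demo_' + d['abbr']          # keep demo results under a separate name
d['reader_cfg']['test_range'] = '[0:4]'  # evaluate only the first 4 test items
print(d)  # {'abbr': 'demo_cmmlu-astronomy', 'reader_cfg': {'test_range': '[0:4]'}}
```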
opencompass/configs/datasets/demo/demo_cmmlu_chat_gen.py ADDED
@@ -0,0 +1,8 @@
+from mmengine import read_base
+
+with read_base():
+    from ..cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
+
+for d in cmmlu_datasets:
+    d['abbr'] = 'demo_' + d['abbr']
+    d['reader_cfg']['test_range'] = '[0:4]'
opencompass/configs/datasets/demo/demo_gsm8k_base_gen.py ADDED
@@ -0,0 +1,7 @@
+from mmengine import read_base
+
+with read_base():
+    from ..gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets
+
+gsm8k_datasets[0]['abbr'] = 'demo_' + gsm8k_datasets[0]['abbr']
+gsm8k_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
opencompass/configs/datasets/demo/demo_gsm8k_chat_gen.py ADDED
@@ -0,0 +1,7 @@
+from mmengine import read_base
+
+with read_base():
+    from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+
+gsm8k_datasets[0]['abbr'] = 'demo_' + gsm8k_datasets[0]['abbr']
+gsm8k_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
opencompass/configs/datasets/demo/demo_math_base_gen.py ADDED
@@ -0,0 +1,7 @@
+from mmengine import read_base
+
+with read_base():
+    from ..math.math_4shot_base_gen_db136b import math_datasets
+
+math_datasets[0]['abbr'] = 'demo_' + math_datasets[0]['abbr']
+math_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
opencompass/configs/datasets/demo/demo_math_chat_gen.py ADDED
@@ -0,0 +1,7 @@
+from mmengine import read_base
+
+with read_base():
+    from ..math.math_0shot_gen_393424 import math_datasets
+
+math_datasets[0]['abbr'] = 'demo_' + math_datasets[0]['abbr']
+math_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
opencompass/configs/datasets/gpqa/README.md ADDED
@@ -0,0 +1,69 @@
+# GPQA
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug
+```
+
+## Base Models
+
+| model                    |   GPQA_diamond |
+|:------------------------:|---------------:|
+| llama-7b-turbomind       |          24.24 |
+| llama-13b-turbomind      |          25.25 |
+| llama-30b-turbomind      |          22.73 |
+| llama-65b-turbomind      |          21.72 |
+| llama-2-7b-turbomind     |          25.25 |
+| llama-2-13b-turbomind    |          23.74 |
+| llama-2-70b-turbomind    |          28.28 |
+| llama-3-8b-turbomind     |          31.82 |
+| llama-3-70b-turbomind    |          40.91 |
+| internlm2-1.8b-turbomind |          24.24 |
+| internlm2-7b-turbomind   |          28.28 |
+| internlm2-20b-turbomind  |          31.31 |
+| qwen-1.8b-turbomind      |          28.79 |
+| qwen-7b-turbomind        |          24.75 |
+| qwen-14b-turbomind       |          27.78 |
+| qwen-72b-turbomind       |          31.31 |
+| qwen1.5-0.5b-hf          |          23.74 |
+| qwen1.5-1.8b-hf          |          28.79 |
+| qwen1.5-4b-hf            |          23.23 |
+| qwen1.5-7b-hf            |          20.71 |
+| qwen1.5-14b-hf           |          32.32 |
+| qwen1.5-32b-hf           |          30.81 |
+| qwen1.5-72b-hf           |          31.82 |
+| qwen1.5-moe-a2-7b-hf     |          28.79 |
+| mistral-7b-v0.1-hf       |          24.75 |
+| mistral-7b-v0.2-hf       |          23.74 |
+| mixtral-8x7b-v0.1-hf     |          28.79 |
+| mixtral-8x22b-v0.1-hf    |          36.36 |
+| yi-6b-hf                 |          28.28 |
+| yi-34b-hf                |          35.86 |
+| deepseek-7b-base-hf      |          20.71 |
+| deepseek-67b-base-hf     |          25.25 |
+
+## Chat Models
+
+| model                         |   GPQA_diamond |
+|:-----------------------------:|---------------:|
+| qwen1.5-0.5b-chat-hf          |          19.70 |
+| qwen1.5-1.8b-chat-hf          |          29.80 |
+| qwen1.5-4b-chat-hf            |          25.25 |
+| qwen1.5-7b-chat-hf            |          31.82 |
+| qwen1.5-14b-chat-hf           |          30.30 |
+| qwen1.5-32b-chat-hf           |          31.31 |
+| qwen1.5-72b-chat-hf           |          32.83 |
+| qwen1.5-110b-chat-hf          |          35.86 |
+| internlm2-chat-1.8b-hf        |          25.76 |
+| internlm2-chat-1.8b-sft-hf    |          26.26 |
+| internlm2-chat-7b-hf          |          28.28 |
+| internlm2-chat-7b-sft-hf      |          27.27 |
+| internlm2-chat-20b-hf         |          30.30 |
+| internlm2-chat-20b-sft-hf     |          29.29 |
+| llama-3-8b-instruct-hf        |          25.76 |
+| llama-3-70b-instruct-hf       |          37.88 |
+| llama-3-8b-instruct-lmdeploy  |          25.76 |
+| llama-3-70b-instruct-lmdeploy |          37.88 |
+| mistral-7b-instruct-v0.1-hf   |          30.30 |
+| mistral-7b-instruct-v0.2-hf   |          25.25 |
+| mixtral-8x7b-instruct-v0.1-hf |          30.30 |
opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_4b5a83.py ADDED
@@ -0,0 +1,49 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import GPQADataset, GPQAEvaluator
+from opencompass.utils import first_option_postprocess
+
+gpqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D'],
+    output_column='answer')
+
+hint = f'For the multiple choice question below, please provide the correct answer option directly.'
+question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
+gpqa_infer_cfg = dict(
+    ice_template=dict(
+        type=PromptTemplate,
+        template={
+            opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
+    ),
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
+        },
+        ice_token='</E>'
+    ),
+    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+    inferencer=dict(type=PPLInferencer))
+
+gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
+
+gpqa_datasets = []
+gpqa_subsets = {
+    # 'extended': 'gpqa_extended.csv',
+    # 'main': 'gpqa_main.csv',
+    'diamond': 'gpqa_diamond.csv'
+}
+
+for split in list(gpqa_subsets.keys()):
+    gpqa_datasets.append(
+        dict(
+            abbr='GPQA_' + split,
+            type=GPQADataset,
+            path='./data/gpqa/',
+            name=gpqa_subsets[split],
+            reader_cfg=gpqa_reader_cfg,
+            infer_cfg=gpqa_infer_cfg,
+            eval_cfg=gpqa_eval_cfg)
+    )
opencompass/configs/datasets/gpqa/gpqa_gen.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
opencompass/configs/datasets/gpqa/gpqa_gen_015262.py ADDED
@@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GPQADataset, GPQAEvaluator
+from opencompass.utils import first_option_postprocess
+
+gpqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D'],
+    output_column='answer')
+
+gpqa_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
+                     '(A){A}\n'
+                     '(B){B}\n'
+                     '(C){C}\n'
+                     '(D){D}\n'
+                     'Format your response as follows: "The correct answer is (insert answer here)"'),
+            ], )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
+                     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+gpqa_datasets = []
+gpqa_subsets = {
+    'extended': 'gpqa_extended.csv',
+    'main': 'gpqa_main.csv',
+    'diamond': 'gpqa_diamond.csv'
+}
+
+for split in list(gpqa_subsets.keys()):
+    gpqa_datasets.append(
+        dict(
+            abbr='GPQA_' + split,
+            type=GPQADataset,
+            path='./data/gpqa/',
+            name=gpqa_subsets[split],
+            reader_cfg=gpqa_reader_cfg,
+            infer_cfg=gpqa_infer_cfg,
+            eval_cfg=gpqa_eval_cfg)
+    )
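The gen configs above depend on `first_option_postprocess` to pull the first A-D option out of the model's free-form reply, e.g. the requested "The correct answer is (B)" format. A minimal sketch of that extraction (same import and `options` argument as in the eval config):

```python
# a minimal sketch of the option extraction this eval_cfg relies on;
# the sample reply text is illustrative
from opencompass.utils import first_option_postprocess

reply = 'The correct answer is (B), because the reaction is second order.'
print(first_option_postprocess(reply, options='ABCD'))  # -> 'B'
```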
opencompass/configs/datasets/gpqa/gpqa_gen_4baadb.py ADDED
@@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GPQADataset, GPQAEvaluator
+from opencompass.utils import first_option_postprocess
+
+gpqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D'],
+    output_column='answer')
+
+gpqa_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
+                     '(A){A}\n'
+                     '(B){B}\n'
+                     '(C){C}\n'
+                     '(D){D}\n'
+                     'Format your response as follows: "The correct answer is (insert answer here)"'),
+            ], )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
+                     pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+gpqa_datasets = []
+gpqa_subsets = {
+    # 'extended': 'gpqa_extended.csv',
+    # 'main': 'gpqa_main.csv',
+    'diamond': 'gpqa_diamond.csv'
+}
+
+for split in list(gpqa_subsets.keys()):
+    gpqa_datasets.append(
+        dict(
+            abbr='GPQA_' + split,
+            type=GPQADataset,
+            path='./data/gpqa/',
+            name=gpqa_subsets[split],
+            reader_cfg=gpqa_reader_cfg,
+            infer_cfg=gpqa_infer_cfg,
+            eval_cfg=gpqa_eval_cfg)
+    )