Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py +4 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py +53 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl.py +4 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_4b16c0.py +65 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_9ef540.py +43 -0
- opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_e53034.py +59 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py +4 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py +52 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py +4 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py +60 -0
- opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py +44 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py +4 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py +304 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py +4 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py +356 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py +45 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py +44 -0
- opencompass/configs/datasets/GaokaoBench/GaokaoBench_prompts.py +191 -0
- opencompass/configs/datasets/GaokaoBench/README.md +191 -0
- opencompass/configs/datasets/XLSum/XLSum_gen.py +4 -0
- opencompass/configs/datasets/XLSum/XLSum_gen_2bb71c.py +29 -0
- opencompass/configs/datasets/bbh/README.md +250 -0
- opencompass/configs/datasets/bbh/bbh_gen.py +4 -0
- opencompass/configs/datasets/bbh/bbh_gen_2879b0.py +56 -0
- opencompass/configs/datasets/bbh/bbh_gen_4a31fa.py +99 -0
- opencompass/configs/datasets/bbh/bbh_gen_5b92b0.py +99 -0
- opencompass/configs/datasets/bbh/bbh_gen_5bf00b.py +99 -0
- opencompass/configs/datasets/bbh/bbh_gen_98fba6.py +90 -0
- opencompass/configs/datasets/bbh/bbh_subset_settings.py +29 -0
- opencompass/configs/datasets/cmmlu/cmmlu_0shot_cot_gen_305931.py +130 -0
- opencompass/configs/datasets/cmmlu/cmmlu_gen.py +4 -0
- opencompass/configs/datasets/cmmlu/cmmlu_gen_c13365.py +123 -0
- opencompass/configs/datasets/cmmlu/cmmlu_ppl.py +4 -0
- opencompass/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py +117 -0
- opencompass/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py +122 -0
- opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py +4 -0
- opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen_d380d0.py +50 -0
- opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl.py +4 -0
- opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl_971f48.py +52 -0
- opencompass/configs/datasets/demo/demo_cmmlu_base_ppl.py +8 -0
- opencompass/configs/datasets/demo/demo_cmmlu_chat_gen.py +8 -0
- opencompass/configs/datasets/demo/demo_gsm8k_base_gen.py +7 -0
- opencompass/configs/datasets/demo/demo_gsm8k_chat_gen.py +7 -0
- opencompass/configs/datasets/demo/demo_math_base_gen.py +7 -0
- opencompass/configs/datasets/demo/demo_math_chat_gen.py +7 -0
- opencompass/configs/datasets/gpqa/README.md +69 -0
- opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_4b5a83.py +49 -0
- opencompass/configs/datasets/gpqa/gpqa_gen.py +4 -0
- opencompass/configs/datasets/gpqa/gpqa_gen_015262.py +46 -0
- opencompass/configs/datasets/gpqa/gpqa_gen_4baadb.py +46 -0
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default generative ("gen") entry point for FewCLUE bustm: re-exports the
# dataset list from the pinned prompt-version file so callers can import a
# stable name without tracking prompt hashes.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_bustm_gen_634f41 import bustm_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AFQMCDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

# Reader: both sentences feed the prompt; gold answer comes from `label`.
# NOTE(review): test_split='train' — presumably each JSON file loads as a
# single 'train' split; confirm against the dataset loader.
bustm_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# Zero-shot generative setup: an A/B multiple-choice question; the model is
# expected to generate the answer letter.
bustm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?\nA. 无关\nB. 相关\n请从“A”,“B”中进行选择。\n答:',
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Accuracy on the first capital letter extracted from the BOT generation.
bustm_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

# Two evaluation splits: the few-shot dev pool and the public test set.
bustm_datasets = [
    dict(
        abbr='bustm-dev',
        type=AFQMCDatasetV2,  # bustm share the same format with AFQMC
        path='./data/FewCLUE/bustm/dev_few_all.json',
        local_mode=True,
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg,
    ),
    dict(
        abbr='bustm-test',
        type=AFQMCDatasetV2,  # bustm share the same format with AFQMC
        path='./data/FewCLUE/bustm/test_public.json',
        local_mode=True,
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg,
    ),
]
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default perplexity ("ppl") entry point for FewCLUE bustm: re-exports the
# dataset list from the pinned prompt-version file.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_bustm_ppl_e53034 import bustm_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_4b16c0.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

bustm_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring: each integer label maps to a full chat exchange whose BOT turn
# verbalizes that label; the lower-perplexity candidate wins.
bustm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0:
            dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt='请判断以下两句话说的是否是一个意思:')
                ],
                round=[
                    dict(role='HUMAN', prompt='{sentence1},{sentence2}'),
                    dict(role='BOT', prompt='两句话说的毫不相关。')
                ]),
            1:
            dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt='请判断以下两句话说的是否是一个意思:')
                ],
                round=[
                    dict(role='HUMAN', prompt='{sentence1},{sentence2}'),
                    # NOTE(review): '说是的' reads like a typo for '说的是';
                    # kept verbatim since changing it alters PPL scores.
                    dict(role='BOT', prompt='两句话说是的一个意思。')
                ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
bustm_datasets = [
    dict(
        type=HFDataset,
        abbr='bustm-dev',
        path='json',
        data_files='./data/FewCLUE/bustm/dev_few_all.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg),
    dict(
        type=HFDataset,
        abbr='bustm-test',
        path='json',
        data_files='./data/FewCLUE/bustm/test_public.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg)
]
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_9ef540.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

bustm_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring with plain-string templates (base-model style): one candidate
# string per label; the lower-perplexity candidate is the prediction.
bustm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0: '{sentence1}。\n{sentence2}。\n两句话说的毫不相关。',
            1: '{sentence1}。\n{sentence2}。\n两句话说的一个意思。'
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
bustm_datasets = [
    dict(
        type=HFDataset,
        abbr='bustm-dev',
        path='json',
        data_files='./data/FewCLUE/bustm/dev_few_all.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg),
    dict(
        type=HFDataset,
        abbr='bustm-test',
        path='json',
        data_files='./data/FewCLUE/bustm/test_public.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg)
]
|
opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_ppl_e53034.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

bustm_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring with chat-style templates: the HUMAN turn is identical for both
# labels; only the BOT verbalization differs, so perplexity compares labels.
bustm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0:
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?'
                ),
                dict(role='BOT', prompt='两句话说的毫不相关。')
            ]),
            1:
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?'
                ),
                # NOTE(review): '说是的' reads like a typo for '说的是'; kept
                # verbatim since changing it alters PPL scores.
                dict(role='BOT', prompt='两句话说是的一个意思。')
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

bustm_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
bustm_datasets = [
    dict(
        type=HFDataset,
        abbr='bustm-dev',
        path='json',
        data_files='./data/FewCLUE/bustm/dev_few_all.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg),
    dict(
        type=HFDataset,
        abbr='bustm-test',
        path='json',
        data_files='./data/FewCLUE/bustm/test_public.json',
        split='train',
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg)
]
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default generative ("gen") entry point for FewCLUE ocnli_fc: re-exports the
# dataset list from the pinned prompt-version file.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMNLIDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

# Reader: premise/hypothesis pair in, NLI label out.
ocnli_fc_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# Zero-shot generative setup: a three-way A/B/C multiple-choice question
# (对/错/可能 — entailment/contradiction/neutral).
ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                '阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}\nA. 对\nB. 错\nC. 可能\n请从“A”,“B”,“C”中进行选择。\n答:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)
# Accuracy on the first capital letter extracted from the BOT generation.
ocnli_fc_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

ocnli_fc_datasets = [
    dict(
        abbr='ocnli_fc-dev',
        type=CMNLIDatasetV2,  # ocnli_fc share the same format with cmnli
        path='./data/FewCLUE/ocnli/dev_few_all.json',
        local_mode=True,
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg,
    ),
    dict(
        abbr='ocnli_fc-test',
        type=CMNLIDatasetV2,  # ocnli_fc share the same format with cmnli
        path='./data/FewCLUE/ocnli/test_public.json',
        local_mode=True,
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg,
    ),
]
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default perplexity ("ppl") entry point for FewCLUE ocnli_fc: re-exports the
# dataset list from the pinned prompt-version file.
from mmengine.config import read_base

with read_base():
    from .FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_9e8b3d.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

ocnli_fc_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring with chat-style templates keyed by the string NLI label; the
# BOT turn verbalizes the label (错/对/可能).
# NOTE(review): the 'neutral' branch uses a different HUMAN prompt from the
# other two branches, which biases the perplexity comparison — looks
# intentional upstream but worth confirming.
ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
                dict(role='BOT', prompt='错')
            ]),
            'entailment':
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
                dict(role='BOT', prompt='对')
            ]),
            'neutral':
            dict(round=[
                dict(
                    role='HUMAN', prompt='如果{sentence1}为真,那么{sentence2}也为真吗?'),
                dict(role='BOT', prompt='可能')
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))
ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
ocnli_fc_datasets = [
    dict(
        type=HFDataset,
        abbr='ocnli_fc-dev',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/dev_few_all.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg),
    dict(
        type=HFDataset,
        abbr='ocnli_fc-test',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/test_public.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg)
]
|
opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_ppl_c08300.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

ocnli_fc_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

# PPL scoring with plain-string templates keyed by the string NLI label.
# NOTE(review): the 'neutral' candidate uses a different sentence frame (and
# no '答:' marker) than the other two; kept verbatim to preserve scores.
ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:错',
            'entailment': '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:对',
            'neutral': '如果{sentence1}为真,那么{sentence2}也为真吗?可能'
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))
ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

# path='json' + data_files follows the HuggingFace datasets JSON-loader
# convention; both files load under the 'train' split.
ocnli_fc_datasets = [
    dict(
        type=HFDataset,
        abbr='ocnli_fc-dev',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/dev_few_all.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg),
    dict(
        type=HFDataset,
        abbr='ocnli_fc-test',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/test_public.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg)
]
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default generative ("gen") entry point for GAOKAO-Bench: re-exports the
# dataset list from the pinned prompt-version file.
from mmengine.config import read_base

with read_base():
    from .GaokaoBench_gen_5cfe9e import GaokaoBench_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen_5cfe9e.py
ADDED
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.datasets import GaokaoBenchDataset
|
5 |
+
|
6 |
+
|
7 |
+
# Prompt specs for GAOKAO-Bench multiple-choice questions. Each entry holds:
#   type          — question type; the build loop below selects the evaluator
#                   'GaokaoBenchEvaluator_<type>' from it
#   keyword       — JSON file stem under Multiple-choice_Questions/
#   prefix_prompt — instruction text prepended to the raw question
#   comment       — free-form note (only present on some entries)
_MCQ_prompts = [
    {
        'type': 'single_choice',
        'keyword': '2010-2022_Math_II_MCQs',
        'prefix_prompt':
        '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'single_choice',
        'keyword': '2010-2022_Math_I_MCQs',
        'prefix_prompt':
        '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
        'comment': ''
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2022_History_MCQs',
        'prefix_prompt':
        '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2022_Biology_MCQs',
        'prefix_prompt':
        '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2022_Political_Science_MCQs',
        'prefix_prompt':
        '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    # Physics allows multiple correct options, hence 'multi_choice'.
    {
        'type':
        'multi_choice',
        'keyword':
        '2010-2022_Physics_MCQs',
        'prefix_prompt':
        '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2022_Chemistry_MCQs',
        'prefix_prompt':
        '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    {
        'type':
        'single_choice',
        'keyword':
        '2010-2013_English_MCQs',
        'prefix_prompt':
        '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
    },
    # 'multi_question_choice' items bundle several sub-questions per prompt.
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_Chinese_Modern_Lit',
        'prefix_prompt':
        '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_English_Fill_in_Blanks',
        'prefix_prompt':
        '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'five_out_of_seven',
        'keyword':
        '2012-2022_English_Cloze_Test',
        'prefix_prompt':
        '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n'
    },
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_Geography_MCQs',
        'prefix_prompt':
        '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_English_Reading_Comp',
        'prefix_prompt':
        '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
    },
    {
        'type':
        'multi_question_choice',
        'keyword':
        '2010-2022_Chinese_Lang_and_Usage_MCQs',
        'prefix_prompt':
        '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:'
    },
]
|
119 |
+
# Prompt specs for GAOKAO-Bench fill-in-the-blank ('cloze') questions; same
# entry schema as _MCQ_prompts, with files under Fill-in-the-blank_Questions/.
_FBQ_prompts = [{
    'type': 'cloze',
    'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
    'prefix_prompt':
    '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
    'comment': ''
}, {
    'type': 'cloze',
    'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
    'prefix_prompt':
    '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
    'comment': ''
}, {
    'type': 'cloze',
    'keyword':
    '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
    'prefix_prompt':
    '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
    'comment': ''
}, {
    'type': 'cloze',
    'keyword': '2014-2022_English_Language_Cloze_Passage',
    'prefix_prompt':
    '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
    'comment': ''
}]
|
145 |
+
# Prompt specs for GAOKAO-Bench open-ended questions; same entry schema as
# _MCQ_prompts, with files under Open-ended_Questions/.
#
# Fixes vs. the imported text: two prompts contained U+FFFD replacement
# characters from a broken UTF-8 import ('如果\ufffd\ufffd\ufffd止一道题' and
# '一\ufffd\ufffd\ufffd步思考'); restored to '如果不止一道题' / '一步步思考'
# to match the identical phrasing used by every other prompt in this file.
_OEQ_prompts = [
    {
        'type': 'subjective',
        'keyword': '2010-2022_Geography_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chemistry_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Math_I_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_History_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Biology_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Math_II_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Physics_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Political_Science_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'correction',
        'keyword': '2012-2022_English_Language_Error_Correction',
        # NOTE(review): '结合你你已有的知识' (doubled 你) is kept verbatim —
        # it appears to be an upstream typo, but changing the prompt would
        # change model behavior.
        'prefix_prompt':
        '请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
        'prefix_prompt':
        '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
        # NOTE(review): '(1)[答案】' mixes a half-width '[' with a full-width
        # '】' — kept verbatim (same in the two entries below).
        'prefix_prompt':
        '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
        'prefix_prompt':
        '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
        'prefix_prompt':
        '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    },
    {
        'type': 'subjective',
        'keyword':
        '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
        'prefix_prompt':
        '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
        'comment': ''
    }
]
|
250 |
+
|
251 |
+
# Build one dataset config per prompt spec: the folder name picks the JSON
# subdirectory, and each spec contributes its keyword/prefix_prompt.
GaokaoBench_datasets = []
for _folder, _prompts in [
    ('Multiple-choice_Questions', _MCQ_prompts),
    ('Fill-in-the-blank_Questions', _FBQ_prompts),
    ('Open-ended_Questions', _OEQ_prompts),
]:
    for _p in _prompts:
        _reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        _infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                # Zero-shot generation: the spec's instruction text is simply
                # prepended to the raw question.
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _p['prefix_prompt'] + '{question}'
                    }]
                },
                'ice_token': '</E>'
            },
            'retriever': {
                'type': ZeroRetriever
            },
            'inferencer': {
                'type': GenInferencer,
                'max_out_len': 1024,
            }
        }
        _eval_cfg = {
            # The evaluator class is selected by question type, e.g.
            # 'GaokaoBenchEvaluator_single_choice'.
            'evaluator': {
                'type': 'GaokaoBenchEvaluator' + '_' + _p['type'],
            },
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        _dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + _p['keyword'],
            'path': _base_path,
            'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
            'name': _p['keyword'],
            'reader_cfg': _reader_cfg,
            'infer_cfg': _infer_cfg,
            'eval_cfg': _eval_cfg,
        }

        GaokaoBench_datasets.append(_dataset)

# Drop every underscore-prefixed helper from the module namespace so only
# GaokaoBench_datasets is exported by this config.
_temporary_variables = [k for k in globals() if k.startswith('_')]
for _t in _temporary_variables:
    del globals()[_t]
del _temporary_variables, _t
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .GaokaoBench_mixed_9af5ee import GaokaoBench_datasets # noqa: F401, F403
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_mixed_9af5ee.py
ADDED
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
|
4 |
+
from opencompass.datasets import GaokaoBenchDataset
|
5 |
+
_MCQ_prompts = [
|
6 |
+
{
|
7 |
+
'type': 'single_choice',
|
8 |
+
'keyword': '2010-2022_Math_II_MCQs',
|
9 |
+
'prefix_prompt':
|
10 |
+
'请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
11 |
+
'comment': ''
|
12 |
+
},
|
13 |
+
{
|
14 |
+
'type': 'single_choice',
|
15 |
+
'keyword': '2010-2022_Math_I_MCQs',
|
16 |
+
'prefix_prompt':
|
17 |
+
'请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
18 |
+
'comment': ''
|
19 |
+
},
|
20 |
+
{
|
21 |
+
'type':
|
22 |
+
'single_choice',
|
23 |
+
'keyword':
|
24 |
+
'2010-2022_History_MCQs',
|
25 |
+
'prefix_prompt':
|
26 |
+
'请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
27 |
+
},
|
28 |
+
{
|
29 |
+
'type':
|
30 |
+
'single_choice',
|
31 |
+
'keyword':
|
32 |
+
'2010-2022_Biology_MCQs',
|
33 |
+
'prefix_prompt':
|
34 |
+
'请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
35 |
+
},
|
36 |
+
{
|
37 |
+
'type':
|
38 |
+
'single_choice',
|
39 |
+
'keyword':
|
40 |
+
'2010-2022_Political_Science_MCQs',
|
41 |
+
'prefix_prompt':
|
42 |
+
'请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
43 |
+
},
|
44 |
+
{
|
45 |
+
'type':
|
46 |
+
'multi_choice',
|
47 |
+
'keyword':
|
48 |
+
'2010-2022_Physics_MCQs',
|
49 |
+
'prefix_prompt':
|
50 |
+
'请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n'
|
51 |
+
},
|
52 |
+
{
|
53 |
+
'type':
|
54 |
+
'single_choice',
|
55 |
+
'keyword':
|
56 |
+
'2010-2022_Chemistry_MCQs',
|
57 |
+
'prefix_prompt':
|
58 |
+
'请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
59 |
+
},
|
60 |
+
{
|
61 |
+
'type':
|
62 |
+
'single_choice',
|
63 |
+
'keyword':
|
64 |
+
'2010-2013_English_MCQs',
|
65 |
+
'prefix_prompt':
|
66 |
+
'请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:'
|
67 |
+
},
|
68 |
+
{
|
69 |
+
'type':
|
70 |
+
'multi_question_choice',
|
71 |
+
'keyword':
|
72 |
+
'2010-2022_Chinese_Modern_Lit',
|
73 |
+
'prefix_prompt':
|
74 |
+
'请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
75 |
+
},
|
76 |
+
{
|
77 |
+
'type':
|
78 |
+
'multi_question_choice',
|
79 |
+
'keyword':
|
80 |
+
'2010-2022_English_Fill_in_Blanks',
|
81 |
+
'prefix_prompt':
|
82 |
+
'请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
83 |
+
},
|
84 |
+
{
|
85 |
+
'type':
|
86 |
+
'five_out_of_seven',
|
87 |
+
'keyword':
|
88 |
+
'2012-2022_English_Cloze_Test',
|
89 |
+
'prefix_prompt':
|
90 |
+
'请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n'
|
91 |
+
},
|
92 |
+
{
|
93 |
+
'type':
|
94 |
+
'multi_question_choice',
|
95 |
+
'keyword':
|
96 |
+
'2010-2022_Geography_MCQs',
|
97 |
+
'prefix_prompt':
|
98 |
+
'请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
99 |
+
},
|
100 |
+
{
|
101 |
+
'type':
|
102 |
+
'multi_question_choice',
|
103 |
+
'keyword':
|
104 |
+
'2010-2022_English_Reading_Comp',
|
105 |
+
'prefix_prompt':
|
106 |
+
'请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n'
|
107 |
+
},
|
108 |
+
{
|
109 |
+
'type':
|
110 |
+
'multi_question_choice',
|
111 |
+
'keyword':
|
112 |
+
'2010-2022_Chinese_Lang_and_Usage_MCQs',
|
113 |
+
'prefix_prompt':
|
114 |
+
'请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:'
|
115 |
+
},
|
116 |
+
]
|
117 |
+
_FBQ_prompts = [{
|
118 |
+
'type': 'cloze',
|
119 |
+
'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
|
120 |
+
'prefix_prompt':
|
121 |
+
'请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
122 |
+
'comment': ''
|
123 |
+
}, {
|
124 |
+
'type': 'cloze',
|
125 |
+
'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
|
126 |
+
'prefix_prompt':
|
127 |
+
'请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
128 |
+
'comment': ''
|
129 |
+
}, {
|
130 |
+
'type': 'cloze',
|
131 |
+
'keyword':
|
132 |
+
'2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
|
133 |
+
'prefix_prompt':
|
134 |
+
'请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
135 |
+
'comment': ''
|
136 |
+
}, {
|
137 |
+
'type': 'cloze',
|
138 |
+
'keyword': '2014-2022_English_Language_Cloze_Passage',
|
139 |
+
'prefix_prompt':
|
140 |
+
'请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
141 |
+
'comment': ''
|
142 |
+
}]
|
143 |
+
_OEQ_prompts = [
|
144 |
+
{
|
145 |
+
'type': 'subjective',
|
146 |
+
'keyword': '2010-2022_Geography_Open-ended_Questions',
|
147 |
+
'prefix_prompt':
|
148 |
+
'请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
149 |
+
'comment': ''
|
150 |
+
},
|
151 |
+
{
|
152 |
+
'type': 'subjective',
|
153 |
+
'keyword': '2010-2022_Chemistry_Open-ended_Questions',
|
154 |
+
'prefix_prompt':
|
155 |
+
'请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
156 |
+
'comment': ''
|
157 |
+
},
|
158 |
+
{
|
159 |
+
'type': 'subjective',
|
160 |
+
'keyword': '2010-2022_Math_I_Open-ended_Questions',
|
161 |
+
'prefix_prompt':
|
162 |
+
'请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
163 |
+
'comment': ''
|
164 |
+
},
|
165 |
+
{
|
166 |
+
'type': 'subjective',
|
167 |
+
'keyword': '2010-2022_History_Open-ended_Questions',
|
168 |
+
'prefix_prompt':
|
169 |
+
'请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
170 |
+
'comment': ''
|
171 |
+
},
|
172 |
+
{
|
173 |
+
'type': 'subjective',
|
174 |
+
'keyword': '2010-2022_Biology_Open-ended_Questions',
|
175 |
+
'prefix_prompt':
|
176 |
+
'请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
177 |
+
'comment': ''
|
178 |
+
},
|
179 |
+
{
|
180 |
+
'type': 'subjective',
|
181 |
+
'keyword': '2010-2022_Math_II_Open-ended_Questions',
|
182 |
+
'prefix_prompt':
|
183 |
+
'请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
184 |
+
'comment': ''
|
185 |
+
},
|
186 |
+
{
|
187 |
+
'type': 'subjective',
|
188 |
+
'keyword': '2010-2022_Physics_Open-ended_Questions',
|
189 |
+
'prefix_prompt':
|
190 |
+
'请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
191 |
+
'comment': ''
|
192 |
+
},
|
193 |
+
{
|
194 |
+
'type': 'subjective',
|
195 |
+
'keyword': '2010-2022_Political_Science_Open-ended_Questions',
|
196 |
+
'prefix_prompt':
|
197 |
+
'请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
198 |
+
'comment': ''
|
199 |
+
},
|
200 |
+
{
|
201 |
+
'type': 'correction',
|
202 |
+
'keyword': '2012-2022_English_Language_Error_Correction',
|
203 |
+
'prefix_prompt':
|
204 |
+
'请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方��请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
|
205 |
+
# "prefix_prompt": [
|
206 |
+
# "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
|
207 |
+
# "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
|
208 |
+
# ],
|
209 |
+
'comment': ''
|
210 |
+
},
|
211 |
+
{
|
212 |
+
'type': 'subjective',
|
213 |
+
'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
|
214 |
+
'prefix_prompt':
|
215 |
+
'请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
216 |
+
'comment': ''
|
217 |
+
},
|
218 |
+
{
|
219 |
+
'type': 'subjective',
|
220 |
+
'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
|
221 |
+
'prefix_prompt':
|
222 |
+
'请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
223 |
+
'comment': ''
|
224 |
+
},
|
225 |
+
{
|
226 |
+
'type': 'subjective',
|
227 |
+
'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
|
228 |
+
'prefix_prompt':
|
229 |
+
'请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
230 |
+
'comment': ''
|
231 |
+
},
|
232 |
+
{
|
233 |
+
'type': 'subjective',
|
234 |
+
'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
|
235 |
+
'prefix_prompt':
|
236 |
+
'请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
237 |
+
'comment': ''
|
238 |
+
},
|
239 |
+
{
|
240 |
+
'type': 'subjective',
|
241 |
+
'keyword':
|
242 |
+
'2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
|
243 |
+
'prefix_prompt':
|
244 |
+
'请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
245 |
+
'comment': ''
|
246 |
+
}
|
247 |
+
]
|
248 |
+
|
249 |
+
# Generative half of the mixed config: every prompt spec EXCEPT single-choice
# MCQs is evaluated with free-form generation; single-choice questions are
# handled by the perplexity-based loop further below.
GaokaoBench_datasets = []
for _folder, _prompts in [
    ('Multiple-choice_Questions', _MCQ_prompts),
    ('Fill-in-the-blank_Questions', _FBQ_prompts),
    ('Open-ended_Questions', _OEQ_prompts),
]:
    for _p in _prompts:
        if _p['type'] == 'single_choice':
            continue  # deferred to the PPL loop below
        _reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        _infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _p['prefix_prompt'] + '{question}'
                    }]
                },
                'ice_token': '</E>'
            },
            'retriever': {
                'type': ZeroRetriever
            },
            'inferencer': {
                'type': GenInferencer,
                'max_out_len': 1024,
            }
        }
        _eval_cfg = {
            'evaluator': {
                # Resolved by string name, e.g. 'GaokaoBenchEvaluator_cloze'.
                'type': 'GaokaoBenchEvaluator' + '_' + _p['type'],
            },
            'pred_role': 'BOT',
        }
        # BUGFIX: was './data/GAOKAO-BENCH/data', which disagreed with the
        # PPL loop later in this same file and with every sibling
        # GaokaoBench config; use one dataset root for the whole file.
        _base_path = 'opencompass/GAOKAO-BENCH'
        _dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + _p['keyword'],
            'path': _base_path,
            'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
            'name': _p['keyword'],
            'reader_cfg': _reader_cfg,
            'infer_cfg': _infer_cfg,
            'eval_cfg': _eval_cfg,
        }

        GaokaoBench_datasets.append(_dataset)
|
300 |
+
|
301 |
+
# Perplexity half of the mixed config: each single-choice MCQ gets one
# candidate completion per option letter; the option whose completion is most
# likely under the model wins.
_folder = 'Multiple-choice_Questions'
for _p in _MCQ_prompts:
    if _p['type'] != 'single_choice':
        continue
    _reader_cfg = {'input_columns': ['question'], 'output_column': 'answer'}
    _infer_cfg = {
        'ice_template': {
            'type': PromptTemplate,
            # One template per option letter; the gold 'answer' column selects
            # which completion should score lowest perplexity.
            'template': {
                answer: {
                    'round': [{
                        'role': 'HUMAN',
                        'prompt': _p['prefix_prompt'] + '{question}'
                    }, {
                        'role': 'BOT',
                        'prompt': f'【答案】{answer} <eoa>'
                    }]
                }
                for answer in ['A', 'B', 'C', 'D']
            },
            'ice_token': '</E>'
        },
        'retriever': {'type': ZeroRetriever},
        'inferencer': {'type': PPLInferencer},
    }
    _eval_cfg = {
        'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + _p['type']},
        'pred_role': 'BOT',
    }
    _base_path = 'opencompass/GAOKAO-BENCH'
    GaokaoBench_datasets.append({
        'type': GaokaoBenchDataset,
        'abbr': 'GaokaoBench_' + _p['keyword'],
        'path': _base_path,
        'filename': '/' + _folder + '/' + _p['keyword'] + '.json',
        'name': _p['keyword'],
        'reader_cfg': _reader_cfg,
        'infer_cfg': _infer_cfg,
        'eval_cfg': _eval_cfg,
    })

# Sweep every '_'-prefixed temporary so that only GaokaoBench_datasets
# remains exported from this config module.
_temporary_variables = [k for k in globals() if k.startswith('_')]
for _t in _temporary_variables:
    del globals()[_t]
del _temporary_variables, _t
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_4c31db.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
3 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
4 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
5 |
+
from opencompass.datasets import GaokaoBenchDataset
|
6 |
+
from mmengine.config import read_base
|
7 |
+
|
8 |
+
with read_base():
|
9 |
+
from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
|
10 |
+
|
11 |
+
# Build one generation-based dataset config per MCQ / fill-in-the-blank
# prompt spec; subjective (open-ended) questions are intentionally excluded.
# NOTE: top-level variable names are kept as-is because mmengine configs
# export every module-level name.
GaokaoBench_datasets = []
for folder, prompts in [
    ('Multiple-choice_Questions', MCQ_prompts),
    ('Fill-in-the-blank_Questions', FBQ_prompts),
]:
    for p in prompts:
        reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        infer_cfg = {
            'ice_template': {
                'type': PromptTemplate,
                'template': {
                    'round': [{
                        'role': 'HUMAN',
                        # Zero-shot: prefix instructions + raw question text.
                        'prompt': p['prefix_prompt'] + '{question}',
                    }],
                },
                'ice_token': '</E>',
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
        }
        eval_cfg = {
            # Evaluator class is looked up by string name per question type.
            'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + p['keyword'],
            'path': _base_path,
            'filename': '/' + folder + '/' + p['keyword'] + '.json',
            'name': p['keyword'],
            'reader_cfg': reader_cfg,
            'infer_cfg': infer_cfg,
            'eval_cfg': eval_cfg,
        }
        GaokaoBench_datasets.append(dataset)
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d21e37.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
3 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
4 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
5 |
+
from opencompass.datasets import GaokaoBenchDataset
|
6 |
+
from mmengine.config import read_base
|
7 |
+
|
8 |
+
with read_base():
|
9 |
+
from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts
|
10 |
+
|
11 |
+
# Base-model (non-chat) variant: uses a plain string prompt_template instead
# of a chat-round ice_template. Subjective open-ended questions are excluded.
# NOTE: top-level variable names are kept as-is because mmengine configs
# export every module-level name.
GaokaoBench_datasets = []
for folder, prompts in [
    ('Multiple-choice_Questions', MCQ_prompts),
    ('Fill-in-the-blank_Questions', FBQ_prompts),
]:
    for p in prompts:
        reader_cfg = {
            'input_columns': ['question'],
            'output_column': 'answer',
        }
        infer_cfg = {
            'prompt_template': {
                'type': PromptTemplate,
                # Prefix instructions followed by the raw question text.
                'template': p['prefix_prompt'] + '{question}',
            },
            'retriever': {'type': ZeroRetriever},
            'inferencer': {'type': GenInferencer, 'max_out_len': 1024},
        }
        eval_cfg = {
            # Evaluator class is looked up by string name per question type.
            'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']},
            'pred_role': 'BOT',
        }
        _base_path = 'opencompass/GAOKAO-BENCH'
        dataset = {
            'type': GaokaoBenchDataset,
            'abbr': 'GaokaoBench_' + p['keyword'],
            'path': _base_path,
            'filename': '/' + folder + '/' + p['keyword'] + '.json',
            'name': p['keyword'],
            'reader_cfg': reader_cfg,
            'infer_cfg': infer_cfg,
            'eval_cfg': eval_cfg,
        }
        GaokaoBench_datasets.append(dataset)
|
opencompass/configs/datasets/GaokaoBench/GaokaoBench_prompts.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
MCQ_prompts = [
|
3 |
+
{
|
4 |
+
'type': 'single_choice',
|
5 |
+
'keyword': '2010-2022_Math_II_MCQs',
|
6 |
+
'prefix_prompt': '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
7 |
+
'comment': '',
|
8 |
+
},
|
9 |
+
{
|
10 |
+
'type': 'single_choice',
|
11 |
+
'keyword': '2010-2022_Math_I_MCQs',
|
12 |
+
'prefix_prompt': '请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
13 |
+
'comment': '',
|
14 |
+
},
|
15 |
+
{
|
16 |
+
'type': 'single_choice',
|
17 |
+
'keyword': '2010-2022_History_MCQs',
|
18 |
+
'prefix_prompt': '请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
19 |
+
},
|
20 |
+
{
|
21 |
+
'type': 'single_choice',
|
22 |
+
'keyword': '2010-2022_Biology_MCQs',
|
23 |
+
'prefix_prompt': '请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
24 |
+
},
|
25 |
+
{
|
26 |
+
'type': 'single_choice',
|
27 |
+
'keyword': '2010-2022_Political_Science_MCQs',
|
28 |
+
'prefix_prompt': '请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
29 |
+
},
|
30 |
+
{
|
31 |
+
'type': 'multi_choice',
|
32 |
+
'keyword': '2010-2022_Physics_MCQs',
|
33 |
+
'prefix_prompt': '请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n',
|
34 |
+
},
|
35 |
+
{
|
36 |
+
'type': 'single_choice',
|
37 |
+
'keyword': '2010-2022_Chemistry_MCQs',
|
38 |
+
'prefix_prompt': '请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
39 |
+
},
|
40 |
+
{
|
41 |
+
'type': 'single_choice',
|
42 |
+
'keyword': '2010-2013_English_MCQs',
|
43 |
+
'prefix_prompt': '请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
44 |
+
},
|
45 |
+
{
|
46 |
+
'type': 'multi_question_choice',
|
47 |
+
'keyword': '2010-2022_Chinese_Modern_Lit',
|
48 |
+
'prefix_prompt': '请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
|
49 |
+
},
|
50 |
+
{
|
51 |
+
'type': 'multi_question_choice',
|
52 |
+
'keyword': '2010-2022_English_Fill_in_Blanks',
|
53 |
+
'prefix_prompt': '请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
|
54 |
+
},
|
55 |
+
{
|
56 |
+
'type': 'five_out_of_seven',
|
57 |
+
'keyword': '2012-2022_English_Cloze_Test',
|
58 |
+
'prefix_prompt': '请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n',
|
59 |
+
},
|
60 |
+
{
|
61 |
+
'type': 'multi_question_choice',
|
62 |
+
'keyword': '2010-2022_Geography_MCQs',
|
63 |
+
'prefix_prompt': '请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
|
64 |
+
},
|
65 |
+
{
|
66 |
+
'type': 'multi_question_choice',
|
67 |
+
'keyword': '2010-2022_English_Reading_Comp',
|
68 |
+
'prefix_prompt': '请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n',
|
69 |
+
},
|
70 |
+
{
|
71 |
+
'type': 'multi_question_choice',
|
72 |
+
'keyword': '2010-2022_Chinese_Lang_and_Usage_MCQs',
|
73 |
+
'prefix_prompt': '请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:',
|
74 |
+
},
|
75 |
+
]
|
76 |
+
FBQ_prompts = [
|
77 |
+
{
|
78 |
+
'type': 'cloze',
|
79 |
+
'keyword': '2010-2022_Math_I_Fill-in-the-Blank',
|
80 |
+
'prefix_prompt': '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
81 |
+
'comment': '',
|
82 |
+
},
|
83 |
+
{
|
84 |
+
'type': 'cloze',
|
85 |
+
'keyword': '2010-2022_Math_II_Fill-in-the-Blank',
|
86 |
+
'prefix_prompt': '请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:',
|
87 |
+
'comment': '',
|
88 |
+
},
|
89 |
+
{
|
90 |
+
'type': 'cloze',
|
91 |
+
'keyword': '2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation',
|
92 |
+
'prefix_prompt': '请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
93 |
+
'comment': '',
|
94 |
+
},
|
95 |
+
{
|
96 |
+
'type': 'cloze',
|
97 |
+
'keyword': '2014-2022_English_Language_Cloze_Passage',
|
98 |
+
'prefix_prompt': '请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
99 |
+
'comment': '',
|
100 |
+
},
|
101 |
+
]
|
102 |
+
OEQ_prompts = [
|
103 |
+
{
|
104 |
+
'type': 'subjective',
|
105 |
+
'keyword': '2010-2022_Geography_Open-ended_Questions',
|
106 |
+
'prefix_prompt': '请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
107 |
+
'comment': '',
|
108 |
+
},
|
109 |
+
{
|
110 |
+
'type': 'subjective',
|
111 |
+
'keyword': '2010-2022_Chemistry_Open-ended_Questions',
|
112 |
+
'prefix_prompt': '请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
113 |
+
'comment': '',
|
114 |
+
},
|
115 |
+
{
|
116 |
+
'type': 'subjective',
|
117 |
+
'keyword': '2010-2022_Math_I_Open-ended_Questions',
|
118 |
+
'prefix_prompt': '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
119 |
+
'comment': '',
|
120 |
+
},
|
121 |
+
{
|
122 |
+
'type': 'subjective',
|
123 |
+
'keyword': '2010-2022_History_Open-ended_Questions',
|
124 |
+
'prefix_prompt': '请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
125 |
+
'comment': '',
|
126 |
+
},
|
127 |
+
{
|
128 |
+
'type': 'subjective',
|
129 |
+
'keyword': '2010-2022_Biology_Open-ended_Questions',
|
130 |
+
'prefix_prompt': '请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
131 |
+
'comment': '',
|
132 |
+
},
|
133 |
+
{
|
134 |
+
'type': 'subjective',
|
135 |
+
'keyword': '2010-2022_Math_II_Open-ended_Questions',
|
136 |
+
'prefix_prompt': '请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
137 |
+
'comment': '',
|
138 |
+
},
|
139 |
+
{
|
140 |
+
'type': 'subjective',
|
141 |
+
'keyword': '2010-2022_Physics_Open-ended_Questions',
|
142 |
+
'prefix_prompt': '请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
143 |
+
'comment': '',
|
144 |
+
},
|
145 |
+
{
|
146 |
+
'type': 'subjective',
|
147 |
+
'keyword': '2010-2022_Political_Science_Open-ended_Questions',
|
148 |
+
'prefix_prompt': '请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
149 |
+
'comment': '',
|
150 |
+
},
|
151 |
+
{
|
152 |
+
'type': 'correction',
|
153 |
+
'keyword': '2012-2022_English_Language_Error_Correction',
|
154 |
+
'prefix_prompt': '请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:',
|
155 |
+
# "prefix_prompt": [
|
156 |
+
# "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
|
157 |
+
# "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
|
158 |
+
# ],
|
159 |
+
'comment': '',
|
160 |
+
},
|
161 |
+
{
|
162 |
+
'type': 'subjective',
|
163 |
+
'keyword': '2010-2022_Chinese_Language_Ancient_Poetry_Reading',
|
164 |
+
'prefix_prompt': '请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
165 |
+
'comment': '',
|
166 |
+
},
|
167 |
+
{
|
168 |
+
'type': 'subjective',
|
169 |
+
'keyword': '2010-2022_Chinese_Language_Practical_Text_Reading',
|
170 |
+
'prefix_prompt': '请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
171 |
+
'comment': '',
|
172 |
+
},
|
173 |
+
{
|
174 |
+
'type': 'subjective',
|
175 |
+
'keyword': '2010-2022_Chinese_Language_Literary_Text_Reading',
|
176 |
+
'prefix_prompt': '请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
177 |
+
'comment': '',
|
178 |
+
},
|
179 |
+
{
|
180 |
+
'type': 'subjective',
|
181 |
+
'keyword': '2010-2022_Chinese_Language_Classical_Chinese_Reading',
|
182 |
+
'prefix_prompt': '请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:',
|
183 |
+
'comment': '',
|
184 |
+
},
|
185 |
+
{
|
186 |
+
'type': 'subjective',
|
187 |
+
'keyword': '2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions',
|
188 |
+
'prefix_prompt': '请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:',
|
189 |
+
'comment': '',
|
190 |
+
},
|
191 |
+
]
|
opencompass/configs/datasets/GaokaoBench/README.md
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# GaokaoBench
|
2 |
+
|
3 |
+
```bash
|
4 |
+
python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
|
5 |
+
python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
|
6 |
+
```
|
7 |
+
|
8 |
+
## Base Models
|
9 |
+
|
10 |
+
| model | GaokaoBench |
|
11 |
+
|:------------------------:|--------------:|
|
12 |
+
| llama-7b-turbomind | 14.55 |
|
13 |
+
| llama-13b-turbomind | 16.20 |
|
14 |
+
| llama-30b-turbomind | 16.14 |
|
15 |
+
| llama-65b-turbomind | 13.31 |
|
16 |
+
| llama-2-7b-turbomind | 15.02 |
|
17 |
+
| llama-2-13b-turbomind | 14.86 |
|
18 |
+
| llama-2-70b-turbomind | 16.36 |
|
19 |
+
| llama-3-8b-turbomind | 20.88 |
|
20 |
+
| llama-3-70b-turbomind | 19.98 |
|
21 |
+
| internlm2-1.8b-turbomind | 23.78 |
|
22 |
+
| internlm2-7b-turbomind | 41.41 |
|
23 |
+
| internlm2-20b-turbomind | 58.99 |
|
24 |
+
| qwen-1.8b-turbomind | 22.11 |
|
25 |
+
| qwen-7b-turbomind | 35.32 |
|
26 |
+
| qwen-14b-turbomind | 54.07 |
|
27 |
+
| qwen-72b-turbomind | 77.56 |
|
28 |
+
| qwen1.5-0.5b-hf | 30.67 |
|
29 |
+
| qwen1.5-1.8b-hf | 35.66 |
|
30 |
+
| qwen1.5-4b-hf | 54.31 |
|
31 |
+
| qwen1.5-7b-hf | 65.99 |
|
32 |
+
| qwen1.5-14b-hf | 66.60 |
|
33 |
+
| qwen1.5-32b-hf | 79.01 |
|
34 |
+
| qwen1.5-72b-hf | 80.26 |
|
35 |
+
| qwen1.5-moe-a2-7b-hf | 52.79 |
|
36 |
+
| mistral-7b-v0.1-hf | 14.35 |
|
37 |
+
| mistral-7b-v0.2-hf | 11.10 |
|
38 |
+
| mixtral-8x7b-v0.1-hf | 8.40 |
|
39 |
+
| mixtral-8x22b-v0.1-hf | 16.23 |
|
40 |
+
| yi-6b-hf | 31.70 |
|
41 |
+
| yi-34b-hf | 30.51 |
|
42 |
+
| deepseek-7b-base-hf | 17.02 |
|
43 |
+
| deepseek-67b-base-hf | 10.14 |
|
44 |
+
|
45 |
+
### Details
|
46 |
+
|
47 |
+
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|
48 |
+
|:------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
|
49 |
+
| llama-7b-turbomind | 14.22 | 13.55 | 12.54 | 18.67 | 19.06 | 2.34 | 17.74 |
|
50 |
+
| llama-13b-turbomind | 18.81 | 15.89 | 21.25 | 22.67 | 15.62 | 1.56 | 25.81 |
|
51 |
+
| llama-30b-turbomind | 20.64 | 19.16 | 27.18 | 16.67 | 16.56 | 2.34 | 12.10 |
|
52 |
+
| llama-65b-turbomind | 21.10 | 15.89 | 11.50 | 20.00 | 5.94 | 1.56 | 21.77 |
|
53 |
+
| llama-2-7b-turbomind | 16.97 | 16.36 | 20.91 | 22.00 | 18.75 | 2.34 | 11.29 |
|
54 |
+
| llama-2-13b-turbomind | 14.68 | 11.68 | 26.13 | 16.00 | 17.81 | 2.34 | 20.97 |
|
55 |
+
| llama-2-70b-turbomind | 18.81 | 12.15 | 26.13 | 16.00 | 20.31 | 4.69 | 16.13 |
|
56 |
+
| llama-3-8b-turbomind | 4.13 | 7.94 | 37.63 | 24.67 | 26.25 | 5.47 | 21.77 |
|
57 |
+
| llama-3-70b-turbomind | 4.59 | 3.12 | 20.83 | 10.94 | 18.00 | 6.25 | 15.62 |
|
58 |
+
| internlm2-1.8b-turbomind | 20.64 | 22.90 | 39.72 | 30.00 | 25.94 | 10.94 | 31.45 |
|
59 |
+
| internlm2-7b-turbomind | 33.94 | 35.51 | 38.33 | 59.33 | 61.56 | 2.34 | 11.29 |
|
60 |
+
| internlm2-20b-turbomind | 59.17 | 51.40 | 65.16 | 74.00 | 82.19 | 28.91 | 54.03 |
|
61 |
+
| qwen-1.8b-turbomind | 29.36 | 30.84 | 19.51 | 26.00 | 22.19 | 5.47 | 27.42 |
|
62 |
+
| qwen-7b-turbomind | 22.48 | 28.04 | 45.64 | 43.33 | 62.19 | 3.91 | 33.87 |
|
63 |
+
| qwen-14b-turbomind | 54.13 | 56.25 | 82.93 | 72.00 | 85.00 | 4.69 | 65.62 |
|
64 |
+
| qwen-72b-turbomind | 73.12 | 64.49 | 91.67 | 90.62 | 58.75 | 44.53 | 79.03 |
|
65 |
+
| qwen1.5-0.5b-hf | 26.61 | 32.71 | 32.40 | 34.67 | 53.44 | 10.94 | 28.23 |
|
66 |
+
| qwen1.5-1.8b-hf | 36.24 | 33.18 | 56.45 | 36.00 | 49.38 | 6.25 | 33.06 |
|
67 |
+
| qwen1.5-4b-hf | 45.41 | 37.85 | 68.29 | 62.00 | 87.81 | 5.47 | 47.58 |
|
68 |
+
| qwen1.5-7b-hf | 56.42 | 53.74 | 85.02 | 69.33 | 86.88 | 28.12 | 70.16 |
|
69 |
+
| qwen1.5-14b-hf | 69.27 | 63.08 | 54.01 | 79.33 | 76.56 | 40.62 | 79.84 |
|
70 |
+
| qwen1.5-32b-hf | 71.10 | 61.68 | 92.68 | 93.33 | 95.94 | 45.31 | 83.06 |
|
71 |
+
| qwen1.5-72b-hf | 71.15 | 68.22 | 94.44 | 96.67 | 95.00 | 38.28 | 75.00 |
|
72 |
+
| qwen1.5-moe-a2-7b-hf | 35.32 | 29.44 | 68.64 | 44.67 | 75.00 | 17.97 | 59.68 |
|
73 |
+
| mistral-7b-v0.1-hf | 13.76 | 12.15 | 9.76 | 8.00 | 5.94 | 0.00 | 17.74 |
|
74 |
+
| mistral-7b-v0.2-hf | 6.88 | 5.61 | 10.45 | 12.00 | 4.06 | 0.78 | 14.52 |
|
75 |
+
| mixtral-8x7b-v0.1-hf | 3.67 | 1.87 | 0.35 | 0.00 | 0.00 | 0.78 | 0.81 |
|
76 |
+
| mixtral-8x22b-v0.1-hf | 16.51 | 15.89 | 1.39 | 3.33 | 9.69 | 0.00 | 13.71 |
|
77 |
+
| yi-6b-hf | 6.25 | 3.12 | 40.74 | 43.75 | 35.94 | 8.59 | 31.25 |
|
78 |
+
| yi-34b-hf | 12.50 | 4.17 | 31.11 | 5.00 | 20.62 | 2.34 | 0.89 |
|
79 |
+
| deepseek-7b-base-hf | 14.22 | 13.08 | 25.78 | 20.67 | 20.31 | 5.47 | 18.55 |
|
80 |
+
| deepseek-67b-base-hf | 3.67 | 4.21 | 8.36 | 7.33 | 4.69 | 1.56 | 4.84 |
|
81 |
+
|
82 |
+
| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|
83 |
+
|:------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
|
84 |
+
| llama-7b-turbomind | 19.05 | 0.00 | 15.00 | 16.15 | 22.11 | 10.43 | 15.00 |
|
85 |
+
| llama-13b-turbomind | 22.86 | 0.00 | 8.50 | 8.46 | 24.21 | 9.36 | 20.00 |
|
86 |
+
| llama-30b-turbomind | 28.57 | 0.00 | 6.33 | 13.85 | 23.16 | 12.98 | 12.50 |
|
87 |
+
| llama-65b-turbomind | 21.90 | 0.00 | 8.00 | 13.85 | 16.84 | 12.34 | 10.00 |
|
88 |
+
| llama-2-7b-turbomind | 20.95 | 0.00 | 6.17 | 12.31 | 22.11 | 11.28 | 11.25 |
|
89 |
+
| llama-2-13b-turbomind | 16.19 | 0.00 | 9.83 | 13.08 | 22.11 | 7.66 | 10.00 |
|
90 |
+
| llama-2-70b-turbomind | 31.43 | 0.00 | 4.17 | 13.08 | 25.26 | 20.43 | 7.50 |
|
91 |
+
| llama-3-8b-turbomind | 1.90 | 1.15 | 42.00 | 7.69 | 29.47 | 17.66 | 17.50 |
|
92 |
+
| llama-3-70b-turbomind | 18.75 | 3.45 | 53.67 | 76.15 | 18.60 | 36.76 | 8.75 |
|
93 |
+
| internlm2-1.8b-turbomind | 33.33 | 3.45 | 15.67 | 13.85 | 32.63 | 10.43 | 25.00 |
|
94 |
+
| internlm2-7b-turbomind | 61.90 | 20.69 | 57.33 | 20.77 | 61.05 | 40.21 | 47.50 |
|
95 |
+
| internlm2-20b-turbomind | 72.38 | 37.93 | 62.33 | 19.23 | 74.74 | 38.51 | 48.75 |
|
96 |
+
| qwen-1.8b-turbomind | 47.62 | 9.20 | 13.50 | 12.31 | 25.26 | 16.38 | 21.25 |
|
97 |
+
| qwen-7b-turbomind | 42.86 | 12.64 | 35.83 | 26.15 | 51.58 | 17.87 | 30.00 |
|
98 |
+
| qwen-14b-turbomind | 89.58 | 3.45 | 5.00 | 23.85 | 93.02 | 21.10 | 40.62 |
|
99 |
+
| qwen-72b-turbomind | 71.43 | 81.25 | 88.17 | 96.25 | 95.79 | 79.57 | 90.00 |
|
100 |
+
| qwen1.5-0.5b-hf | 40.95 | 22.99 | 21.67 | 21.54 | 38.95 | 17.02 | 22.50 |
|
101 |
+
| qwen1.5-1.8b-hf | 85.71 | 29.89 | 22.17 | 30.00 | 34.74 | 20.43 | 27.50 |
|
102 |
+
| qwen1.5-4b-hf | 88.57 | 35.63 | 41.00 | 67.69 | 64.21 | 41.28 | 68.75 |
|
103 |
+
| qwen1.5-7b-hf | 93.33 | 14.94 | 59.33 | 70.00 | 61.05 | 67.87 | 61.25 |
|
104 |
+
| qwen1.5-14b-hf | 94.29 | 16.09 | 59.67 | 76.92 | 90.53 | 59.57 | 77.50 |
|
105 |
+
| qwen1.5-32b-hf | 94.29 | 43.68 | 82.83 | 38.46 | 97.89 | 75.96 | 67.50 |
|
106 |
+
| qwen1.5-72b-hf | 99.05 | 28.74 | 85.62 | 77.69 | 94.74 | 72.77 | 87.50 |
|
107 |
+
| qwen1.5-moe-a2-7b-hf | 65.71 | 36.78 | 51.67 | 75.38 | 72.63 | 61.28 | 33.75 |
|
108 |
+
| mistral-7b-v0.1-hf | 17.14 | 8.05 | 28.33 | 6.92 | 24.21 | 30.43 | 12.50 |
|
109 |
+
| mistral-7b-v0.2-hf | 7.62 | 9.20 | 23.17 | 6.15 | 25.26 | 19.15 | 7.50 |
|
110 |
+
| mixtral-8x7b-v0.1-hf | 0.00 | 4.60 | 33.83 | 10.77 | 37.89 | 25.96 | 3.75 |
|
111 |
+
| mixtral-8x22b-v0.1-hf | 7.62 | 4.17 | 51.33 | 14.62 | 53.68 | 21.91 | 10.00 |
|
112 |
+
| yi-6b-hf | 17.14 | 52.87 | 50.83 | 36.25 | 36.84 | 48.09 | 36.25 |
|
113 |
+
| yi-34b-hf | 0.00 | 59.77 | 76.67 | 86.92 | 67.44 | 61.06 | 81.25 |
|
114 |
+
| deepseek-7b-base-hf | 20.95 | 2.30 | 17.83 | 12.31 | 25.26 | 12.55 | 8.75 |
|
115 |
+
| deepseek-67b-base-hf | 1.90 | 9.20 | 27.33 | 30.00 | 40.00 | 13.19 | 3.75 |
|
116 |
+
|
117 |
+
## Chat Models
|
118 |
+
|
119 |
+
| model | GaokaoBench |
|
120 |
+
|:-----------------------------:|--------------:|
|
121 |
+
| qwen1.5-0.5b-chat-hf | 21.51 |
|
122 |
+
| qwen1.5-1.8b-chat-hf | 46.19 |
|
123 |
+
| qwen1.5-4b-chat-hf | 59.11 |
|
124 |
+
| qwen1.5-7b-chat-hf | 70.55 |
|
125 |
+
| qwen1.5-14b-chat-hf | 80.39 |
|
126 |
+
| qwen1.5-32b-chat-hf | 86.15 |
|
127 |
+
| qwen1.5-72b-chat-hf | 88.58 |
|
128 |
+
| qwen1.5-110b-chat-hf | 89.59 |
|
129 |
+
| internlm2-chat-1.8b-hf | 29.73 |
|
130 |
+
| internlm2-chat-1.8b-sft-hf | 28.79 |
|
131 |
+
| internlm2-chat-7b-hf | 54.54 |
|
132 |
+
| internlm2-chat-7b-sft-hf | 55.39 |
|
133 |
+
| internlm2-chat-20b-hf | 57.95 |
|
134 |
+
| internlm2-chat-20b-sft-hf | 57.62 |
|
135 |
+
| llama-3-8b-instruct-hf | 45.48 |
|
136 |
+
| llama-3-70b-instruct-hf | 65.91 |
|
137 |
+
| llama-3-8b-instruct-lmdeploy | 44.48 |
|
138 |
+
| llama-3-70b-instruct-lmdeploy | 67.06 |
|
139 |
+
| mistral-7b-instruct-v0.1-hf | 26.21 |
|
140 |
+
| mistral-7b-instruct-v0.2-hf | 32.17 |
|
141 |
+
| mixtral-8x7b-instruct-v0.1-hf | 42.46 |
|
142 |
+
|
143 |
+
### Details
|
144 |
+
|
145 |
+
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|
146 |
+
|:-----------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
|
147 |
+
| qwen1.5-0.5b-chat-hf | 25.23 | 25.70 | 39.02 | 24.67 | 25.00 | 0.78 | 25.00 |
|
148 |
+
| qwen1.5-1.8b-chat-hf | 30.28 | 26.64 | 61.32 | 55.33 | 77.81 | 11.72 | 40.32 |
|
149 |
+
| qwen1.5-4b-chat-hf | 38.53 | 35.05 | 70.73 | 70.00 | 83.44 | 25.00 | 41.13 |
|
150 |
+
| qwen1.5-7b-chat-hf | 49.54 | 39.72 | 81.88 | 82.67 | 90.62 | 46.88 | 61.29 |
|
151 |
+
| qwen1.5-14b-chat-hf | 64.68 | 54.21 | 87.80 | 90.67 | 94.69 | 44.53 | 69.35 |
|
152 |
+
| qwen1.5-32b-chat-hf | 70.92 | 66.14 | 98.02 | 97.74 | 96.07 | 57.81 | 72.92 |
|
153 |
+
| qwen1.5-72b-chat-hf | 76.61 | 68.22 | 95.47 | 96.00 | 97.19 | 64.06 | 86.29 |
|
154 |
+
| qwen1.5-110b-chat-hf | 80.36 | 66.67 | 100.00 | 100.00 | 96.25 | 65.62 | 75.00 |
|
155 |
+
| internlm2-chat-1.8b-hf | 28.44 | 28.50 | 46.69 | 39.33 | 44.38 | 10.16 | 26.61 |
|
156 |
+
| internlm2-chat-1.8b-sft-hf | 23.85 | 20.09 | 55.75 | 40.67 | 53.12 | 14.84 | 30.65 |
|
157 |
+
| internlm2-chat-7b-hf | 45.87 | 42.52 | 77.70 | 75.33 | 76.56 | 16.41 | 38.71 |
|
158 |
+
| internlm2-chat-7b-sft-hf | 49.08 | 39.72 | 80.84 | 68.67 | 81.25 | 29.69 | 42.74 |
|
159 |
+
| internlm2-chat-20b-hf | 53.21 | 46.73 | 80.49 | 74.00 | 85.00 | 31.25 | 37.10 |
|
160 |
+
| internlm2-chat-20b-sft-hf | 51.83 | 47.20 | 86.06 | 78.00 | 88.12 | 35.16 | 45.16 |
|
161 |
+
| llama-3-8b-instruct-hf | 37.16 | 31.31 | 60.98 | 48.67 | 51.25 | 11.72 | 39.52 |
|
162 |
+
| llama-3-70b-instruct-hf | 58.26 | 52.34 | 63.76 | 75.33 | 75.31 | 36.72 | 53.23 |
|
163 |
+
| llama-3-8b-instruct-lmdeploy | 37.61 | 35.51 | 55.05 | 53.33 | 52.19 | 7.81 | 34.68 |
|
164 |
+
| llama-3-70b-instruct-lmdeploy | 75.00 | 55.56 | 61.11 | 73.68 | 70.00 | 40.62 | 43.75 |
|
165 |
+
| mistral-7b-instruct-v0.1-hf | 23.39 | 21.03 | 35.19 | 18.00 | 26.56 | 5.47 | 30.65 |
|
166 |
+
| mistral-7b-instruct-v0.2-hf | 31.19 | 19.63 | 38.33 | 40.00 | 35.94 | 20.31 | 34.68 |
|
167 |
+
| mixtral-8x7b-instruct-v0.1-hf | 41.28 | 37.85 | 52.26 | 47.33 | 50.00 | 25.78 | 43.55 |
|
168 |
+
|
169 |
+
| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|
170 |
+
|:-----------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
|
171 |
+
| qwen1.5-0.5b-chat-hf | 32.38 | 10.34 | 0.00 | 2.31 | 27.37 | 15.11 | 18.75 |
|
172 |
+
| qwen1.5-1.8b-chat-hf | 69.52 | 42.53 | 56.33 | 2.31 | 61.05 | 32.98 | 35.00 |
|
173 |
+
| qwen1.5-4b-chat-hf | 70.48 | 58.62 | 82.33 | 16.15 | 68.42 | 68.51 | 47.50 |
|
174 |
+
| qwen1.5-7b-chat-hf | 83.81 | 71.26 | 85.17 | 57.69 | 81.05 | 78.94 | 66.25 |
|
175 |
+
| qwen1.5-14b-chat-hf | 93.33 | 78.16 | 97.17 | 71.54 | 91.58 | 94.26 | 81.25 |
|
176 |
+
| qwen1.5-32b-chat-hf | 100.00 | 81.61 | 95.83 | 90.00 | 97.89 | 92.43 | 92.86 |
|
177 |
+
| qwen1.5-72b-chat-hf | 98.10 | 83.91 | 98.00 | 90.77 | 94.74 | 96.38 | 96.25 |
|
178 |
+
| qwen1.5-110b-chat-hf | 100.00 | 91.95 | 98.50 | 97.69 | 95.35 | 98.44 | 100.00 |
|
179 |
+
| internlm2-chat-1.8b-hf | 38.10 | 6.90 | 0.67 | 1.54 | 56.84 | 23.19 | 30.00 |
|
180 |
+
| internlm2-chat-1.8b-sft-hf | 50.48 | 0.00 | 0.00 | 0.00 | 27.37 | 11.91 | 32.50 |
|
181 |
+
| internlm2-chat-7b-hf | 60.95 | 67.82 | 7.00 | 7.69 | 70.53 | 79.79 | 38.75 |
|
182 |
+
| internlm2-chat-7b-sft-hf | 60.00 | 71.26 | 6.50 | 0.77 | 68.42 | 77.02 | 42.50 |
|
183 |
+
| internlm2-chat-20b-hf | 60.95 | 43.68 | 34.83 | 4.62 | 71.58 | 62.55 | 43.75 |
|
184 |
+
| internlm2-chat-20b-sft-hf | 75.24 | 47.13 | 1.00 | 2.31 | 80.00 | 65.96 | 37.50 |
|
185 |
+
| llama-3-8b-instruct-hf | 50.48 | 36.78 | 30.83 | 21.54 | 57.89 | 81.70 | 28.75 |
|
186 |
+
| llama-3-70b-instruct-hf | 73.33 | 59.77 | 82.83 | 24.62 | 73.68 | 91.28 | 45.00 |
|
187 |
+
| llama-3-8b-instruct-lmdeploy | 52.38 | 42.53 | 21.33 | 18.46 | 58.95 | 81.28 | 26.25 |
|
188 |
+
| llama-3-70b-instruct-lmdeploy | 87.50 | 62.07 | 84.38 | 26.92 | 72.63 | 91.20 | 56.25 |
|
189 |
+
| mistral-7b-instruct-v0.1-hf | 38.10 | 18.39 | 30.50 | 6.15 | 31.58 | 38.72 | 18.75 |
|
190 |
+
| mistral-7b-instruct-v0.2-hf | 41.90 | 31.03 | 28.00 | 20.77 | 29.47 | 42.13 | 15.00 |
|
191 |
+
| mixtral-8x7b-instruct-v0.1-hf | 49.52 | 39.08 | 41.33 | 9.23 | 44.21 | 43.19 | 21.25 |
|
opencompass/configs/datasets/XLSum/XLSum_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
# Entry-point config for the XLSum summarization benchmark.
# Selects the default prompt variant by re-exporting the dataset list from
# the hash-suffixed config file via mmengine's read_base mechanism.
from mmengine.config import read_base

# NOTE: read_base() parses these relative imports to compose configs — the
# import statement itself is the configuration, so it must stay literal.
with read_base():
    from .XLSum_gen_2bb71c import XLSum_datasets  # noqa: F401, F403
|
opencompass/configs/datasets/XLSum/XLSum_gen_2bb71c.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# XLSum (csebuetnlp/xlsum) zero-shot summarization config: generate a summary
# from the article text and score it with ROUGE.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import RougeEvaluator
from opencompass.datasets import XLSUMDataset, Xsum_postprocess

# Dataset fields: the model reads 'text'; 'summary' is the gold reference.
XLSum_reader_cfg = dict(input_columns=['text'], output_column='summary')

# Inference: zero-shot (ZeroRetriever → no in-context examples), free-form
# generation from a single document-plus-instruction prompt.
XLSum_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        # Implicit string concatenation joins the two lines into one prompt.
        template='Document:{text}\n'
        'Based on the previous text, provide a brief single summary:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# Evaluation: ROUGE against the reference summary. The XSum post-processor is
# reused here — presumably it applies the same first-line/whitespace cleanup
# to predictions; TODO(review) confirm it suits XLSum output as well.
XLSum_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_postprocessor=dict(type=Xsum_postprocess),
)

# Exported dataset list consumed by the OpenCompass runner.
XLSum_datasets = [
    dict(
        type=XLSUMDataset,
        path='csebuetnlp/xlsum',  # Hugging Face Hub dataset id
        reader_cfg=XLSum_reader_cfg,
        infer_cfg=XLSum_infer_cfg,
        eval_cfg=XLSum_eval_cfg)
]
|
opencompass/configs/datasets/bbh/README.md
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# BBH
|
2 |
+
|
3 |
+
```bash
|
4 |
+
python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
|
5 |
+
python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
|
6 |
+
```
|
7 |
+
|
8 |
+
## Base Models
|
9 |
+
|
10 |
+
| model | bbh |
|
11 |
+
|:------------------------:|------:|
|
12 |
+
| llama-7b-turbomind | 33.34 |
|
13 |
+
| llama-13b-turbomind | 37.99 |
|
14 |
+
| llama-30b-turbomind | 49.86 |
|
15 |
+
| llama-65b-turbomind | 58.26 |
|
16 |
+
| llama-2-7b-turbomind | 38.27 |
|
17 |
+
| llama-2-13b-turbomind | 45.68 |
|
18 |
+
| llama-2-70b-turbomind | 64.78 |
|
19 |
+
| llama-3-8b-turbomind | 59.69 |
|
20 |
+
| llama-3-70b-turbomind | 79.16 |
|
21 |
+
| internlm2-1.8b-turbomind | 36.03 |
|
22 |
+
| internlm2-7b-turbomind | 63.56 |
|
23 |
+
| internlm2-20b-turbomind | 71.29 |
|
24 |
+
| qwen-1.8b-turbomind | 22.53 |
|
25 |
+
| qwen-7b-turbomind | 45.89 |
|
26 |
+
| qwen-14b-turbomind | 56.75 |
|
27 |
+
| qwen-72b-turbomind | 63.35 |
|
28 |
+
| qwen1.5-0.5b-hf | 20.54 |
|
29 |
+
| qwen1.5-1.8b-hf | 27.01 |
|
30 |
+
| qwen1.5-4b-hf | 34.81 |
|
31 |
+
| qwen1.5-7b-hf | 39.87 |
|
32 |
+
| qwen1.5-14b-hf | 50.38 |
|
33 |
+
| qwen1.5-32b-hf | 67.47 |
|
34 |
+
| qwen1.5-72b-hf | 58.81 |
|
35 |
+
| qwen1.5-moe-a2-7b-hf | 39.46 |
|
36 |
+
| mistral-7b-v0.1-hf | 56.71 |
|
37 |
+
| mistral-7b-v0.2-hf | 57.32 |
|
38 |
+
| mixtral-8x7b-v0.1-hf | 68.46 |
|
39 |
+
| mixtral-8x22b-v0.1-hf | 79.48 |
|
40 |
+
| yi-6b-hf | 44.82 |
|
41 |
+
| yi-34b-hf | 66.37 |
|
42 |
+
| deepseek-7b-base-hf | 42.88 |
|
43 |
+
| deepseek-67b-base-hf | 71.86 |
|
44 |
+
|
45 |
+
### Details
|
46 |
+
|
47 |
+
| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|
48 |
+
|:------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
|
49 |
+
| llama-7b-turbomind | 23.60 | 46.00 | 44.80 | 36.40 | 30.14 | 0.00 | 46.07 | 21.60 | 15.20 |
|
50 |
+
| llama-13b-turbomind | 16.80 | 50.00 | 56.80 | 36.40 | 43.15 | 0.00 | 60.67 | 29.20 | 15.20 |
|
51 |
+
| llama-30b-turbomind | 33.60 | 60.00 | 76.40 | 29.20 | 57.53 | 0.00 | 59.55 | 62.40 | 17.20 |
|
52 |
+
| llama-65b-turbomind | 84.00 | 76.00 | 84.40 | 50.00 | 65.75 | 0.00 | 62.92 | 69.60 | 31.60 |
|
53 |
+
| llama-2-7b-turbomind | 12.00 | 46.80 | 60.00 | 34.00 | 32.19 | 0.00 | 49.44 | 32.80 | 18.40 |
|
54 |
+
| llama-2-13b-turbomind | 24.00 | 40.80 | 73.20 | 36.00 | 45.89 | 0.00 | 55.06 | 37.60 | 22.40 |
|
55 |
+
| llama-2-70b-turbomind | 75.60 | 66.80 | 88.80 | 73.60 | 69.86 | 0.00 | 73.60 | 60.80 | 57.60 |
|
56 |
+
| llama-3-8b-turbomind | 65.60 | 42.00 | 78.80 | 56.80 | 69.86 | 0.00 | 56.18 | 66.00 | 30.80 |
|
57 |
+
| llama-3-70b-turbomind | 100.00 | 82.80 | 91.60 | 100.00 | 86.30 | 0.00 | 81.46 | 77.20 | 94.40 |
|
58 |
+
| internlm2-1.8b-turbomind | 31.20 | 44.00 | 60.00 | 36.00 | 35.62 | 0.00 | 44.94 | 27.20 | 12.80 |
|
59 |
+
| internlm2-7b-turbomind | 94.80 | 75.60 | 86.40 | 53.60 | 69.18 | 0.00 | 59.55 | 68.00 | 46.00 |
|
60 |
+
| internlm2-20b-turbomind | 98.40 | 83.60 | 84.00 | 72.00 | 71.92 | 0.00 | 81.46 | 78.40 | 74.40 |
|
61 |
+
| qwen-1.8b-turbomind | 26.40 | 39.60 | 33.20 | 28.40 | 28.08 | 0.00 | 44.94 | 21.60 | 12.40 |
|
62 |
+
| qwen-7b-turbomind | 38.80 | 42.80 | 64.40 | 30.80 | 45.89 | 0.00 | 55.62 | 44.00 | 14.40 |
|
63 |
+
| qwen-14b-turbomind | 57.60 | 59.20 | 67.20 | 46.40 | 67.12 | 0.00 | 51.12 | 63.60 | 30.40 |
|
64 |
+
| qwen-72b-turbomind | 72.00 | 66.80 | 77.60 | 81.20 | 84.93 | 0.00 | 78.09 | 67.20 | 63.60 |
|
65 |
+
| qwen1.5-0.5b-hf | 15.20 | 37.20 | 20.40 | 30.40 | 18.49 | 8.40 | 44.94 | 11.20 | 14.00 |
|
66 |
+
| qwen1.5-1.8b-hf | 27.60 | 40.80 | 36.00 | 24.40 | 32.19 | 0.00 | 50.56 | 20.80 | 11.20 |
|
67 |
+
| qwen1.5-4b-hf | 10.40 | 44.40 | 47.20 | 36.80 | 44.52 | 24.80 | 46.63 | 20.80 | 14.80 |
|
68 |
+
| qwen1.5-7b-hf | 37.20 | 42.40 | 52.00 | 52.40 | 56.85 | 6.80 | 48.31 | 23.60 | 18.40 |
|
69 |
+
| qwen1.5-14b-hf | 38.80 | 62.80 | 73.60 | 24.80 | 69.86 | 26.80 | 66.29 | 52.80 | 2.00 |
|
70 |
+
| qwen1.5-32b-hf | 93.60 | 77.20 | 68.40 | 70.00 | 82.88 | 36.80 | 47.75 | 70.40 | 71.20 |
|
71 |
+
| qwen1.5-72b-hf | 75.60 | 66.00 | 78.80 | 72.80 | 80.82 | 0.00 | 75.84 | 64.80 | 44.40 |
|
72 |
+
| qwen1.5-moe-a2-7b-hf | 23.20 | 59.60 | 43.20 | 27.60 | 46.58 | 25.20 | 48.88 | 16.80 | 13.20 |
|
73 |
+
| mistral-7b-v0.1-hf | 73.60 | 53.60 | 76.40 | 45.20 | 56.85 | 28.00 | 64.04 | 66.00 | 21.60 |
|
74 |
+
| mistral-7b-v0.2-hf | 76.80 | 42.00 | 73.20 | 47.20 | 60.27 | 26.00 | 66.85 | 60.80 | 26.40 |
|
75 |
+
| mixtral-8x7b-v0.1-hf | 89.60 | 70.80 | 84.80 | 81.20 | 70.55 | 25.60 | 66.29 | 71.20 | 58.80 |
|
76 |
+
| mixtral-8x22b-v0.1-hf | 98.80 | 77.60 | 92.00 | 98.80 | 83.56 | 35.60 | 80.34 | 79.20 | 82.00 |
|
77 |
+
| yi-6b-hf | 32.80 | 46.40 | 64.40 | 34.40 | 47.26 | 28.80 | 60.11 | 45.60 | 14.00 |
|
78 |
+
| yi-34b-hf | 86.00 | 76.00 | 84.80 | 54.80 | 67.81 | 24.80 | 73.60 | 66.00 | 65.60 |
|
79 |
+
| deepseek-7b-base-hf | 27.60 | 42.00 | 64.40 | 31.20 | 40.41 | 33.60 | 52.25 | 46.00 | 13.20 |
|
80 |
+
| deepseek-67b-base-hf | 95.60 | 75.60 | 86.40 | 86.40 | 76.71 | 39.20 | 76.40 | 77.20 | 82.00 |
|
81 |
+
|
82 |
+
| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|
83 |
+
|:------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
|
84 |
+
| llama-7b-turbomind | 18.40 | 42.80 | 58.00 | 23.20 | 13.20 | 40.00 | 16.40 | 30.40 | 0.00 |
|
85 |
+
| llama-13b-turbomind | 16.00 | 48.80 | 53.60 | 30.40 | 16.40 | 61.60 | 11.20 | 44.80 | 0.80 |
|
86 |
+
| llama-30b-turbomind | 22.40 | 66.40 | 73.20 | 43.60 | 31.60 | 84.40 | 43.60 | 57.60 | 2.80 |
|
87 |
+
| llama-65b-turbomind | 41.60 | 79.20 | 74.40 | 48.40 | 39.20 | 91.20 | 40.40 | 67.20 | 20.00 |
|
88 |
+
| llama-2-7b-turbomind | 17.20 | 54.80 | 51.60 | 32.80 | 23.60 | 74.40 | 19.60 | 45.60 | 1.20 |
|
89 |
+
| llama-2-13b-turbomind | 23.20 | 63.60 | 52.40 | 46.00 | 42.00 | 68.00 | 21.60 | 62.00 | 2.00 |
|
90 |
+
| llama-2-70b-turbomind | 72.40 | 86.40 | 84.40 | 55.20 | 43.20 | 95.60 | 50.80 | 76.80 | 20.80 |
|
91 |
+
| llama-3-8b-turbomind | 40.80 | 76.40 | 93.20 | 45.20 | 36.80 | 88.80 | 53.60 | 72.80 | 30.80 |
|
92 |
+
| llama-3-70b-turbomind | 99.20 | 94.00 | 98.00 | 58.40 | 42.80 | 93.60 | 63.60 | 88.40 | 79.20 |
|
93 |
+
| internlm2-1.8b-turbomind | 16.80 | 47.60 | 63.60 | 21.60 | 12.00 | 69.20 | 16.80 | 45.20 | 5.60 |
|
94 |
+
| internlm2-7b-turbomind | 51.20 | 78.80 | 90.40 | 52.00 | 41.20 | 95.60 | 58.80 | 74.40 | 44.40 |
|
95 |
+
| internlm2-20b-turbomind | 81.20 | 95.60 | 83.60 | 62.40 | 48.00 | 94.80 | 57.60 | 75.60 | 72.80 |
|
96 |
+
| qwen-1.8b-turbomind | 14.80 | 35.60 | 51.20 | 22.40 | 15.20 | 31.20 | 12.40 | 22.00 | 3.20 |
|
97 |
+
| qwen-7b-turbomind | 20.80 | 54.80 | 76.00 | 37.60 | 27.60 | 74.80 | 41.20 | 57.60 | 23.60 |
|
98 |
+
| qwen-14b-turbomind | 35.60 | 81.20 | 78.40 | 45.20 | 40.80 | 80.00 | 44.80 | 70.40 | 65.60 |
|
99 |
+
| qwen-72b-turbomind | 66.40 | 89.20 | 90.40 | 60.00 | 50.80 | 81.60 | 56.40 | 88.00 | 70.40 |
|
100 |
+
| qwen1.5-0.5b-hf | 20.00 | 34.80 | 46.80 | 18.80 | 15.60 | 24.40 | 15.20 | 16.00 | 1.20 |
|
101 |
+
| qwen1.5-1.8b-hf | 18.00 | 32.80 | 66.00 | 18.80 | 11.20 | 24.80 | 13.60 | 27.60 | 4.80 |
|
102 |
+
| qwen1.5-4b-hf | 18.40 | 56.40 | 56.80 | 30.00 | 20.80 | 40.80 | 46.80 | 44.80 | 41.20 |
|
103 |
+
| qwen1.5-7b-hf | 32.40 | 58.40 | 67.20 | 36.00 | 28.00 | 62.80 | 49.20 | 60.40 | 48.00 |
|
104 |
+
| qwen1.5-14b-hf | 7.20 | 78.40 | 75.20 | 41.20 | 27.60 | 74.40 | 46.00 | 81.60 | 8.00 |
|
105 |
+
| qwen1.5-32b-hf | 71.60 | 88.40 | 97.60 | 58.80 | 46.40 | 68.00 | 51.60 | 88.40 | 66.80 |
|
106 |
+
| qwen1.5-72b-hf | 61.20 | 88.40 | 96.00 | 60.40 | 49.20 | 86.40 | 34.80 | 86.80 | 53.60 |
|
107 |
+
| qwen1.5-moe-a2-7b-hf | 22.80 | 49.20 | 68.00 | 28.40 | 22.40 | 58.40 | 40.80 | 42.00 | 33.60 |
|
108 |
+
| mistral-7b-v0.1-hf | 30.40 | 79.60 | 70.80 | 54.40 | 42.80 | 77.60 | 47.20 | 70.00 | 30.40 |
|
109 |
+
| mistral-7b-v0.2-hf | 32.80 | 74.00 | 77.60 | 48.00 | 40.40 | 84.00 | 49.20 | 76.00 | 35.20 |
|
110 |
+
| mixtral-8x7b-v0.1-hf | 66.80 | 86.00 | 94.80 | 50.40 | 40.40 | 86.40 | 53.20 | 82.80 | 60.80 |
|
111 |
+
| mixtral-8x22b-v0.1-hf | 87.60 | 95.20 | 99.60 | 70.00 | 54.00 | 95.20 | 58.40 | 95.20 | 82.00 |
|
112 |
+
| yi-6b-hf | 17.20 | 49.20 | 72.40 | 34.40 | 28.00 | 76.80 | 32.40 | 56.80 | 9.20 |
|
113 |
+
| yi-34b-hf | 67.20 | 85.60 | 79.60 | 49.20 | 39.60 | 86.80 | 56.00 | 81.20 | 33.20 |
|
114 |
+
| deepseek-7b-base-hf | 17.60 | 51.20 | 72.40 | 28.80 | 20.00 | 78.40 | 28.80 | 46.80 | 1.60 |
|
115 |
+
| deepseek-67b-base-hf | 82.40 | 90.00 | 78.80 | 60.40 | 44.80 | 88.80 | 56.80 | 86.40 | 38.00 |
|
116 |
+
|
117 |
+
| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|
118 |
+
|:------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
|
119 |
+
| llama-7b-turbomind | 45.20 | 1.60 | 8.40 | 81.60 | 66.00 | 47.20 | 46.00 | 40.64 | 57.20 |
|
120 |
+
| llama-13b-turbomind | 59.20 | 0.80 | 14.40 | 76.40 | 69.20 | 46.40 | 47.20 | 53.48 | 66.80 |
|
121 |
+
| llama-30b-turbomind | 64.80 | 2.40 | 17.20 | 93.60 | 78.40 | 71.20 | 43.20 | 55.61 | 98.40 |
|
122 |
+
| llama-65b-turbomind | 72.40 | 6.80 | 21.60 | 98.80 | 81.60 | 70.00 | 40.80 | 55.61 | 99.60 |
|
123 |
+
| llama-2-7b-turbomind | 54.40 | 1.20 | 10.80 | 88.80 | 68.40 | 49.20 | 48.40 | 52.41 | 53.20 |
|
124 |
+
| llama-2-13b-turbomind | 74.40 | 2.80 | 18.80 | 97.60 | 74.40 | 52.80 | 46.40 | 54.55 | 96.00 |
|
125 |
+
| llama-2-70b-turbomind | 82.40 | 13.60 | 30.40 | 98.40 | 81.60 | 83.20 | 43.60 | 63.64 | 100.00 |
|
126 |
+
| llama-3-8b-turbomind | 90.00 | 9.20 | 38.80 | 95.20 | 87.60 | 84.80 | 51.20 | 50.27 | 100.00 |
|
127 |
+
| llama-3-70b-turbomind | 96.80 | 48.40 | 48.80 | 99.60 | 92.40 | 99.60 | 62.40 | 58.29 | 100.00 |
|
128 |
+
| internlm2-1.8b-turbomind | 64.40 | 0.40 | 3.20 | 66.40 | 54.00 | 50.00 | 49.20 | 48.13 | 46.80 |
|
129 |
+
| internlm2-7b-turbomind | 78.80 | 2.40 | 35.20 | 95.60 | 85.60 | 75.60 | 48.00 | 63.10 | 92.00 |
|
130 |
+
| internlm2-20b-turbomind | 88.80 | 15.60 | 36.00 | 96.80 | 88.80 | 76.00 | 50.40 | 56.68 | 100.00 |
|
131 |
+
| qwen-1.8b-turbomind | 50.00 | 0.00 | 0.80 | 62.80 | 29.20 | 2.40 | 6.00 | 12.83 | 1.60 |
|
132 |
+
| qwen-7b-turbomind | 62.80 | 1.60 | 18.00 | 81.60 | 75.20 | 68.80 | 50.00 | 63.64 | 66.80 |
|
133 |
+
| qwen-14b-turbomind | 75.60 | 1.20 | 26.80 | 88.80 | 80.40 | 74.40 | 50.00 | 53.48 | 96.80 |
|
134 |
+
| qwen-72b-turbomind | 56.00 | 14.40 | 35.20 | 87.60 | 91.60 | 81.60 | 5.60 | 31.55 | 62.40 |
|
135 |
+
| qwen1.5-0.5b-hf | 25.60 | 0.00 | 0.40 | 41.60 | 51.60 | 16.80 | 4.40 | 1.07 | 20.00 |
|
136 |
+
| qwen1.5-1.8b-hf | 55.60 | 0.00 | 1.60 | 63.60 | 55.20 | 47.60 | 4.40 | 28.88 | 11.20 |
|
137 |
+
| qwen1.5-4b-hf | 61.60 | 0.40 | 8.80 | 0.80 | 76.00 | 54.40 | 0.80 | 28.34 | 62.40 |
|
138 |
+
| qwen1.5-7b-hf | 63.60 | 2.40 | 20.80 | 72.40 | 69.60 | 26.80 | 0.00 | 40.64 | 0.00 |
|
139 |
+
| qwen1.5-14b-hf | 82.40 | 1.20 | 27.60 | 78.40 | 87.20 | 48.00 | 54.00 | 24.06 | 100.00 |
|
140 |
+
| qwen1.5-32b-hf | 86.80 | 5.60 | 36.80 | 90.00 | 86.40 | 66.40 | 35.60 | 62.57 | 95.60 |
|
141 |
+
| qwen1.5-72b-hf | 48.40 | 13.20 | 34.40 | 87.60 | 8.00 | 67.60 | 13.60 | 39.57 | 99.60 |
|
142 |
+
| qwen1.5-moe-a2-7b-hf | 56.80 | 2.00 | 8.80 | 79.60 | 73.60 | 66.80 | 4.00 | 53.48 | 50.40 |
|
143 |
+
| mistral-7b-v0.1-hf | 73.60 | 4.00 | 26.40 | 97.20 | 82.00 | 67.60 | 43.20 | 48.66 | 100.00 |
|
144 |
+
| mistral-7b-v0.2-hf | 72.80 | 4.00 | 30.40 | 97.20 | 81.20 | 66.80 | 46.00 | 52.41 | 100.00 |
| mixtral-8x7b-v0.1-hf | 85.60 | 18.80 | 33.60 | 98.00 | 90.80 | 85.20 | 49.60 | 55.61 | 90.80 |
| mixtral-8x22b-v0.1-hf | 92.80 | 51.60 | 40.00 | 98.40 | 91.60 | 95.60 | 54.80 | 56.15 | 100.00 |
| yi-6b-hf | 66.40 | 1.20 | 16.00 | 92.80 | 59.60 | 53.20 | 53.20 | 52.41 | 65.20 |
| yi-34b-hf | 81.20 | 18.80 | 36.40 | 97.60 | 85.60 | 84.00 | 51.20 | 59.89 | 99.60 |
| deepseek-7b-base-hf | 59.20 | 3.20 | 6.40 | 92.00 | 73.20 | 49.60 | 50.80 | 52.41 | 74.80 |
| deepseek-67b-base-hf | 85.20 | 30.00 | 33.20 | 99.60 | 84.80 | 82.40 | 46.80 | 56.68 | 99.60 |

## Chat Models

| model | bbh |
|:-----------------------------:|------:|
| qwen1.5-0.5b-chat-hf | 24.12 |
| qwen1.5-1.8b-chat-hf | 26.82 |
| qwen1.5-4b-chat-hf | 43.15 |
| qwen1.5-7b-chat-hf | 38.12 |
| qwen1.5-14b-chat-hf | 55.38 |
| qwen1.5-32b-chat-hf | 69.28 |
| qwen1.5-72b-chat-hf | 72.97 |
| qwen1.5-110b-chat-hf | 71.04 |
| internlm2-chat-1.8b-hf | 37.69 |
| internlm2-chat-1.8b-sft-hf | 37.12 |
| internlm2-chat-7b-hf | 57.83 |
| internlm2-chat-7b-sft-hf | 57.19 |
| internlm2-chat-20b-hf | 68.24 |
| internlm2-chat-20b-sft-hf | 69.38 |
| llama-3-8b-instruct-hf | 52.85 |
| llama-3-70b-instruct-hf | 82.42 |
| llama-3-8b-instruct-lmdeploy | 53.54 |
| llama-3-70b-instruct-lmdeploy | 82.58 |
| mistral-7b-instruct-v0.1-hf | 32.88 |
| mistral-7b-instruct-v0.2-hf | 48.84 |
| mixtral-8x7b-instruct-v0.1-hf | 59.64 |

### Details

| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|:-----------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
| qwen1.5-0.5b-chat-hf | 25.60 | 42.00 | 20.00 | 31.20 | 15.07 | 14.40 | 46.07 | 24.80 | 13.20 |
| qwen1.5-1.8b-chat-hf | 28.80 | 36.00 | 30.40 | 35.20 | 19.18 | 7.60 | 46.63 | 24.00 | 9.60 |
| qwen1.5-4b-chat-hf | 8.00 | 56.00 | 64.80 | 28.40 | 48.63 | 19.60 | 60.67 | 34.00 | 14.40 |
| qwen1.5-7b-chat-hf | 39.60 | 37.60 | 62.40 | 36.80 | 60.96 | 30.80 | 54.49 | 38.00 | 20.00 |
| qwen1.5-14b-chat-hf | 61.60 | 63.60 | 70.00 | 54.00 | 74.66 | 33.60 | 67.42 | 61.20 | 35.60 |
| qwen1.5-32b-chat-hf | 94.40 | 77.60 | 78.00 | 66.00 | 93.84 | 46.00 | 82.58 | 73.60 | 61.60 |
| qwen1.5-72b-chat-hf | 70.40 | 72.40 | 84.40 | 67.20 | 89.73 | 52.00 | 79.21 | 86.40 | 68.80 |
| qwen1.5-110b-chat-hf | 74.80 | 71.20 | 82.80 | 74.80 | 89.04 | 48.00 | 90.45 | 87.60 | 73.60 |
| internlm2-chat-1.8b-hf | 35.60 | 52.40 | 48.80 | 29.60 | 39.73 | 24.40 | 51.69 | 27.20 | 13.20 |
| internlm2-chat-1.8b-sft-hf | 37.20 | 53.60 | 44.00 | 30.00 | 34.93 | 22.40 | 56.74 | 28.00 | 12.00 |
| internlm2-chat-7b-hf | 72.00 | 66.40 | 73.60 | 65.20 | 60.27 | 50.00 | 62.92 | 52.40 | 44.40 |
| internlm2-chat-7b-sft-hf | 67.20 | 66.80 | 58.00 | 63.20 | 48.63 | 45.60 | 64.04 | 59.60 | 42.80 |
| internlm2-chat-20b-hf | 80.40 | 76.00 | 77.60 | 88.80 | 78.08 | 36.40 | 71.91 | 71.60 | 77.20 |
| internlm2-chat-20b-sft-hf | 80.00 | 70.80 | 78.00 | 87.60 | 82.88 | 41.20 | 76.40 | 72.80 | 71.60 |
| llama-3-8b-instruct-hf | 70.40 | 42.80 | 28.40 | 81.20 | 13.01 | 49.20 | 44.94 | 73.20 | 42.40 |
| llama-3-70b-instruct-hf | 100.00 | 84.00 | 91.60 | 95.60 | 78.08 | 52.40 | 87.08 | 89.60 | 97.60 |
| llama-3-8b-instruct-lmdeploy | 73.20 | 45.60 | 34.00 | 79.60 | 31.51 | 48.40 | 47.75 | 76.80 | 47.60 |
| llama-3-70b-instruct-lmdeploy | 100.00 | 84.00 | 90.00 | 96.80 | 83.56 | 56.00 | 87.08 | 89.20 | 97.20 |
| mistral-7b-instruct-v0.1-hf | 32.00 | 22.40 | 52.40 | 35.20 | 30.82 | 23.20 | 38.76 | 46.00 | 18.40 |
| mistral-7b-instruct-v0.2-hf | 66.00 | 58.40 | 50.40 | 48.40 | 48.63 | 37.20 | 65.73 | 40.40 | 29.20 |
| mixtral-8x7b-instruct-v0.1-hf | 63.20 | 68.40 | 65.20 | 60.00 | 78.08 | 40.40 | 74.16 | 64.00 | 46.00 |

| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|:-----------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
| qwen1.5-0.5b-chat-hf | 20.40 | 34.40 | 51.60 | 21.20 | 13.20 | 26.00 | 20.80 | 17.20 | 1.20 |
| qwen1.5-1.8b-chat-hf | 18.00 | 34.80 | 48.40 | 21.20 | 16.40 | 34.80 | 24.00 | 28.80 | 4.40 |
| qwen1.5-4b-chat-hf | 19.20 | 56.80 | 65.20 | 36.40 | 35.60 | 51.60 | 40.40 | 55.20 | 29.20 |
| qwen1.5-7b-chat-hf | 31.60 | 58.80 | 53.20 | 35.60 | 27.20 | 56.00 | 44.80 | 62.00 | 50.00 |
| qwen1.5-14b-chat-hf | 43.20 | 75.20 | 52.80 | 52.40 | 50.80 | 76.40 | 48.80 | 83.60 | 65.20 |
| qwen1.5-32b-chat-hf | 68.40 | 84.00 | 81.20 | 57.20 | 46.00 | 78.80 | 54.40 | 86.00 | 86.00 |
| qwen1.5-72b-chat-hf | 76.80 | 94.40 | 85.20 | 62.80 | 54.00 | 78.40 | 63.60 | 86.40 | 82.80 |
| qwen1.5-110b-chat-hf | 79.20 | 91.60 | 88.80 | 61.20 | 50.00 | 82.40 | 59.60 | 88.80 | 78.00 |
| internlm2-chat-1.8b-hf | 20.00 | 48.40 | 56.00 | 24.40 | 26.80 | 65.20 | 18.00 | 39.60 | 7.60 |
| internlm2-chat-1.8b-sft-hf | 18.40 | 48.00 | 51.20 | 20.40 | 25.20 | 63.20 | 22.00 | 38.80 | 6.00 |
| internlm2-chat-7b-hf | 48.40 | 75.20 | 84.80 | 42.00 | 36.80 | 79.60 | 53.20 | 65.60 | 26.40 |
| internlm2-chat-7b-sft-hf | 44.00 | 72.40 | 85.60 | 41.60 | 37.20 | 82.40 | 55.60 | 52.80 | 32.00 |
| internlm2-chat-20b-hf | 88.00 | 88.80 | 88.80 | 52.80 | 50.40 | 85.20 | 56.80 | 79.60 | 40.00 |
| internlm2-chat-20b-sft-hf | 83.20 | 90.00 | 90.40 | 55.60 | 48.80 | 84.40 | 57.60 | 79.20 | 38.40 |
| llama-3-8b-instruct-hf | 49.60 | 85.60 | 76.00 | 54.00 | 29.20 | 57.60 | 46.00 | 44.80 | 52.00 |
| llama-3-70b-instruct-hf | 99.20 | 96.80 | 95.20 | 77.20 | 65.20 | 80.00 | 69.60 | 94.80 | 84.00 |
| llama-3-8b-instruct-lmdeploy | 57.20 | 78.00 | 75.60 | 36.00 | 13.20 | 59.20 | 53.60 | 54.80 | 52.80 |
| llama-3-70b-instruct-lmdeploy | 98.80 | 96.40 | 96.80 | 75.20 | 68.80 | 79.60 | 67.60 | 94.00 | 84.80 |
| mistral-7b-instruct-v0.1-hf | 26.00 | 46.00 | 60.00 | 38.00 | 24.00 | 59.20 | 1.20 | 6.00 | 12.40 |
| mistral-7b-instruct-v0.2-hf | 39.60 | 63.60 | 64.00 | 44.00 | 33.20 | 56.00 | 42.40 | 68.40 | 14.00 |
| mixtral-8x7b-instruct-v0.1-hf | 46.40 | 71.60 | 88.80 | 48.00 | 36.80 | 60.00 | 50.00 | 81.20 | 59.20 |

| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|:-----------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 45.60 | 0.00 | 1.20 | 17.20 | 50.40 | 16.40 | 11.60 | 42.78 | 27.60 |
| qwen1.5-1.8b-chat-hf | 58.40 | 0.00 | 2.00 | 34.00 | 44.80 | 30.40 | 11.60 | 24.60 | 50.00 |
| qwen1.5-4b-chat-hf | 64.00 | 3.20 | 6.80 | 80.40 | 77.60 | 48.80 | 41.20 | 55.61 | 63.20 |
| qwen1.5-7b-chat-hf | 54.40 | 0.40 | 8.00 | 55.60 | 47.60 | 31.20 | 0.00 | 2.14 | 30.00 |
| qwen1.5-14b-chat-hf | 74.40 | 6.40 | 26.40 | 72.40 | 76.40 | 61.60 | 0.80 | 25.67 | 81.20 |
| qwen1.5-32b-chat-hf | 90.00 | 10.40 | 28.40 | 82.40 | 92.80 | 76.80 | 32.40 | 41.71 | 100.00 |
| qwen1.5-72b-chat-hf | 81.20 | 18.40 | 37.60 | 95.20 | 92.80 | 76.00 | 50.40 | 63.64 | 100.00 |
| qwen1.5-110b-chat-hf | 91.60 | 18.00 | 39.60 | 82.80 | 80.80 | 75.20 | 22.40 | 35.83 | 100.00 |
| internlm2-chat-1.8b-hf | 63.20 | 0.00 | 6.00 | 58.00 | 56.80 | 48.80 | 54.80 | 52.94 | 48.40 |
| internlm2-chat-1.8b-sft-hf | 63.20 | 0.00 | 5.60 | 58.00 | 56.80 | 50.00 | 52.40 | 56.68 | 47.60 |
| internlm2-chat-7b-hf | 73.60 | 3.60 | 18.00 | 55.20 | 83.60 | 62.80 | 50.00 | 58.29 | 97.20 |
| internlm2-chat-7b-sft-hf | 71.60 | 4.40 | 20.00 | 82.00 | 84.00 | 60.00 | 51.60 | 52.94 | 98.00 |
| internlm2-chat-20b-hf | 82.40 | 8.00 | 36.00 | 55.60 | 84.40 | 78.00 | 50.40 | 59.36 | 100.00 |
| internlm2-chat-20b-sft-hf | 81.60 | 10.40 | 36.40 | 89.20 | 82.40 | 80.40 | 48.40 | 55.61 | 100.00 |
| llama-3-8b-instruct-hf | 82.80 | 8.80 | 37.20 | 94.40 | 78.80 | 89.60 | 45.20 | 24.06 | 25.60 |
| llama-3-70b-instruct-hf | 95.20 | 18.80 | 49.20 | 98.00 | 94.00 | 90.00 | 73.20 | 68.98 | 100.00 |
| llama-3-8b-instruct-lmdeploy | 83.60 | 10.00 | 40.40 | 96.00 | 77.20 | 89.20 | 43.60 | 37.43 | 3.20 |
| llama-3-70b-instruct-lmdeploy | 95.60 | 22.40 | 48.80 | 96.80 | 91.60 | 87.20 | 72.00 | 69.52 | 100.00 |
| mistral-7b-instruct-v0.1-hf | 70.80 | 0.80 | 5.20 | 68.80 | 69.60 | 51.60 | 3.20 | 12.30 | 33.60 |
| mistral-7b-instruct-v0.2-hf | 62.40 | 4.00 | 15.60 | 81.20 | 70.40 | 50.40 | 32.00 | 34.76 | 98.40 |
| mixtral-8x7b-instruct-v0.1-hf | 76.40 | 12.80 | 23.20 | 55.20 | 85.60 | 83.60 | 40.00 | 43.32 | 88.80 |
opencompass/configs/datasets/bbh/bbh_gen.py
ADDED
@@ -0,0 +1,4 @@
# Default BBH config: re-export the datasets from the 3-shot CoT variant
# (bbh_gen_5b92b0) so `from ...bbh_gen import bbh_datasets` picks it up.
from mmengine.config import read_base

with read_base():
    from .bbh_gen_5b92b0 import bbh_datasets  # noqa: F401, F403
opencompass/configs/datasets/bbh/bbh_gen_2879b0.py
ADDED
@@ -0,0 +1,56 @@
"""BBH few-shot config that splits each task's prompt library into explicit
HUMAN/BOT dialogue rounds (one round per worked example) instead of a single
concatenated prompt string."""
import os
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import BBHDataset, bbh_mcq_postprocess, BBHEvaluator, BBHEvaluator_mcq

with read_base():
    # (task_name, 'mcq' | 'free_form') pairs shared across BBH configs.
    from .bbh_subset_settings import settings

bbh_datasets = []
for name, test_type in settings:
    # Each task ships a few-shot prompt file: a task description followed by
    # "Q: ...\nA: ..." worked examples separated by blank lines.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{name}.txt'), 'r') as f:
        hint = f.read()

    # Split off the task description, then break the remainder into examples.
    task_prompt, body = hint.split('\n\nQ:', 1)
    sections = ('Q:' + body).split('\n\n')
    prompt_rounds = []
    for index, section in enumerate(sections):
        question, answer = section.split('\nA:')
        answer = 'A:' + answer
        # Prepend the task description only to the first example.
        if index == 0:
            desc = task_prompt.strip() + '\n'
        else:
            desc = ''
        prompt_rounds.append(dict(role='HUMAN', prompt=f'{desc}{question.strip()}'))
        prompt_rounds.append(dict(role='BOT', prompt=answer.strip()))
    # Final round carries the actual test question.
    prompt_rounds.append(dict(role='HUMAN', prompt='Q: {input}'))

    bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

    bbh_infer_cfg = dict(
        prompt_template=dict(type=PromptTemplate, template=dict(round=prompt_rounds)),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))

    # MCQ tasks get an option-letter postprocessor; free-form tasks use the
    # plain BBH evaluator.
    if test_type == 'mcq':
        bbh_eval_cfg = dict(
            evaluator=dict(type=BBHEvaluator_mcq),
            pred_role='BOT',
            pred_postprocessor=dict(type=bbh_mcq_postprocess),
            dataset_postprocessor=dict(type=bbh_mcq_postprocess))
    else:
        bbh_eval_cfg = dict(
            evaluator=dict(type=BBHEvaluator),
            pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=name,
            abbr='bbh-' + name,
            reader_cfg=bbh_reader_cfg.copy(),
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_4a31fa.py
ADDED
@@ -0,0 +1,99 @@
"""BBH 3-shot CoT config whose prompt additionally instructs the model to
prefix its final answer with "So the answer is" (eases answer extraction)."""
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Tasks whose answers are option letters (A/B/C/...).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# Tasks whose answers are free-form strings/numbers.
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    # Load the task's few-shot exemplars shipped alongside this config.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step. And you must give your final answer by starting with 'So the answer is' "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_5b92b0.py
ADDED
@@ -0,0 +1,99 @@
"""BBH 3-shot chain-of-thought config: each task prompt ends with
"Let's think step by step." and answers are scored by the BBH evaluators."""
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Tasks whose answers are option letters (A/B/C/...).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# Tasks whose answers are free-form strings/numbers.
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    # Load the task's few-shot exemplars shipped alongside this config.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_5bf00b.py
ADDED
@@ -0,0 +1,99 @@
"""BBH 3-shot direct-answer config: the prompt ends with a bare "A: " so the
model answers immediately, without a chain-of-thought cue."""
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Tasks whose answers are option letters (A/B/C/...).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# Tasks whose answers are free-form strings/numbers.
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    # Load the task's few-shot exemplars shipped alongside this config.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_gen_98fba6.py
ADDED
@@ -0,0 +1,90 @@
"""BBH 3-shot CoT config for base (non-chat) models: a plain string template
instead of dialogue rounds, with generation stopped at the next 'Q:'."""
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq

bbh_reader_cfg = dict(input_columns=['input'], output_column='target')

# Tasks whose answers are option letters (A/B/C/...).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# Tasks whose answers are free-form strings/numbers.
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    # Load the task's few-shot exemplars shipped alongside this config.
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
        ),
        retriever=dict(type=ZeroRetriever),
        # Stop as soon as the model starts hallucinating a new question.
        inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
    bbh_eval_cfg = dict(
        evaluator=dict(type=BBHEvaluator_mcq),
        pred_role='BOT',
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))


for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
opencompass/configs/datasets/bbh/bbh_subset_settings.py
ADDED
@@ -0,0 +1,29 @@
# BBH task registry shared by the subset configs: (task_name, answer_type)
# where answer_type is 'mcq' (option-letter answers) or 'free_form'.
settings = [
    ('temporal_sequences', 'mcq'),
    ('disambiguation_qa', 'mcq'),
    ('date_understanding', 'mcq'),
    ('tracking_shuffled_objects_three_objects', 'mcq'),
    ('penguins_in_a_table', 'mcq'),
    ('geometric_shapes', 'mcq'),
    ('snarks', 'mcq'),
    ('ruin_names', 'mcq'),
    ('tracking_shuffled_objects_seven_objects', 'mcq'),
    ('tracking_shuffled_objects_five_objects', 'mcq'),
    ('logical_deduction_three_objects', 'mcq'),
    ('hyperbaton', 'mcq'),
    ('logical_deduction_five_objects', 'mcq'),
    ('logical_deduction_seven_objects', 'mcq'),
    ('movie_recommendation', 'mcq'),
    ('salient_translation_error_detection', 'mcq'),
    ('reasoning_about_colored_objects', 'mcq'),
    ('multistep_arithmetic_two', 'free_form'),
    ('navigate', 'free_form'),
    ('dyck_languages', 'free_form'),
    ('word_sorting', 'free_form'),
    ('sports_understanding', 'free_form'),
    ('boolean_expressions', 'free_form'),
    ('object_counting', 'free_form'),
    ('formal_fallacies', 'free_form'),
    ('causal_judgement', 'free_form'),
    ('web_of_lies', 'free_form'),
]
opencompass/configs/datasets/cmmlu/cmmlu_0shot_cot_gen_305931.py
ADDED
@@ -0,0 +1,130 @@
"""CMMLU zero-shot chain-of-thought config: Chinese CoT prompt with an
'答案: $选项' answer line extracted by a regex postprocessor."""
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern

# Maps each CMMLU subset name to its Chinese subject label, used to build the
# per-subject instruction prefix.
cmmlu_subject_mapping = {
    'agronomy': '农学',
    'anatomy': '解剖学',
    'ancient_chinese': '古汉语',
    'arts': '艺术学',
    'astronomy': '天文学',
    'business_ethics': '商业伦理',
    'chinese_civil_service_exam': '中国公务员考试',
    'chinese_driving_rule': '中国驾驶规则',
    'chinese_food_culture': '中国饮食文化',
    'chinese_foreign_policy': '中国外交政策',
    'chinese_history': '中国历史',
    'chinese_literature': '中国文学',
    'chinese_teacher_qualification': '中国教师资格',
    'clinical_knowledge': '临床知识',
    'college_actuarial_science': '大学精算学',
    'college_education': '大学教育学',
    'college_engineering_hydrology': '大学工程水文学',
    'college_law': '大学法律',
    'college_mathematics': '大学数学',
    'college_medical_statistics': '大学医学统计',
    'college_medicine': '大学医学',
    'computer_science': '计算机科学',
    'computer_security': '计算机安全',
    'conceptual_physics': '概念物理学',
    'construction_project_management': '建设工程管理',
    'economics': '经济学',
    'education': '教育学',
    'electrical_engineering': '电气工程',
    'elementary_chinese': '小学语文',
    'elementary_commonsense': '小学常识',
    'elementary_information_and_technology': '小学信息技术',
    'elementary_mathematics': '初等数学',
    'ethnology': '民族学',
    'food_science': '食品科学',
    'genetics': '遗传学',
    'global_facts': '全球事实',
    'high_school_biology': '高中生物',
    'high_school_chemistry': '高中化学',
    'high_school_geography': '高中地理',
    'high_school_mathematics': '高中数学',
    'high_school_physics': '高中物理学',
    'high_school_politics': '高中政治',
    'human_sexuality': '人类性行为',
    'international_law': '国际法学',
    'journalism': '新闻学',
    'jurisprudence': '法理学',
    'legal_and_moral_basis': '法律与道德基础',
    'logical': '逻辑学',
    'machine_learning': '机器学习',
    'management': '管理学',
    'marketing': '市场营销',
    'marxist_theory': '马克思主义理论',
    'modern_chinese': '现代汉语',
    'nutrition': '营养学',
    'philosophy': '哲学',
    'professional_accounting': '专业会计',
    'professional_law': '专业法学',
    'professional_medicine': '专业医学',
    'professional_psychology': '专业心理学',
    'public_relations': '公共关系',
    'security_study': '安全研究',
    'sociology': '社会学',
    'sports_science': '体育学',
    'traditional_chinese_medicine': '中医中药',
    'virology': '病毒学',
    'world_history': '世界历史',
    'world_religions': '世界宗教'
}

# CoT query: the model must end with an '答案: $选项' line (option in A-D).
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一. 请在回答之前一步步思考.

{question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

cmmlu_all_sets = list(cmmlu_subject_mapping.keys())

cmmlu_datasets = []
for _name in cmmlu_all_sets:
    _ch_name = cmmlu_subject_mapping[_name]
    prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
    cmmlu_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    cmmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(
            type=match_answer_pattern,
            # answer_pattern=r'(?i)答案\s*:\s*([A-D])'
            answer_pattern=r'(?i)答案\s*:\s*[\W]*([A-D])[\W]*',
        )
    )
    cmmlu_datasets.append(
        dict(
            type=CMMLUDataset,
            path='opencompass/cmmlu',
            name=_name,
            abbr=f'cmmlu-{_name}',
            reader_cfg=dict(
                input_columns=['question', 'A', 'B', 'C', 'D'],
                output_column='answer',
                train_split='dev',
                test_split='test'),
            infer_cfg=cmmlu_infer_cfg,
            eval_cfg=cmmlu_eval_cfg,
        ))

# Avoid leaking loop variables into the config namespace.
del _name, _ch_name
opencompass/configs/datasets/cmmlu/cmmlu_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, F403
|
opencompass/configs/datasets/cmmlu/cmmlu_gen_c13365.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
|
5 |
+
from opencompass.datasets import CMMLUDataset
|
6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
7 |
+
|
8 |
+
|
9 |
+
cmmlu_subject_mapping = {
|
10 |
+
'agronomy': '农学',
|
11 |
+
'anatomy': '解剖学',
|
12 |
+
'ancient_chinese': '古汉语',
|
13 |
+
'arts': '艺术学',
|
14 |
+
'astronomy': '天文学',
|
15 |
+
'business_ethics': '商业伦理',
|
16 |
+
'chinese_civil_service_exam': '中国公务员考试',
|
17 |
+
'chinese_driving_rule': '中国驾驶规则',
|
18 |
+
'chinese_food_culture': '中国饮食文化',
|
19 |
+
'chinese_foreign_policy': '中国外交政策',
|
20 |
+
'chinese_history': '中国历史',
|
21 |
+
'chinese_literature': '中国文学',
|
22 |
+
'chinese_teacher_qualification': '中国教师资格',
|
23 |
+
'clinical_knowledge': '临床知识',
|
24 |
+
'college_actuarial_science': '大学精算学',
|
25 |
+
'college_education': '大学教育学',
|
26 |
+
'college_engineering_hydrology': '大学工程水文学',
|
27 |
+
'college_law': '大学法律',
|
28 |
+
'college_mathematics': '大学数学',
|
29 |
+
'college_medical_statistics': '大学医学统计',
|
30 |
+
'college_medicine': '大学医学',
|
31 |
+
'computer_science': '计算机科学',
|
32 |
+
'computer_security': '计算机安全',
|
33 |
+
'conceptual_physics': '概念物理学',
|
34 |
+
'construction_project_management': '建设工程管理',
|
35 |
+
'economics': '经济学',
|
36 |
+
'education': '教育学',
|
37 |
+
'electrical_engineering': '电气工程',
|
38 |
+
'elementary_chinese': '小学语文',
|
39 |
+
'elementary_commonsense': '小学常识',
|
40 |
+
'elementary_information_and_technology': '小学信息技术',
|
41 |
+
'elementary_mathematics': '初等数学',
|
42 |
+
'ethnology': '民族学',
|
43 |
+
'food_science': '食品科学',
|
44 |
+
'genetics': '遗传学',
|
45 |
+
'global_facts': '全球事实',
|
46 |
+
'high_school_biology': '高中生物',
|
47 |
+
'high_school_chemistry': '高中化学',
|
48 |
+
'high_school_geography': '高中地理',
|
49 |
+
'high_school_mathematics': '高中数学',
|
50 |
+
'high_school_physics': '高中物理学',
|
51 |
+
'high_school_politics': '高中政治',
|
52 |
+
'human_sexuality': '人类性行为',
|
53 |
+
'international_law': '国际法学',
|
54 |
+
'journalism': '新闻学',
|
55 |
+
'jurisprudence': '法理学',
|
56 |
+
'legal_and_moral_basis': '法律与道德基础',
|
57 |
+
'logical': '逻辑学',
|
58 |
+
'machine_learning': '机器学习',
|
59 |
+
'management': '管理学',
|
60 |
+
'marketing': '市场营销',
|
61 |
+
'marxist_theory': '马克思主义理论',
|
62 |
+
'modern_chinese': '现代汉语',
|
63 |
+
'nutrition': '营养学',
|
64 |
+
'philosophy': '哲学',
|
65 |
+
'professional_accounting': '专业会计',
|
66 |
+
'professional_law': '专业法学',
|
67 |
+
'professional_medicine': '专业医学',
|
68 |
+
'professional_psychology': '专业心理学',
|
69 |
+
'public_relations': '公共关系',
|
70 |
+
'security_study': '安全研究',
|
71 |
+
'sociology': '社会学',
|
72 |
+
'sports_science': '体育学',
|
73 |
+
'traditional_chinese_medicine': '中医中药',
|
74 |
+
'virology': '病毒学',
|
75 |
+
'world_history': '世界历史',
|
76 |
+
'world_religions': '世界宗教'
|
77 |
+
}
|
78 |
+
|
79 |
+
|
80 |
+
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
|
81 |
+
|
82 |
+
cmmlu_datasets = []
|
83 |
+
for _name in cmmlu_all_sets:
|
84 |
+
_ch_name = cmmlu_subject_mapping[_name]
|
85 |
+
cmmlu_infer_cfg = dict(
|
86 |
+
ice_template=dict(
|
87 |
+
type=PromptTemplate,
|
88 |
+
template=dict(
|
89 |
+
begin='</E>',
|
90 |
+
round=[
|
91 |
+
dict(
|
92 |
+
role='HUMAN',
|
93 |
+
prompt=
|
94 |
+
f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
|
95 |
+
),
|
96 |
+
dict(role='BOT', prompt='答案是: {answer}'),
|
97 |
+
]),
|
98 |
+
ice_token='</E>',
|
99 |
+
),
|
100 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
101 |
+
inferencer=dict(type=GenInferencer),
|
102 |
+
)
|
103 |
+
|
104 |
+
cmmlu_eval_cfg = dict(
|
105 |
+
evaluator=dict(type=AccwithDetailsEvaluator),
|
106 |
+
pred_postprocessor=dict(type=first_capital_postprocess))
|
107 |
+
|
108 |
+
cmmlu_datasets.append(
|
109 |
+
dict(
|
110 |
+
type=CMMLUDataset,
|
111 |
+
path='opencompass/cmmlu',
|
112 |
+
name=_name,
|
113 |
+
abbr=f'cmmlu-{_name}',
|
114 |
+
reader_cfg=dict(
|
115 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
116 |
+
output_column='answer',
|
117 |
+
train_split='dev',
|
118 |
+
test_split='test'),
|
119 |
+
infer_cfg=cmmlu_infer_cfg,
|
120 |
+
eval_cfg=cmmlu_eval_cfg,
|
121 |
+
))
|
122 |
+
|
123 |
+
del _name, _ch_name
|
opencompass/configs/datasets/cmmlu/cmmlu_ppl.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .cmmlu_ppl_8b9c76 import cmmlu_datasets # noqa: F401, F403
|
opencompass/configs/datasets/cmmlu/cmmlu_ppl_041cbf.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
|
5 |
+
from opencompass.datasets import CMMLUDataset
|
6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
7 |
+
|
8 |
+
cmmlu_subject_mapping = {
|
9 |
+
'agronomy': '农学',
|
10 |
+
'anatomy': '解剖学',
|
11 |
+
'ancient_chinese': '古汉语',
|
12 |
+
'arts': '艺术学',
|
13 |
+
'astronomy': '天文学',
|
14 |
+
'business_ethics': '商业伦理',
|
15 |
+
'chinese_civil_service_exam': '中国公务员考试',
|
16 |
+
'chinese_driving_rule': '中国驾驶规则',
|
17 |
+
'chinese_food_culture': '中国饮食文化',
|
18 |
+
'chinese_foreign_policy': '中国外交政策',
|
19 |
+
'chinese_history': '中国历史',
|
20 |
+
'chinese_literature': '中国文学',
|
21 |
+
'chinese_teacher_qualification': '中国教师资格',
|
22 |
+
'clinical_knowledge': '临床知识',
|
23 |
+
'college_actuarial_science': '大学精算学',
|
24 |
+
'college_education': '大学教育学',
|
25 |
+
'college_engineering_hydrology': '大学工程水文学',
|
26 |
+
'college_law': '大学法律',
|
27 |
+
'college_mathematics': '大学数学',
|
28 |
+
'college_medical_statistics': '大学医学统计',
|
29 |
+
'college_medicine': '大学医学',
|
30 |
+
'computer_science': '计算机科学',
|
31 |
+
'computer_security': '计算机安全',
|
32 |
+
'conceptual_physics': '概念物理学',
|
33 |
+
'construction_project_management': '建设工程管理',
|
34 |
+
'economics': '经济学',
|
35 |
+
'education': '教育学',
|
36 |
+
'electrical_engineering': '电气工程',
|
37 |
+
'elementary_chinese': '小学语文',
|
38 |
+
'elementary_commonsense': '小学常识',
|
39 |
+
'elementary_information_and_technology': '小学信息技术',
|
40 |
+
'elementary_mathematics': '初等数学',
|
41 |
+
'ethnology': '民族学',
|
42 |
+
'food_science': '食品科学',
|
43 |
+
'genetics': '遗传学',
|
44 |
+
'global_facts': '全球事实',
|
45 |
+
'high_school_biology': '高中生物',
|
46 |
+
'high_school_chemistry': '高中化学',
|
47 |
+
'high_school_geography': '高中地理',
|
48 |
+
'high_school_mathematics': '高中数学',
|
49 |
+
'high_school_physics': '高中物理学',
|
50 |
+
'high_school_politics': '高中政治',
|
51 |
+
'human_sexuality': '人类性行为',
|
52 |
+
'international_law': '国际法学',
|
53 |
+
'journalism': '新闻学',
|
54 |
+
'jurisprudence': '法理学',
|
55 |
+
'legal_and_moral_basis': '法律与道德基础',
|
56 |
+
'logical': '逻辑学',
|
57 |
+
'machine_learning': '机器学习',
|
58 |
+
'management': '管理学',
|
59 |
+
'marketing': '市场营销',
|
60 |
+
'marxist_theory': '马克思主义理论',
|
61 |
+
'modern_chinese': '现代汉语',
|
62 |
+
'nutrition': '营养学',
|
63 |
+
'philosophy': '哲学',
|
64 |
+
'professional_accounting': '专业会计',
|
65 |
+
'professional_law': '专业法学',
|
66 |
+
'professional_medicine': '专业医学',
|
67 |
+
'professional_psychology': '专业心理学',
|
68 |
+
'public_relations': '公共关系',
|
69 |
+
'security_study': '安全研究',
|
70 |
+
'sociology': '社会学',
|
71 |
+
'sports_science': '体育学',
|
72 |
+
'traditional_chinese_medicine': '中医中药',
|
73 |
+
'virology': '病毒学',
|
74 |
+
'world_history': '世界历史',
|
75 |
+
'world_religions': '世界宗教'
|
76 |
+
}
|
77 |
+
|
78 |
+
|
79 |
+
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
|
80 |
+
|
81 |
+
cmmlu_datasets = []
|
82 |
+
for _name in cmmlu_all_sets:
|
83 |
+
_ch_name = cmmlu_subject_mapping[_name]
|
84 |
+
hint = f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。'
|
85 |
+
question_and_options = '题目:{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
|
86 |
+
cmmlu_infer_cfg = dict(
|
87 |
+
ice_template=dict(
|
88 |
+
type=PromptTemplate,
|
89 |
+
template={answer: f'{question_and_options}\n答案是: {answer}\n' for answer in ['A', 'B', 'C', 'D']},
|
90 |
+
),
|
91 |
+
prompt_template=dict(
|
92 |
+
type=PromptTemplate,
|
93 |
+
template={answer: f'{hint}\n</E>{question_and_options}\n答案是: {answer}' for answer in ['A', 'B', 'C', 'D']},
|
94 |
+
ice_token='</E>',
|
95 |
+
),
|
96 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
97 |
+
inferencer=dict(type=PPLInferencer),
|
98 |
+
)
|
99 |
+
|
100 |
+
cmmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
|
101 |
+
|
102 |
+
cmmlu_datasets.append(
|
103 |
+
dict(
|
104 |
+
type=CMMLUDataset,
|
105 |
+
path='opencompass/cmmlu',
|
106 |
+
name=_name,
|
107 |
+
abbr=f'cmmlu-{_name}',
|
108 |
+
reader_cfg=dict(
|
109 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
110 |
+
output_column='answer',
|
111 |
+
train_split='dev',
|
112 |
+
test_split='test'),
|
113 |
+
infer_cfg=cmmlu_infer_cfg,
|
114 |
+
eval_cfg=cmmlu_eval_cfg,
|
115 |
+
))
|
116 |
+
|
117 |
+
del _name, _ch_name
|
opencompass/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
5 |
+
from opencompass.datasets import CMMLUDataset
|
6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
7 |
+
|
8 |
+
cmmlu_subject_mapping = {
|
9 |
+
'agronomy': '农学',
|
10 |
+
'anatomy': '解剖学',
|
11 |
+
'ancient_chinese': '古汉语',
|
12 |
+
'arts': '艺术学',
|
13 |
+
'astronomy': '天文学',
|
14 |
+
'business_ethics': '商业伦理',
|
15 |
+
'chinese_civil_service_exam': '中国公务员考试',
|
16 |
+
'chinese_driving_rule': '中国驾驶规则',
|
17 |
+
'chinese_food_culture': '中国饮食文化',
|
18 |
+
'chinese_foreign_policy': '中国外交政策',
|
19 |
+
'chinese_history': '中国历史',
|
20 |
+
'chinese_literature': '中国文学',
|
21 |
+
'chinese_teacher_qualification': '中国教师资格',
|
22 |
+
'clinical_knowledge': '临床知识',
|
23 |
+
'college_actuarial_science': '大学精算学',
|
24 |
+
'college_education': '大学教育学',
|
25 |
+
'college_engineering_hydrology': '大学工程水文学',
|
26 |
+
'college_law': '大学法律',
|
27 |
+
'college_mathematics': '大学数学',
|
28 |
+
'college_medical_statistics': '大学医学统计',
|
29 |
+
'college_medicine': '大学医学',
|
30 |
+
'computer_science': '计算机科学',
|
31 |
+
'computer_security': '计算机安全',
|
32 |
+
'conceptual_physics': '概念物理学',
|
33 |
+
'construction_project_management': '建设工程管理',
|
34 |
+
'economics': '经济学',
|
35 |
+
'education': '教育学',
|
36 |
+
'electrical_engineering': '电气工程',
|
37 |
+
'elementary_chinese': '小学语文',
|
38 |
+
'elementary_commonsense': '小学常识',
|
39 |
+
'elementary_information_and_technology': '小学信息技术',
|
40 |
+
'elementary_mathematics': '初等数学',
|
41 |
+
'ethnology': '民族学',
|
42 |
+
'food_science': '食品科学',
|
43 |
+
'genetics': '遗传学',
|
44 |
+
'global_facts': '全球事实',
|
45 |
+
'high_school_biology': '高中生物',
|
46 |
+
'high_school_chemistry': '高中化学',
|
47 |
+
'high_school_geography': '高中地理',
|
48 |
+
'high_school_mathematics': '高中数学',
|
49 |
+
'high_school_physics': '高中物理学',
|
50 |
+
'high_school_politics': '高中政治',
|
51 |
+
'human_sexuality': '人类性行为',
|
52 |
+
'international_law': '国际法学',
|
53 |
+
'journalism': '新闻学',
|
54 |
+
'jurisprudence': '法理学',
|
55 |
+
'legal_and_moral_basis': '法律与道德基础',
|
56 |
+
'logical': '逻辑学',
|
57 |
+
'machine_learning': '机器学习',
|
58 |
+
'management': '管理学',
|
59 |
+
'marketing': '市场营销',
|
60 |
+
'marxist_theory': '马克思主义理论',
|
61 |
+
'modern_chinese': '现代汉语',
|
62 |
+
'nutrition': '营养学',
|
63 |
+
'philosophy': '哲学',
|
64 |
+
'professional_accounting': '专业会计',
|
65 |
+
'professional_law': '专业法学',
|
66 |
+
'professional_medicine': '专业医学',
|
67 |
+
'professional_psychology': '专业心理学',
|
68 |
+
'public_relations': '公共关系',
|
69 |
+
'security_study': '安全研究',
|
70 |
+
'sociology': '社会学',
|
71 |
+
'sports_science': '体育学',
|
72 |
+
'traditional_chinese_medicine': '中医中药',
|
73 |
+
'virology': '病毒学',
|
74 |
+
'world_history': '世界历史',
|
75 |
+
'world_religions': '世界宗教'
|
76 |
+
}
|
77 |
+
|
78 |
+
|
79 |
+
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
|
80 |
+
|
81 |
+
cmmlu_datasets = []
|
82 |
+
for _name in cmmlu_all_sets:
|
83 |
+
_ch_name = cmmlu_subject_mapping[_name]
|
84 |
+
cmmlu_infer_cfg = dict(
|
85 |
+
ice_template=dict(
|
86 |
+
type=PromptTemplate,
|
87 |
+
template={
|
88 |
+
answer: dict(
|
89 |
+
begin='</E>',
|
90 |
+
round=[
|
91 |
+
dict(
|
92 |
+
role='HUMAN',
|
93 |
+
prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
|
94 |
+
),
|
95 |
+
dict(role='BOT', prompt=f'答案是: {answer}'),
|
96 |
+
])
|
97 |
+
for answer in ['A', 'B', 'C', 'D']
|
98 |
+
},
|
99 |
+
ice_token='</E>',
|
100 |
+
),
|
101 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
102 |
+
inferencer=dict(type=PPLInferencer),
|
103 |
+
)
|
104 |
+
|
105 |
+
cmmlu_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
106 |
+
|
107 |
+
cmmlu_datasets.append(
|
108 |
+
dict(
|
109 |
+
type=CMMLUDataset,
|
110 |
+
path='opencompass/cmmlu',
|
111 |
+
name=_name,
|
112 |
+
abbr=f'cmmlu-{_name}',
|
113 |
+
reader_cfg=dict(
|
114 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
115 |
+
output_column='answer',
|
116 |
+
train_split='dev',
|
117 |
+
test_split='test'),
|
118 |
+
infer_cfg=cmmlu_infer_cfg,
|
119 |
+
eval_cfg=cmmlu_eval_cfg,
|
120 |
+
))
|
121 |
+
|
122 |
+
del _name, _ch_name
|
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .commonsenseqacn_gen_d380d0 import commonsenseqacn_datasets # noqa: F401, F403
|
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen_d380d0.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
5 |
+
from opencompass.datasets import CommonsenseQADataset_CN
|
6 |
+
from opencompass.utils.text_postprocessors import first_capital_postprocess
|
7 |
+
|
8 |
+
commonsenseqacn_reader_cfg = dict(
|
9 |
+
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
|
10 |
+
output_column='answerKey',
|
11 |
+
test_split='validation',
|
12 |
+
)
|
13 |
+
|
14 |
+
_ice_template = dict(
|
15 |
+
type=PromptTemplate,
|
16 |
+
template=dict(
|
17 |
+
begin='</E>',
|
18 |
+
round=[
|
19 |
+
dict(
|
20 |
+
role='HUMAN',
|
21 |
+
prompt='{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\n答案:',
|
22 |
+
),
|
23 |
+
dict(role='BOT', prompt='{answerKey}'),
|
24 |
+
],
|
25 |
+
),
|
26 |
+
ice_token='</E>',
|
27 |
+
)
|
28 |
+
|
29 |
+
|
30 |
+
commonsenseqacn_infer_cfg = dict(
|
31 |
+
prompt_template=_ice_template,
|
32 |
+
retriever=dict(type=ZeroRetriever),
|
33 |
+
inferencer=dict(type=GenInferencer),
|
34 |
+
)
|
35 |
+
|
36 |
+
commonsenseqacn_eval_cfg = dict(
|
37 |
+
evaluator=dict(type=AccEvaluator),
|
38 |
+
pred_postprocessor=dict(type=first_capital_postprocess),
|
39 |
+
)
|
40 |
+
|
41 |
+
commonsenseqacn_datasets = [
|
42 |
+
dict(
|
43 |
+
abbr='commonsenseqa_cn',
|
44 |
+
type=CommonsenseQADataset_CN,
|
45 |
+
path='./data/commonsenseqa_cn/validation.jsonl',
|
46 |
+
reader_cfg=commonsenseqacn_reader_cfg,
|
47 |
+
infer_cfg=commonsenseqacn_infer_cfg,
|
48 |
+
eval_cfg=commonsenseqacn_eval_cfg,
|
49 |
+
)
|
50 |
+
]
|
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .commonsenseqacn_ppl_971f48 import commonsenseqacn_datasets # noqa: F401, F403
|
opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_ppl_971f48.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccEvaluator
|
5 |
+
from opencompass.datasets import CommonsenseQADataset_CN
|
6 |
+
|
7 |
+
commonsenseqacn_reader_cfg = dict(
|
8 |
+
input_columns=['question', 'A', 'B', 'C', 'D', 'E'],
|
9 |
+
output_column='answerKey',
|
10 |
+
test_split='validation',
|
11 |
+
)
|
12 |
+
|
13 |
+
_ice_template = dict(
|
14 |
+
type=PromptTemplate,
|
15 |
+
template={
|
16 |
+
ans: dict(
|
17 |
+
begin='</E>',
|
18 |
+
round=[
|
19 |
+
dict(role='HUMAN', prompt='问题: {question}\n答案: '),
|
20 |
+
dict(role='BOT', prompt=ans_token),
|
21 |
+
],
|
22 |
+
)
|
23 |
+
for ans, ans_token in [
|
24 |
+
['A', '{A}'],
|
25 |
+
['B', '{B}'],
|
26 |
+
['C', '{C}'],
|
27 |
+
['D', '{D}'],
|
28 |
+
['E', '{E}'],
|
29 |
+
]
|
30 |
+
},
|
31 |
+
ice_token='</E>',
|
32 |
+
)
|
33 |
+
|
34 |
+
|
35 |
+
commonsenseqacn_infer_cfg = dict(
|
36 |
+
prompt_template=_ice_template,
|
37 |
+
retriever=dict(type=ZeroRetriever),
|
38 |
+
inferencer=dict(type=PPLInferencer),
|
39 |
+
)
|
40 |
+
|
41 |
+
commonsenseqacn_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
|
42 |
+
|
43 |
+
commonsenseqacn_datasets = [
|
44 |
+
dict(
|
45 |
+
abbr='commonsenseqa_cn',
|
46 |
+
type=CommonsenseQADataset_CN,
|
47 |
+
path='./data/commonsenseqa_cn/validation.jsonl',
|
48 |
+
reader_cfg=commonsenseqacn_reader_cfg,
|
49 |
+
infer_cfg=commonsenseqacn_infer_cfg,
|
50 |
+
eval_cfg=commonsenseqacn_eval_cfg,
|
51 |
+
)
|
52 |
+
]
|
opencompass/configs/datasets/demo/demo_cmmlu_base_ppl.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets
|
5 |
+
|
6 |
+
for d in cmmlu_datasets:
|
7 |
+
d['abbr'] = 'demo_' + d['abbr']
|
8 |
+
d['reader_cfg']['test_range'] = '[0:4]'
|
opencompass/configs/datasets/demo/demo_cmmlu_chat_gen.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
|
5 |
+
|
6 |
+
for d in cmmlu_datasets:
|
7 |
+
d['abbr'] = 'demo_' + d['abbr']
|
8 |
+
d['reader_cfg']['test_range'] = '[0:4]'
|
opencompass/configs/datasets/demo/demo_gsm8k_base_gen.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets
|
5 |
+
|
6 |
+
gsm8k_datasets[0]['abbr'] = 'demo_' + gsm8k_datasets[0]['abbr']
|
7 |
+
gsm8k_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
|
opencompass/configs/datasets/demo/demo_gsm8k_chat_gen.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
5 |
+
|
6 |
+
gsm8k_datasets[0]['abbr'] = 'demo_' + gsm8k_datasets[0]['abbr']
|
7 |
+
gsm8k_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
|
opencompass/configs/datasets/demo/demo_math_base_gen.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..math.math_4shot_base_gen_db136b import math_datasets
|
5 |
+
|
6 |
+
math_datasets[0]['abbr'] = 'demo_' + math_datasets[0]['abbr']
|
7 |
+
math_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
|
opencompass/configs/datasets/demo/demo_math_chat_gen.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from ..math.math_0shot_gen_393424 import math_datasets
|
5 |
+
|
6 |
+
math_datasets[0]['abbr'] = 'demo_' + math_datasets[0]['abbr']
|
7 |
+
math_datasets[0]['reader_cfg']['test_range'] = '[0:64]'
|
opencompass/configs/datasets/gpqa/README.md
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# GPQA
|
2 |
+
|
3 |
+
```bash
|
4 |
+
python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug
|
5 |
+
python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug
|
6 |
+
```
|
7 |
+
|
8 |
+
## Base Models
|
9 |
+
|
10 |
+
| model | GPQA_diamond |
|
11 |
+
|:------------------------:|---------------:|
|
12 |
+
| llama-7b-turbomind | 24.24 |
|
13 |
+
| llama-13b-turbomind | 25.25 |
|
14 |
+
| llama-30b-turbomind | 22.73 |
|
15 |
+
| llama-65b-turbomind | 21.72 |
|
16 |
+
| llama-2-7b-turbomind | 25.25 |
|
17 |
+
| llama-2-13b-turbomind | 23.74 |
|
18 |
+
| llama-2-70b-turbomind | 28.28 |
|
19 |
+
| llama-3-8b-turbomind | 31.82 |
|
20 |
+
| llama-3-70b-turbomind | 40.91 |
|
21 |
+
| internlm2-1.8b-turbomind | 24.24 |
|
22 |
+
| internlm2-7b-turbomind | 28.28 |
|
23 |
+
| internlm2-20b-turbomind | 31.31 |
|
24 |
+
| qwen-1.8b-turbomind | 28.79 |
|
25 |
+
| qwen-7b-turbomind | 24.75 |
|
26 |
+
| qwen-14b-turbomind | 27.78 |
|
27 |
+
| qwen-72b-turbomind | 31.31 |
|
28 |
+
| qwen1.5-0.5b-hf | 23.74 |
|
29 |
+
| qwen1.5-1.8b-hf | 28.79 |
|
30 |
+
| qwen1.5-4b-hf | 23.23 |
|
31 |
+
| qwen1.5-7b-hf | 20.71 |
|
32 |
+
| qwen1.5-14b-hf | 32.32 |
|
33 |
+
| qwen1.5-32b-hf | 30.81 |
|
34 |
+
| qwen1.5-72b-hf | 31.82 |
|
35 |
+
| qwen1.5-moe-a2-7b-hf | 28.79 |
|
36 |
+
| mistral-7b-v0.1-hf | 24.75 |
|
37 |
+
| mistral-7b-v0.2-hf | 23.74 |
|
38 |
+
| mixtral-8x7b-v0.1-hf | 28.79 |
|
39 |
+
| mixtral-8x22b-v0.1-hf | 36.36 |
|
40 |
+
| yi-6b-hf | 28.28 |
|
41 |
+
| yi-34b-hf | 35.86 |
|
42 |
+
| deepseek-7b-base-hf | 20.71 |
|
43 |
+
| deepseek-67b-base-hf | 25.25 |
|
44 |
+
|
45 |
+
## Chat Models
|
46 |
+
|
47 |
+
| model | GPQA_diamond |
|
48 |
+
|:-----------------------------:|---------------:|
|
49 |
+
| qwen1.5-0.5b-chat-hf | 19.70 |
|
50 |
+
| qwen1.5-1.8b-chat-hf | 29.80 |
|
51 |
+
| qwen1.5-4b-chat-hf | 25.25 |
|
52 |
+
| qwen1.5-7b-chat-hf | 31.82 |
|
53 |
+
| qwen1.5-14b-chat-hf | 30.30 |
|
54 |
+
| qwen1.5-32b-chat-hf | 31.31 |
|
55 |
+
| qwen1.5-72b-chat-hf | 32.83 |
|
56 |
+
| qwen1.5-110b-chat-hf | 35.86 |
|
57 |
+
| internlm2-chat-1.8b-hf | 25.76 |
|
58 |
+
| internlm2-chat-1.8b-sft-hf | 26.26 |
|
59 |
+
| internlm2-chat-7b-hf | 28.28 |
|
60 |
+
| internlm2-chat-7b-sft-hf | 27.27 |
|
61 |
+
| internlm2-chat-20b-hf | 30.30 |
|
62 |
+
| internlm2-chat-20b-sft-hf | 29.29 |
|
63 |
+
| llama-3-8b-instruct-hf | 25.76 |
|
64 |
+
| llama-3-70b-instruct-hf | 37.88 |
|
65 |
+
| llama-3-8b-instruct-lmdeploy | 25.76 |
|
66 |
+
| llama-3-70b-instruct-lmdeploy | 37.88 |
|
67 |
+
| mistral-7b-instruct-v0.1-hf | 30.30 |
|
68 |
+
| mistral-7b-instruct-v0.2-hf | 25.25 |
|
69 |
+
| mixtral-8x7b-instruct-v0.1-hf | 30.30 |
|
opencompass/configs/datasets/gpqa/gpqa_few_shot_ppl_4b5a83.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import FixKRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import PPLInferencer
|
4 |
+
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
|
5 |
+
from opencompass.datasets import GPQADataset, GPQAEvaluator
|
6 |
+
from opencompass.utils import first_option_postprocess
|
7 |
+
|
8 |
+
gpqa_reader_cfg = dict(
|
9 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
10 |
+
output_column='answer')
|
11 |
+
|
12 |
+
hint = f'For the multiple choice question below, please provide the correct answer option directly.'
|
13 |
+
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
|
14 |
+
gpqa_infer_cfg = dict(
|
15 |
+
ice_template=dict(
|
16 |
+
type=PromptTemplate,
|
17 |
+
template={
|
18 |
+
opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
|
19 |
+
),
|
20 |
+
prompt_template=dict(
|
21 |
+
type=PromptTemplate,
|
22 |
+
template={
|
23 |
+
opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
|
24 |
+
},
|
25 |
+
ice_token='</E>'
|
26 |
+
),
|
27 |
+
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
|
28 |
+
inferencer=dict(type=PPLInferencer))
|
29 |
+
|
30 |
+
gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
|
31 |
+
|
32 |
+
gpqa_datasets = []
|
33 |
+
gpqa_subsets = {
|
34 |
+
# 'extended': 'gpqa_extended.csv',
|
35 |
+
# 'main': 'gpqa_main.csv',
|
36 |
+
'diamond': 'gpqa_diamond.csv'
|
37 |
+
}
|
38 |
+
|
39 |
+
for split in list(gpqa_subsets.keys()):
|
40 |
+
gpqa_datasets.append(
|
41 |
+
dict(
|
42 |
+
abbr='GPQA_' + split,
|
43 |
+
type=GPQADataset,
|
44 |
+
path='./data/gpqa/',
|
45 |
+
name=gpqa_subsets[split],
|
46 |
+
reader_cfg=gpqa_reader_cfg,
|
47 |
+
infer_cfg=gpqa_infer_cfg,
|
48 |
+
eval_cfg=gpqa_eval_cfg)
|
49 |
+
)
|
opencompass/configs/datasets/gpqa/gpqa_gen.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from mmengine.config import read_base
|
2 |
+
|
3 |
+
with read_base():
|
4 |
+
from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets
|
opencompass/configs/datasets/gpqa/gpqa_gen_015262.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.datasets import GPQADataset, GPQAEvaluator
|
5 |
+
from opencompass.utils import first_option_postprocess
|
6 |
+
|
7 |
+
gpqa_reader_cfg = dict(
|
8 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
9 |
+
output_column='answer')
|
10 |
+
|
11 |
+
gpqa_infer_cfg = dict(
|
12 |
+
prompt_template=dict(
|
13 |
+
type=PromptTemplate,
|
14 |
+
template=dict(
|
15 |
+
round=[
|
16 |
+
dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
|
17 |
+
'(A){A}\n'
|
18 |
+
'(B){B}\n'
|
19 |
+
'(C){C}\n'
|
20 |
+
'(D){D}\n'
|
21 |
+
'Format your response as follows: "The correct answer is (insert answer here)"'),
|
22 |
+
], )),
|
23 |
+
retriever=dict(type=ZeroRetriever),
|
24 |
+
inferencer=dict(type=GenInferencer))
|
25 |
+
|
26 |
+
gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
|
27 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
28 |
+
|
29 |
+
gpqa_datasets = []
|
30 |
+
gpqa_subsets = {
|
31 |
+
'extended': 'gpqa_extended.csv',
|
32 |
+
'main': 'gpqa_main.csv',
|
33 |
+
'diamond': 'gpqa_diamond.csv'
|
34 |
+
}
|
35 |
+
|
36 |
+
for split in list(gpqa_subsets.keys()):
|
37 |
+
gpqa_datasets.append(
|
38 |
+
dict(
|
39 |
+
abbr='GPQA_' + split,
|
40 |
+
type=GPQADataset,
|
41 |
+
path='./data/gpqa/',
|
42 |
+
name=gpqa_subsets[split],
|
43 |
+
reader_cfg=gpqa_reader_cfg,
|
44 |
+
infer_cfg=gpqa_infer_cfg,
|
45 |
+
eval_cfg=gpqa_eval_cfg)
|
46 |
+
)
|
opencompass/configs/datasets/gpqa/gpqa_gen_4baadb.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
2 |
+
from opencompass.openicl.icl_retriever import ZeroRetriever
|
3 |
+
from opencompass.openicl.icl_inferencer import GenInferencer
|
4 |
+
from opencompass.datasets import GPQADataset, GPQAEvaluator
|
5 |
+
from opencompass.utils import first_option_postprocess
|
6 |
+
|
7 |
+
gpqa_reader_cfg = dict(
|
8 |
+
input_columns=['question', 'A', 'B', 'C', 'D'],
|
9 |
+
output_column='answer')
|
10 |
+
|
11 |
+
gpqa_infer_cfg = dict(
|
12 |
+
prompt_template=dict(
|
13 |
+
type=PromptTemplate,
|
14 |
+
template=dict(
|
15 |
+
round=[
|
16 |
+
dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
|
17 |
+
'(A){A}\n'
|
18 |
+
'(B){B}\n'
|
19 |
+
'(C){C}\n'
|
20 |
+
'(D){D}\n'
|
21 |
+
'Format your response as follows: "The correct answer is (insert answer here)"'),
|
22 |
+
], )),
|
23 |
+
retriever=dict(type=ZeroRetriever),
|
24 |
+
inferencer=dict(type=GenInferencer))
|
25 |
+
|
26 |
+
gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
|
27 |
+
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
28 |
+
|
29 |
+
gpqa_datasets = []
|
30 |
+
gpqa_subsets = {
|
31 |
+
# 'extended': 'gpqa_extended.csv',
|
32 |
+
# 'main': 'gpqa_main.csv',
|
33 |
+
'diamond': 'gpqa_diamond.csv'
|
34 |
+
}
|
35 |
+
|
36 |
+
for split in list(gpqa_subsets.keys()):
|
37 |
+
gpqa_datasets.append(
|
38 |
+
dict(
|
39 |
+
abbr='GPQA_' + split,
|
40 |
+
type=GPQADataset,
|
41 |
+
path='./data/gpqa/',
|
42 |
+
name=gpqa_subsets[split],
|
43 |
+
reader_cfg=gpqa_reader_cfg,
|
44 |
+
infer_cfg=gpqa_infer_cfg,
|
45 |
+
eval_cfg=gpqa_eval_cfg)
|
46 |
+
)
|