Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py +4 -0
- opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py +35 -0
- opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_3749cd.py +33 -0
- opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_8484b9.py +27 -0
- opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_941108.py +34 -0
- opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py +4 -0
- opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py +49 -0
- opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl.py +4 -0
- opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl_1ce587.py +41 -0
- opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl_f1e631.py +49 -0
- opencompass/configs/datasets/PJExam/PJExam_gen.py +4 -0
- opencompass/configs/datasets/PJExam/PJExam_gen_8cd97c.py +54 -0
- opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py +4 -0
- opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py +44 -0
- opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_ppl.py +4 -0
- opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_ppl_54058d.py +34 -0
- opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_ppl_5c24f1.py +45 -0
- opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_ppl_9f3618.py +49 -0
- opencompass/configs/datasets/adv_glue/__init__.py +11 -0
- opencompass/configs/datasets/collections/base_core.py +20 -0
- opencompass/configs/datasets/collections/base_medium.py +56 -0
- opencompass/configs/datasets/collections/base_medium_llama.py +56 -0
- opencompass/configs/datasets/collections/base_small.py +38 -0
- opencompass/configs/datasets/collections/chat_core.py +20 -0
- opencompass/configs/datasets/collections/chat_medium.py +56 -0
- opencompass/configs/datasets/collections/chat_small.py +39 -0
- opencompass/configs/datasets/collections/example.py +7 -0
- opencompass/configs/datasets/ds1000/ds1000_compl_gen_cbc84f.py +69 -0
- opencompass/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py +68 -0
- opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py +84 -0
- opencompass/configs/datasets/ds1000/ds1000_gen_cbc84f.py +67 -0
- opencompass/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py +67 -0
- opencompass/configs/datasets/gaokao_math/README.md +108 -0
- opencompass/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py +48 -0
- opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py +4 -0
- opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py +37 -0
- opencompass/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py +37 -0
- opencompass/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py +38 -0
- opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py +4 -0
- opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp_1af0ae.py +43 -0
- opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp_fe50d8.py +47 -0
- opencompass/configs/datasets/livecodebench/README.md +42 -0
- opencompass/configs/datasets/livecodebench/livecodebench_gen.py +4 -0
- opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py +164 -0
- opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py +163 -0
- opencompass/configs/datasets/mbpp_plus/deprecated_mbpp_plus_gen_94815c.py +64 -0
- opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py +64 -0
- opencompass/configs/datasets/nq_cn/nqcn_gen.py +4 -0
- opencompass/configs/datasets/nq_cn/nqcn_gen_141737.py +34 -0
- opencompass/configs/datasets/race/race_cot_gen_d95929.py +68 -0
opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .CLUE_CMRC_gen_1bd3c8 import CMRC_datasets  # noqa: F401, F403
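For reference, index files like the one above resolve through mmengine's read_base mechanism, so the full dataset config can be materialized without running OpenCompass itself. A minimal sketch (assuming opencompass is installed so the relative import resolves):

from mmengine.config import Config

# read_base() inlines CLUE_CMRC_gen_1bd3c8.py, so the loaded config
# already contains the CMRC_datasets list defined there.
cfg = Config.fromfile('opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py')
print(cfg['CMRC_datasets'][0]['abbr'])  # -> 'CMRC_dev'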
opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py
ADDED
@@ -0,0 +1,35 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import EMEvaluator
+from opencompass.datasets import CMRCDataset, cmrc_postprocess
+
+CMRC_reader_cfg = dict(
+    input_columns=['question', 'context'], output_column='answers')
+
+CMRC_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='根据文章回答问题。你的答案应该尽可能简练,请以 ‘答案是’ 开头的句式作答。\n文章:{context}\n问:{question}\n答:'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+CMRC_eval_cfg = dict(
+    evaluator=dict(type=EMEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=cmrc_postprocess),
+)
+
+CMRC_datasets = [
+    dict(
+        type=CMRCDataset,
+        abbr='CMRC_dev',
+        path='opencompass/cmrc_dev',
+        reader_cfg=CMRC_reader_cfg,
+        infer_cfg=CMRC_infer_cfg,
+        eval_cfg=CMRC_eval_cfg),
+]
opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_3749cd.py
ADDED
@@ -0,0 +1,33 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import EMEvaluator
+from opencompass.datasets import CMRCDataset
+
+CMRC_reader_cfg = dict(
+    input_columns=['question', 'context'], output_column='answers')
+
+CMRC_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(role='HUMAN', prompt='文章:{context}\n根据上文,回答如下问题:{question}'),
+            dict(role='BOT', prompt='答:'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+CMRC_eval_cfg = dict(
+    evaluator=dict(type=EMEvaluator),
+    pred_role='BOT',
+)
+
+CMRC_datasets = [
+    dict(
+        type=CMRCDataset,
+        abbr='CMRC_dev',
+        path='opencompass/cmrc_dev',
+        reader_cfg=CMRC_reader_cfg,
+        infer_cfg=CMRC_infer_cfg,
+        eval_cfg=CMRC_eval_cfg),
+]
opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_8484b9.py
ADDED
@@ -0,0 +1,27 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import EMEvaluator
+from opencompass.datasets import CMRCDataset
+
+CMRC_reader_cfg = dict(
+    input_columns=['question', 'context'], output_column='answers')
+
+CMRC_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template='文章:{context}\n根据上文,回答如下问题: {question}\n答:'),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+CMRC_eval_cfg = dict(evaluator=dict(type=EMEvaluator), )
+
+CMRC_datasets = [
+    dict(
+        type=CMRCDataset,
+        abbr='CMRC_dev',
+        path='opencompass/cmrc_dev',
+        reader_cfg=CMRC_reader_cfg,
+        infer_cfg=CMRC_infer_cfg,
+        eval_cfg=CMRC_eval_cfg),
+]
opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_941108.py
ADDED
@@ -0,0 +1,34 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import EMEvaluator
+from opencompass.datasets import CMRCDataset
+
+CMRC_reader_cfg = dict(
+    input_columns=['question', 'context'], output_column='answers')
+
+CMRC_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='文章:{context}\n根据上文,回答如下问题:\n{question}\n答:'),
+        ])),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+CMRC_eval_cfg = dict(
+    evaluator=dict(type=EMEvaluator),
+    pred_role='BOT',
+)
+
+CMRC_datasets = [
+    dict(
+        type=CMRCDataset,
+        abbr='CMRC_dev',
+        path='opencompass/cmrc_dev',
+        reader_cfg=CMRC_reader_cfg,
+        infer_cfg=CMRC_infer_cfg,
+        eval_cfg=CMRC_eval_cfg),
+]
opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py
ADDED
@@ -0,0 +1,49 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import EprstmtDatasetV2
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+eprstmt_reader_cfg = dict(
+    input_columns=['sentence'], output_column='label', test_split='train')
+
+eprstmt_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=
+                '内容: "{sentence}"。请对上述内容进行情绪分类。\nA. 积极\nB. 消极\n请从”A“,”B“中进行选择。\n答:'
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+eprstmt_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=first_capital_postprocess),
+)
+
+eprstmt_datasets = [
+    dict(
+        abbr='eprstmt-dev',
+        type=EprstmtDatasetV2,
+        path='./data/FewCLUE/eprstmt/dev_few_all.json',
+        reader_cfg=eprstmt_reader_cfg,
+        infer_cfg=eprstmt_infer_cfg,
+        eval_cfg=eprstmt_eval_cfg,
+    ),
+    dict(
+        abbr='eprstmt-test',
+        type=EprstmtDatasetV2,
+        path='./data/FewCLUE/eprstmt/test_public.json',
+        reader_cfg=eprstmt_reader_cfg,
+        infer_cfg=eprstmt_infer_cfg,
+        eval_cfg=eprstmt_eval_cfg,
+    ),
+]
opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl_1ce587.py
ADDED
@@ -0,0 +1,41 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+eprstmt_reader_cfg = dict(
+    input_columns=['sentence'], output_column='label', test_split='train')
+
+eprstmt_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            'Negative': ' 内容: "{sentence}"。情绪分类:消极。',
+            'Positive': ' 内容: "{sentence}"。情绪分类:积极。',
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+eprstmt_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+eprstmt_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='eprstmt-dev',
+        path='json',
+        data_files='./data/FewCLUE/eprstmt/dev_few_all.json',
+        split='train',
+        reader_cfg=eprstmt_reader_cfg,
+        infer_cfg=eprstmt_infer_cfg,
+        eval_cfg=eprstmt_eval_cfg),
+    dict(
+        type=HFDataset,
+        abbr='eprstmt-test',
+        path='json',
+        data_files='./data/FewCLUE/eprstmt/test_public.json',
+        split='train',
+        reader_cfg=eprstmt_reader_cfg,
+        infer_cfg=eprstmt_infer_cfg,
+        eval_cfg=eprstmt_eval_cfg)
+]
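For intuition, the PPL-style configs above render each example once per candidate template ('Negative', 'Positive'), and PPLInferencer predicts the label whose filled-in template the model finds least surprising. A minimal standalone sketch of that selection step (the perplexity numbers are made up):

# Hypothetical per-label perplexities for one example; in a real run
# PPLInferencer derives these from the model's log-likelihoods.
ppl_by_label = {'Negative': 12.3, 'Positive': 9.8}
prediction = min(ppl_by_label, key=ppl_by_label.get)
assert prediction == 'Positive'  # the lower-perplexity template wins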
opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_ppl_f1e631.py
ADDED
@@ -0,0 +1,49 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+eprstmt_reader_cfg = dict(
+    input_columns=['sentence'], output_column='label', test_split='train')
+
+eprstmt_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            'Negative':
+            dict(round=[
+                dict(role='HUMAN', prompt='内容: "{sentence}"。情绪分类:'),
+                dict(role='BOT', prompt='消极。')
+            ]),
+            'Positive':
+            dict(round=[
+                dict(role='HUMAN', prompt='内容: "{sentence}"。情绪分类:'),
+                dict(role='BOT', prompt='积极。')
+            ]),
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+eprstmt_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+eprstmt_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='eprstmt-dev',
+        path='json',
+        data_files='./data/FewCLUE/eprstmt/dev_few_all.json',
+        split='train',
+        reader_cfg=eprstmt_reader_cfg,
+        infer_cfg=eprstmt_infer_cfg,
+        eval_cfg=eprstmt_eval_cfg),
+    dict(
+        type=HFDataset,
+        abbr='eprstmt-test',
+        path='json',
+        data_files='./data/FewCLUE/eprstmt/test_public.json',
+        split='train',
+        reader_cfg=eprstmt_reader_cfg,
+        infer_cfg=eprstmt_infer_cfg,
+        eval_cfg=eprstmt_eval_cfg)
+]
opencompass/configs/datasets/PJExam/PJExam_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .PJExam_gen_8cd97c import PJExam_datasets  # noqa: F401, F403
opencompass/configs/datasets/PJExam/PJExam_gen_8cd97c.py
ADDED
@@ -0,0 +1,54 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import PJExamDataset, PJExamEvaluator
+
+PJExam_datasets = []
+for _name in [
+        'gk-2022-v1', 'gk-2022-v1-math', 'gk-2023-v1', 'gk-2023-v1-math',
+        'gk-2023-v2', 'gk-2023-v2-math', 'zk-2022-v1'
+]:
+    _hint = '请你做一道</major>选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】A<eoa>\n完整的题目回答的格式如下:\n【解析】...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:\n'
+    _reader_cfg = {
+        'input_columns': ['question'],
+        'output_column': 'std_ans',
+    }
+    _infer_cfg = {
+        'ice_template': {
+            'type': PromptTemplate,
+            'template': {
+                'round': [{
+                    'role': 'HUMAN',
+                    'prompt': _hint + '{question}',
+                }]
+            },
+            'ice_token': '</E>'
+        },
+        'retriever': {
+            'type': ZeroRetriever
+        },
+        'inferencer': {
+            'type': GenInferencer,
+            'max_out_len': 1024,
+        }
+    }
+    _eval_cfg = {
+        'evaluator': {
+            'type': PJExamEvaluator
+        },
+        'pred_role': 'BOT',
+        'ds_column': 'eval_infos'
+    }
+    _dataset = {
+        'type': PJExamDataset,
+        'abbr': 'PJExamDataset-' + _name,
+        'path': './data/PJExam',
+        'name': _name,
+        'reader_cfg': _reader_cfg,
+        'infer_cfg': _infer_cfg,
+        'eval_cfg': _eval_cfg,
+    }
+
+    PJExam_datasets.append(_dataset)
+
+del _name, _hint, _reader_cfg, _infer_cfg, _eval_cfg, _dataset
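The _hint above asks the model to answer as 【解析】...<eoe> followed by 【答案】X<eoa>. The actual scoring lives in PJExamEvaluator; purely for illustration, a hypothetical extraction of the option letter from a response in that format could look like:

import re

def extract_answer(text: str) -> str:
    # Hypothetical helper, not the PJExamEvaluator implementation:
    # pull the first A-D letter between 【答案】 and <eoa>.
    match = re.search(r'【答案】\s*([A-D])\s*<eoa>', text)
    return match.group(1) if match else ''

assert extract_answer('【解析】略<eoe>\n【答案】B<eoa>') == 'B'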
opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .SuperGLUE_COPA_gen_91ca53 import COPA_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py
ADDED
@@ -0,0 +1,44 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import COPADatasetV2
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+COPA_reader_cfg = dict(
+    input_columns=['question', 'premise', 'choice1', 'choice2'],
+    output_column='label',
+)
+
+COPA_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    '{premise}\nQuestion: Which may be the {question}?\nA. {choice1}\nB. {choice2}\nAnswer:'
+                ),
+            ], ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+COPA_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
+)
+
+COPA_datasets = [
+    dict(
+        abbr='COPA',
+        type=COPADatasetV2,
+        path='./data/SuperGLUE/COPA/val.jsonl',
+        reader_cfg=COPA_reader_cfg,
+        infer_cfg=COPA_infer_cfg,
+        eval_cfg=COPA_eval_cfg,
+    )
+]
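This gen-style COPA config frames the task as an A/B multiple-choice question and relies on first_option_postprocess to map a free-form generation onto one option letter. The real helper in opencompass.utils.text_postprocessors handles many answer phrasings; a deliberately simplified, hypothetical version for intuition:

import re

def first_option(text: str, options: str = 'AB') -> str:
    # Simplified sketch: return the first standalone option letter found.
    match = re.search(rf'\b([{options}])\b', text)
    return match.group(1) if match else ''

assert first_option('Answer: B. The woman unlocked the door.') == 'B'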
opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_ppl.py
ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .SuperGLUE_COPA_ppl_9f3618 import COPA_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_ppl_54058d.py
ADDED
@@ -0,0 +1,34 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+COPA_reader_cfg = dict(
+    input_columns=['question', 'premise', 'choice1', 'choice2'],
+    output_column='label',
+    test_split='train')
+
+COPA_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0: 'Premise:{premise}。\nQuestion:{question}。\nAnswer: {choice1}.',
+            1: 'Passage:{premise}。\nQuestion:{question}。\nAnswer: {choice2}.',
+        }),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer))
+
+COPA_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+COPA_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='COPA',
+        path='json',
+        data_files='./data/SuperGLUE/COPA/val.jsonl',
+        split='train',
+        reader_cfg=COPA_reader_cfg,
+        infer_cfg=COPA_infer_cfg,
+        eval_cfg=COPA_eval_cfg)
+]
opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_ppl_5c24f1.py
ADDED
@@ -0,0 +1,45 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+COPA_reader_cfg = dict(
+    input_columns=['question', 'premise', 'choice1', 'choice2'],
+    output_column='label',
+    test_split='train')
+
+COPA_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0:
+            dict(round=[
+                dict(role='HUMAN', prompt='{premise}\nQuestion: {question}\nAnswer:'),
+                dict(role='BOT', prompt='{choice1}'),
+            ]),
+            1:
+            dict(round=[
+                dict(role='HUMAN', prompt='{premise}\nQuestion: {question}\nAnswer:'),
+                dict(role='BOT', prompt='{choice2}'),
+            ]),
+        },
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer),
+)
+
+COPA_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+COPA_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='COPA',
+        path='json',
+        data_files='./data/SuperGLUE/COPA/val.jsonl',
+        split='train',
+        reader_cfg=COPA_reader_cfg,
+        infer_cfg=COPA_infer_cfg,
+        eval_cfg=COPA_eval_cfg,
+    )
+]
opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_ppl_9f3618.py
ADDED
@@ -0,0 +1,49 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import HFDataset
+
+COPA_reader_cfg = dict(
+    input_columns=['question', 'premise', 'choice1', 'choice2'],
+    output_column='label',
+    test_split='train')
+
+COPA_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            0:
+            dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{premise}\nQuestion: What may be the {question}?\nAnswer:'),
+                dict(role='BOT', prompt='{choice1}'),
+            ]),
+            1:
+            dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{premise}\nQuestion: What may be the {question}?\nAnswer:'),
+                dict(role='BOT', prompt='{choice2}'),
+            ]),
+        },
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLInferencer),
+)
+
+COPA_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+COPA_datasets = [
+    dict(
+        type=HFDataset,
+        abbr='COPA',
+        path='json',
+        data_files='./data/SuperGLUE/COPA/val.jsonl',
+        split='train',
+        reader_cfg=COPA_reader_cfg,
+        infer_cfg=COPA_infer_cfg,
+        eval_cfg=COPA_eval_cfg,
+    )
+]
opencompass/configs/datasets/adv_glue/__init__.py
ADDED
@@ -0,0 +1,11 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .adv_glue_sst2.adv_glue_sst2_gen import adv_sst2_datasets
+    from .adv_glue_qqp.adv_glue_qqp_gen import adv_qqp_datasets
+    from .adv_glue_rte.adv_glue_rte_gen import adv_rte_datasets
+    from .adv_glue_qnli.adv_glue_qnli_gen import adv_qnli_datasets
+    from .adv_glue_mnli.adv_glue_mnli_gen import adv_mnli_datasets
+    from .adv_glue_mnli_mm.adv_glue_mnli_mm_gen import adv_mnli_mm_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
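The closing sum(...) line is the aggregation idiom these config packages share: every module-level name ending in _datasets is assumed to hold a list of dataset configs, and sum(..., []) flattens them into one datasets list. A standalone illustration with made-up entries (run at module scope, where locals() is the module namespace):

# Two made-up dataset lists standing in for the imported ones.
foo_datasets = [{'abbr': 'foo'}]
bar_datasets = [{'abbr': 'bar'}]
# Concatenate every *_datasets list in the namespace into one flat list.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
assert datasets == [{'abbr': 'foo'}, {'abbr': 'bar'}]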
opencompass/configs/datasets/collections/base_core.py
ADDED
@@ -0,0 +1,20 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
+    from ..cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets
+    from ..ceval.ceval_ppl_1cd8bf import ceval_datasets
+    from ..GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import GaokaoBench_datasets
+    from ..triviaqa.triviaqa_wiki_1shot_gen_20a989 import triviaqa_datasets
+    from ..nq.nq_open_1shot_gen_20a989 import nq_datasets
+    from ..race.race_ppl_abed12 import race_datasets
+    from ..winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets
+    from ..hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets
+    from ..bbh.bbh_gen_98fba6 import bbh_datasets
+    from ..gsm8k.gsm8k_gen_ee684f import gsm8k_datasets
+    from ..math.math_evaluatorv2_gen_2f4a71 import math_datasets
+    from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
+    from ..humaneval.deprecated_humaneval_gen_d2537e import humaneval_datasets
+    from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
opencompass/configs/datasets/collections/base_medium.py
ADDED
@@ -0,0 +1,56 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
+    from ..ceval.ceval_ppl_578f8d import ceval_datasets
+    from ..agieval.agieval_mixed_713d14 import agieval_datasets
+    from ..GaokaoBench.GaokaoBench_mixed_9af5ee import GaokaoBench_datasets
+    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
+    from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
+    from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
+    from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
+    from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
+    from ..CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
+    from ..CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
+    from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets
+    from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
+    from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_4284a0 import cluewsc_datasets
+    from ..FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
+    from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
+    from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets
+    from ..FewCLUE_tnews.FewCLUE_tnews_ppl_d10e8a import tnews_datasets
+    from ..lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+    from ..lambada.lambada_gen_217e11 import lambada_datasets
+    from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
+    from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
+    from ..SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
+    from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314b96 import BoolQ_datasets
+    from ..SuperGLUE_CB.SuperGLUE_CB_ppl_0143fe import CB_datasets
+    from ..SuperGLUE_COPA.SuperGLUE_COPA_ppl_9f3618 import COPA_datasets
+    from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets
+    from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
+    from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+    from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
+    from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
+    from ..race.race_ppl_a138cd import race_datasets
+    from ..Xsum.Xsum_gen_31397e import Xsum_datasets
+    from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
+    from ..math.math_gen_265cce import math_datasets
+    from ..TheoremQA.TheoremQA_gen_ef26ca import TheoremQA_datasets
+    from ..hellaswag.hellaswag_ppl_47bff9 import hellaswag_datasets
+    from ..ARC_e.ARC_e_ppl_a450bd import ARC_e_datasets
+    from ..ARC_c.ARC_c_ppl_a450bd import ARC_c_datasets
+    from ..commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
+    from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets
+    from ..siqa.siqa_ppl_ced5f6 import siqa_datasets
+    from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
+    from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
+    from ..obqa.obqa_ppl_c7c154 import obqa_datasets
+    from ..nq.nq_gen_c788f6 import nq_datasets
+    from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+    from ..flores.flores_gen_806ede import flores_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
opencompass/configs/datasets/collections/base_medium_llama.py
ADDED
@@ -0,0 +1,56 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..mmlu.mmlu_ppl_ac766d import mmlu_datasets
+    from ..ceval.ceval_ppl_578f8d import ceval_datasets
+    from ..agieval.agieval_mixed_713d14 import agieval_datasets
+    from ..GaokaoBench.GaokaoBench_mixed_9af5ee import GaokaoBench_datasets
+    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..humaneval.deprecated_humaneval_gen_a82cae import humaneval_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
+    from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
+    from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
+    from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
+    from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
+    from ..CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
+    from ..CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
+    from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets
+    from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
+    from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_4284a0 import cluewsc_datasets
+    from ..FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
+    from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
+    from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets
+    from ..FewCLUE_tnews.FewCLUE_tnews_ppl_d10e8a import tnews_datasets
+    from ..lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+    from ..lambada.lambada_gen_217e11 import lambada_datasets
+    from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
+    from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
+    from ..SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
+    from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
+    from ..SuperGLUE_CB.SuperGLUE_CB_ppl_0143fe import CB_datasets
+    from ..SuperGLUE_COPA.SuperGLUE_COPA_ppl_9f3618 import COPA_datasets
+    from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets
+    from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
+    from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+    from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
+    from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
+    from ..race.race_ppl_5831a0 import race_datasets
+    from ..Xsum.Xsum_gen_31397e import Xsum_datasets
+    from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
+    from ..math.math_gen_265cce import math_datasets
+    from ..TheoremQA.TheoremQA_gen_ef26ca import TheoremQA_datasets
+    from ..hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
+    from ..ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets
+    from ..ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
+    from ..commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
+    from ..piqa.piqa_ppl_0cfff2 import piqa_datasets
+    from ..siqa.siqa_ppl_e8d8c5 import siqa_datasets
+    from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
+    from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
+    from ..obqa.obqa_ppl_6aac9e import obqa_datasets
+    from ..nq.nq_gen_0356ec import nq_datasets
+    from ..triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
+    from ..flores.flores_gen_806ede import flores_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
opencompass/configs/datasets/collections/base_small.py
ADDED
@@ -0,0 +1,38 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..ceval.ceval_ppl_578f8d import ceval_datasets
+    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
+    from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
+    from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
+    from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets
+    from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
+    from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets
+    from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
+    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
+    from ..lambada.lambada_gen_217e11 import lambada_datasets
+    from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
+    from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
+    from ..SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
+    from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314b96 import BoolQ_datasets
+    from ..SuperGLUE_CB.SuperGLUE_CB_ppl_0143fe import CB_datasets
+    from ..SuperGLUE_COPA.SuperGLUE_COPA_ppl_9f3618 import COPA_datasets
+    from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets
+    from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
+    from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+    from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
+    from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_d0f531 import WSC_datasets
+    from ..race.race_ppl_a138cd import race_datasets
+    from ..math.math_gen_265cce import math_datasets
+    from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
+    from ..hellaswag.hellaswag_ppl_47bff9 import hellaswag_datasets
+    from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets
+    from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
+    from ..obqa.obqa_ppl_c7c154 import obqa_datasets
+    from ..nq.nq_gen_c788f6 import nq_datasets
+    from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
opencompass/configs/datasets/collections/chat_core.py
ADDED
@@ -0,0 +1,20 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
+    from ..cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
+    from ..ceval.ceval_internal_gen_2daf24 import ceval_datasets
+    from ..GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
+    from ..triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import triviaqa_datasets
+    from ..nq.nq_open_1shot_gen_2e45e5 import nq_datasets
+    from ..race.race_gen_69ee4f import race_datasets
+    from ..winogrande.winogrande_5shot_gen_6447e6 import winogrande_datasets
+    from ..hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
+    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ..math.math_evaluatorv2_gen_cecb31 import math_datasets
+    from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
+    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
+    from ..mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
opencompass/configs/datasets/collections/chat_medium.py
ADDED
@@ -0,0 +1,56 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
+    from ..ceval.ceval_gen_5f30c7 import ceval_datasets
+    from ..agieval.agieval_gen_64afd3 import agieval_datasets
+    from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
+    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
+    from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
+    from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
+    from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
+    from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
+    from ..CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
+    from ..CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
+    from ..FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
+    from ..FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
+    from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
+    from ..FewCLUE_csl.FewCLUE_csl_gen_28b223 import csl_datasets
+    from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
+    from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets
+    from ..FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets
+    from ..lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+    from ..lambada.lambada_gen_217e11 import lambada_datasets
+    from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
+    from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
+    from ..SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
+    from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
+    from ..SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
+    from ..SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
+    from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
+    from ..SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
+    from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+    from ..SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
+    from ..SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
+    from ..race.race_gen_69ee4f import race_datasets
+    from ..Xsum.Xsum_gen_31397e import Xsum_datasets
+    from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ..summedits.summedits_gen_315438 import summedits_datasets
+    from ..math.math_gen_265cce import math_datasets
+    from ..TheoremQA.TheoremQA_gen_7009de import TheoremQA_datasets
+    from ..hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
+    from ..ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
+    from ..ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
+    from ..commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
+    from ..piqa.piqa_gen_1194eb import piqa_datasets
+    from ..siqa.siqa_gen_e78df3 import siqa_datasets
+    from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
+    from ..winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
+    from ..obqa.obqa_gen_9069e4 import obqa_datasets
+    from ..nq.nq_gen_c788f6 import nq_datasets
+    from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+    from ..flores.flores_gen_806ede import flores_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
opencompass/configs/datasets/collections/chat_small.py
ADDED
@@ -0,0 +1,39 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..mmlu.mmlu_gen_4d595a import mmlu_datasets
+    from ..ceval.ceval_gen_5f30c7 import ceval_datasets
+    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
+    from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
+    from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
+    from ..FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
+    from ..FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
+    from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
+    from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
+    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
+    from ..lambada.lambada_gen_217e11 import lambada_datasets
+    from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
+    from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
+    from ..SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
+    from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
+    from ..SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
+    from ..SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
+    from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
+    from ..SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
+    from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+    from ..SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
+    from ..SuperGLUE_WSC.SuperGLUE_WSC_gen_fe4bf3 import WSC_datasets
+    from ..race.race_gen_69ee4f import race_datasets
+    from ..math.math_gen_265cce import math_datasets
+    from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ..summedits.summedits_gen_315438 import summedits_datasets
+    from ..hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
+    from ..piqa.piqa_gen_1194eb import piqa_datasets
+    from ..winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
+    from ..obqa.obqa_gen_9069e4 import obqa_datasets
+    from ..nq.nq_gen_c788f6 import nq_datasets
+    from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
opencompass/configs/datasets/collections/example.py
ADDED
@@ -0,0 +1,7 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..piqa.piqa_gen_1194eb import piqa_datasets
+    from ..nq.nq_gen_c788f6 import nq_datasets
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
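A collection like this is meant to be pulled into a top-level evaluation config the same way it pulls in dataset configs. A minimal sketch of such an entry config (the model import is a placeholder; substitute whichever OpenCompass model config you actually evaluate):

from mmengine.config import read_base

with read_base():
    # Reuse the example collection defined above.
    from .datasets.collections.example import datasets
    # Placeholder model import; any opencompass/configs/models entry works.
    from .models.hf_internlm.hf_internlm_7b import models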
opencompass/configs/datasets/ds1000/ds1000_compl_gen_cbc84f.py
ADDED
@@ -0,0 +1,69 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (DS1000Dataset, ds1000_completion_postprocess,
+                                  ds1000_matplotlib_postprocess,
+                                  DS1000Evaluator)
+
+ds1000_reader_cfg = dict(
+    input_columns=['prompt'],
+    output_column='test_column',
+    train_split='test',
+    test_split='test')
+
+ds1000_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='{prompt}',
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+ds1000_eval_cfg = dict(
+    evaluator=dict(type=DS1000Evaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=ds1000_completion_postprocess),
+)
+
+# The DS-1000 dataset can be downloaded from
+# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
+ds1000_datasets = [
+    dict(
+        abbr=f'ds1000_{lib}',
+        type=DS1000Dataset,
+        path='./data/ds1000_data/',
+        libs=f'{lib}',
+        mode='Completion',
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=ds1000_eval_cfg,
+    ) for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+    ]
+]
+ds1000_datasets.append(
+    dict(
+        abbr='ds1000_Matplotlib',
+        type=DS1000Dataset,
+        path='./data/ds1000_data/',
+        libs='Matplotlib',
+        mode='Completion',
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=dict(
+            evaluator=dict(type=DS1000Evaluator),
+            pred_role='BOT',
+            pred_postprocessor=dict(type=ds1000_matplotlib_postprocess),
+        ),
+    ))
opencompass/configs/datasets/ds1000/ds1000_compl_service_eval_gen_cbc84f.py
ADDED
@@ -0,0 +1,68 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import DS1000Dataset, DS1000ServiceEvaluator
+
+ds1000_reader_cfg = dict(
+    input_columns=['prompt'],
+    output_column='test_column',
+    train_split='test',
+    test_split='test')
+
+ds1000_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='{prompt}',
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+ds1000_eval_cfg_dict = {
+    lib: dict(
+        evaluator=dict(
+            type=DS1000ServiceEvaluator,
+            lib=lib,
+            ip_address=
+            'localhost',  # replace with your code_eval_server ip address / port
+            port=5000
+        ),
+        pred_role='BOT')
+    for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+        'Matplotlib',
+    ]
+}
+
+# The DS-1000 dataset can be downloaded from
+# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
+ds1000_datasets = [
+    dict(
+        abbr=f'ds1000_{lib}',
+        type=DS1000Dataset,
+        path='./data/ds1000_data/',
+        libs=f'{lib}',
+        mode='Completion',
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=ds1000_eval_cfg_dict[lib],
+    ) for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+        'Matplotlib',
+    ]
+]
opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py
ADDED
@@ -0,0 +1,84 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import AgentInferencer
+from opencompass.datasets import DS1000Dataset_Interperter, DS1000InterpreterEvaluator
+
+ds1000_example = """
+In the following task, you should generate code with one assertion to testify the correctness of your code.
+
+Example:
+
+<HUMAN>Problem:
+How do I get the dimensions of an array? For instance, this is (2, 2):
+a = np.array([[1,2],[3,4]])
+<ASSISTANT>{thought} In Python, Numpy provides a method called `shape` which helps to get the dimensions of an array.
+{action} PythonInterpreter
+{action_input}
+```python
+import numpy as np
+def solution(x):
+    # Convert to np.ndarray
+    x = np.array(x)
+    # Getting the dimensions of the array
+    dimensions = x.shape
+    return dimensions
+assert solution([[1,2],[3,4]]) == (2, 2)
+```
+<SYSTEM>{response}True
+<ASSISTANT> {thought} By running this code, you can get the dimensions of an array.
+{finish}
+```python
+import numpy as np
+def solution(x):
+    # Convert to np.ndarray
+    x = np.array(x)
+    # Getting the dimensions of the array
+    dimensions = x.shape
+    return dimensions
+```
+"""
+
+ds1000_reader_cfg = dict(
+    input_columns=['prompt'],
+    output_column='test_column',
+    train_split='test',
+    test_split='test',
+)
+
+ds1000_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template="""{prompt}""",
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=AgentInferencer, example=ds1000_example),
+)
+
+ds1000_eval_cfg = dict(
+    evaluator=dict(type=DS1000InterpreterEvaluator),
+    pred_role='BOT',
+)
+
+# The DS-1000 dataset can be downloaded from
+# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
+
+# Matplotlib cannot fit this setting
+ds1000_datasets = [
+    dict(
+        abbr=f'ds1000_{lib}',
+        type=DS1000Dataset_Interperter,
+        path='./data/ds1000_data/',
+        libs=f'{lib}',
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=ds1000_eval_cfg,
+    )
+    for lib in [
+        'Pandas',
+        'Numpy',
+        # 'Tensorflow',  # error using tensorflow, skipped temporarily
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+    ]
+]
opencompass/configs/datasets/ds1000/ds1000_gen_cbc84f.py
ADDED
@@ -0,0 +1,67 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (DS1000Dataset, ds1000_postprocess,
+                                  ds1000_matplotlib_postprocess,
+                                  DS1000Evaluator)
+
+ds1000_reader_cfg = dict(
+    input_columns=['prompt'],
+    output_column='test_column',
+    train_split='test',
+    test_split='test')
+
+ds1000_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt='{prompt}',
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+ds1000_eval_cfg = dict(
+    evaluator=dict(type=DS1000Evaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=ds1000_postprocess),
+)
+
+# The DS-1000 dataset can be downloaded from
+# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
+ds1000_datasets = [
+    dict(
+        abbr=f'ds1000_{lib}',
+        type=DS1000Dataset,
+        path='./data/ds1000_data/',
+        libs=f'{lib}',
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=ds1000_eval_cfg,
+    ) for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+    ]
+]
+ds1000_datasets.append(
+    dict(
+        abbr='ds1000_Matplotlib',
+        type=DS1000Dataset,
+        path='./data/ds1000_data/',
+        libs='Matplotlib',
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=dict(
+            evaluator=dict(type=DS1000Evaluator),
+            pred_role='BOT',
+            pred_postprocessor=dict(type=ds1000_matplotlib_postprocess),
+        ),
+    ))
opencompass/configs/datasets/ds1000/ds1000_service_eval_gen_cbc84f.py
ADDED
@@ -0,0 +1,67 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DS1000Dataset, DS1000ServiceEvaluator

ds1000_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='test_column',
    train_split='test',
    test_split='test')

ds1000_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{prompt}',
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ds1000_eval_cfg_dict = {
    lib: dict(
        evaluator=dict(
            type=DS1000ServiceEvaluator,
            lib=lib,
            ip_address='localhost',  # replace with your code_eval_server ip address and port
            port=5000
        ),
        pred_role='BOT')
    for lib in [
        'Pandas',
        'Numpy',
        'Tensorflow',
        'Scipy',
        'Sklearn',
        'Pytorch',
        'Matplotlib',
    ]
}

# The DS-1000 dataset can be downloaded from
# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
ds1000_datasets = [
    dict(
        abbr=f'ds1000_{lib}',
        type=DS1000Dataset,
        path='./data/ds1000_data/',
        libs=f'{lib}',
        reader_cfg=ds1000_reader_cfg,
        infer_cfg=ds1000_infer_cfg,
        eval_cfg=ds1000_eval_cfg_dict[lib],
    ) for lib in [
        'Pandas',
        'Numpy',
        'Tensorflow',
        'Scipy',
        'Sklearn',
        'Pytorch',
        'Matplotlib',
    ]
]
opencompass/configs/datasets/gaokao_math/README.md
ADDED
@@ -0,0 +1,108 @@
# GaoKao MATH Answer Evaluation Dataset
A dataset for testing a model's performance on the GAOKAO math answer-extraction task.
The following formats of GAOKAO math questions are supported:
1. '单选题': single-choice question
2. '多选题': multiple-choice question
3. '填空题': fill-in-the-blank question, possibly with multiple blanks
4. '解答题': open-ended question, possibly with multiple sub-answers

Sample data:
```json
[
    {
        "id": "3b270bc4-570a-4d77-b122-a2fc372f7d6a",
        "question": "过椭圆${x^2\\over {16}} +{ y^2 \\over {4}}=1$ %内一点$M(2,1)$ %引一条弦,使该弦被点$M$ %平分,则这条弦所在直线的方程为( ).\nA. $x+2y-4=0$ %\nB. $x-2y-4=0$ %\nC. $x+2y+4=0$ %\nD. $x-2y+4=0$ %\n\n",
        "response": "本题主要考查直线与圆锥曲线.设所求直线与椭圆的一个交点为$A(x,y)$ %,由于中点$M(2,1)$ %,所以另一个交点$B$ %为$(4-x,2-y)$ %.因为$A$ %,$B$ %两点都在椭圆上,所以$x^2+4y^2=16$ %,$(4-x)^2+4(2-y)^2=16$ %,两式相减,整理可得$x+2y-4=0$ %.由于过$A$ %,$B$ %两点的直线只有一条,所以这条弦所在直线的方程为$x+2y-4=0$ %.故本题正确答案为A.\n答案是:A",
        "extract_answer": "A",
        "question_type": "单选题"
    },
    {
        "id": "d60e42d7-30ee-44f9-a94d-aff6a8127750",
        "question": "若函数$f(x)$ 具有下列性质:1.定义域为$(-1,1)$ ;2.对于任意的$x,y\\in(-1,1)$ ,都有$f(x)+f(y)=f\\left({\\dfrac{x+y}{1+xy}}\\right)$ ;3.当$-1< x< 0$ 时,$f(x)>0$ ,则称函数$f(x)$ 为$δ$ 的函数$.$ 若函数$f(x)$ 为$δ$ 的函数,则以下结论正确的是$(\\quad)$\nA. $\nB. x)$ 为奇函数\nC. $\nD. x)$ 为偶函数\nE. $\nF. x)$ 为单调递减函数\nG. $\nH. x)$ 为单调递增函数\n\n",
        "response": "函数$f(x)$ 为$δ$ 的函数,令$x=y=0$ ,则$f(0)+f(0)=f(0)$ ,即$f(0)=0$ ,令$y=-x$ ,则$f(x)+f(-x)=f\\left(\\dfrac{x-x}{1-{x}^{2}}\\right)=f(0)=0$ ,则$f(-x)=-f(x)$ ,即函数$f(x)$ 是奇函数,设$-1< x< y< 1$ ,则$f(x)-f(y)=f(x)+f(-y)=f\\left(\\dfrac{x-y}{1-xy}\\right)$ ,$∵-1< x< y< 1$ ,$∴-1< \\dfrac{x-y}{1-xy}< 0$ ,则$f\\left(\\dfrac{x-y}{1-xy}\\right)>0$ ,即$f(x)-f(y)>0$ ,则$f(x)>f(y)$ ,即$f(x)$ 在$(-1,1)$ 上是减函数.故选$AC.$ 本题考查函数的奇偶性和单调性的判断,注意运用定义法,考查运算能力和推理能力,属于中档题.可令$x=y=0$ ,求得$f(0)=0$ ,再令$y=-x$ 可得$f(-x)=-f(x)$ ,可得$f(x)$ 的奇偶性;再令$-1< x< y< 1$ ,运用单调性的定义,结合其偶性的定义可得其单调性.\n答案是:A; C",
        "extract_answer": "A, C",
        "question_type": "多选题"
    },
    {
        "id": "31b3f702-e60c-4a20-9a40-73bd72b92d1e",
        "question": "请完成以下题目(1)曲线$$y=-5\\text{e}^{x}+3$$在点$$(0,-2)$$处的切线方程为___.(2)若曲线$$f(x)=x \\sin x+1$$在$$x=\\dfrac{ \\pi }{2}$$处的切线与直线$$ax+2y+1=0$$相互垂直,则实数$$a=$$___.\n\n",
        "response": "(1)由$$y=-5\\text{e}^{x}+3$$,得$$y'=-5\\text{e}^{x}$$,所以切线的斜率$$k=y'|_{x=0}=-5$$,所以切线方程为$$y+2=-5(x-0)$$,即$$5x+y+2=0$$.(2)因为$$f'(x)= \\sin x+x \\cos x$$,所以$$f'\\left(\\dfrac{ \\pi }{2}\\right)= \\sin \\dfrac{ \\pi }{2}+\\dfrac{ \\pi }{2}\\cdot \\cos \\dfrac{ \\pi }{2}=1$$.又直线$$ax+2y+1=0$$的斜率为$$-\\dfrac{a}{2}$$,所以根据题意得$$1\\times \\left(-\\dfrac{a}{2}\\right)=-1$$,解得$$a=2$$.\n答案是:(1)$$5x+y+2=0$$ (2)$$2$$",
        "extract_answer": "['(1)$$5x+y+2=0$$ (2)$$2$$']",
        "question_type": "填空题"
    },
    {
        "id": "16878941-1772-4290-bc61-00b193d5cf70",
        "question": "已知函数$f\\left( x \\right)=\\left| 2x-1 \\right|$.(1)若不等式$f\\left( x+\\frac{1}{2} \\right)\\ge 2m+1\\left( m > 0 \\right)$的解集为$\\left( -\\infty ,-2 \\right]\\bigcup \\left[ 2,+\\infty \\right)$,求实数$m$的值;(2)若不等式$f\\left( x \\right)\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}+\\left| 2x+3 \\right|$对任意的实数$x,y\\in R$恒成立,求实数$a$的最小值.\n\n",
        "response": "(1)直接写出不等式,解含有绝对值的函数不等式即可;(2)这是恒成立求参的问题,根据绝对值三角不等式得到左侧函数的最值,再结合均值不等式得最值.(1)由条件得$\\left| 2x \\right|\\le 2m+1$得$-m-\\frac{1}{2}\\le x\\le m+\\frac{1}{2}$,所以$m=\\frac{3}{2}$.(2)原不等式等价于$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le {{2}^{y}}+\\frac{a}{{{2}^{y}}}$,而$\\left| 2x-1 \\right|-\\left| 2x+3 \\right|\\le \\left| \\left( 2x-1 \\right)-\\left( 2x+3 \\right) \\right|=4$,所以${{2}^{y}}+\\frac{a}{{{2}^{y}}}\\ge 4$,则$a\\ge {{\\left[ {{2}^{y}}\\left( 4-{{2}^{y}} \\right) \\right]}_{\\text{max}}}=4$,当且仅当$y=1$时取得.\n答案是:(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$.",
        "extract_answer": [
            "(1) $m=\\frac{3}{2}$;(2) 最小值为$a=4$."
        ],
        "question_type": "解答题"
    }
]
```
## How to use

### 1. Prepare the dataset
```bash
cd opencompass
cp -rf /cpfs01/shared/public/liuhongwei/data/gaokao_math_dataset/gaokao_math ./data
```
📢: To evaluate your own GAOKAO math data, replace `test_v2.jsonl` with your own file, keeping the format shown above.

### 2. Set the evaluation model

Open `opencompass.datasets.gaokao_math.gaokao_math_gen_9b076f` and set the model name and API URL used for evaluation; multiple URLs are supported for acceleration.

```python
...

gaokao_math_eval_cfg = dict(
    evaluator=dict(type=GaoKaoMATHEvaluator, model_name='EVALUATE_MODEL_NAME', url=['http://0.0.0.0:23333/v1', 'http://...']))

...

```
We recommend the `Qwen2.5-72B-Instruct` model for evaluation.
+
|
68 |
+
|
69 |
+
### 3. Set Extractor model and run the evaluation
|
70 |
+
|
71 |
+
```python
|
72 |
+
from mmengine.config import read_base
|
73 |
+
from opencompass.models import HuggingFacewithChatTemplate
|
74 |
+
|
75 |
+
|
76 |
+
with read_base():
|
77 |
+
from opencompass.datasets.gaokao_math.gaokao_math_gen_9b076f import gaokao_math_datasets
|
78 |
+
|
79 |
+
|
80 |
+
trained_qwen2_1_5b_model = [ # trained extractor model
|
81 |
+
dict(
|
82 |
+
type=HuggingFacewithChatTemplate,
|
83 |
+
abbr='gaokao_math_extractor_1_5b_v02',
|
84 |
+
path='/cpfs01/shared/public/liuhongwei/models/gaokao_math_trained/gaokao_math_extractor_1_5b_v02',
|
85 |
+
max_out_len=1024,
|
86 |
+
batch_size=8,
|
87 |
+
run_cfg=dict(num_gpus=1),
|
88 |
+
)
|
89 |
+
]
|
90 |
+
|
91 |
+
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets")], [])
|
92 |
+
models = sum([v for k, v in locals().items() if k.endswith("_model")], [])
|
93 |
+
|
94 |
+
...
|
95 |
+
```
|
96 |
+
|
97 |
+
### 4. Run the evaluation
|
98 |
+
|
99 |
+
```bash
|
100 |
+
python run.py eval.py --dump-eval-details # eval and dump the evaluation details to `results` folder
|
101 |
+
```
|
102 |
+
|
103 |
+
|
104 |
+
### 5. Evaluation results
|
105 |
+
|
106 |
+
| Evaluator / Extractor | Qwen2.5-72B-Instruct | gaokao_math_extractor_1.5b_v0.2 |
|
107 |
+
|-----------------------|-----------------------|----------------------------------|
|
108 |
+
| Qwen2.5-72B-Instruct (ACC) | 95.85 | 95.2 |
|
opencompass/configs/datasets/gaokao_math/gaokao_math_gen_f5fd28.py
ADDED
@@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaoKaoMATHDataset, GaoKaoMATHEvaluator


MATH_CN_PROMPT = """
你是一个数学阅卷专家,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。
—
我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。

对于单选题,答案应该是选项字母,例如 "A";
对于多选题,答案应该是一个选项字母的列表,例如 ["A"] 或 ["A", "B", "C"];
对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。
对于问答题,类似填空题,为每个小问抽出相应答案,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。

如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。
—
问题类型: {question_type}
原始问题: {question}
回答: {response}
提取的关键答案:
"""

gaokao_math_reader_cfg = dict(input_columns=['question', 'response', 'question_type'], output_column='extract_answer')


gaokao_math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt=MATH_CN_PROMPT),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

gaokao_math_eval_cfg = dict(
    evaluator=dict(type=GaoKaoMATHEvaluator, model_name='Qwen/Qwen2.5-72B-Instruct', url=['http://22.8.73.119:23333/v1', 'http://22.8.4.97:23333/v1', 'http://22.8.22.254:23333/v1', 'http://22.8.17.14:23333/v1']))

gaokao_math_datasets = [
    dict(
        type=GaoKaoMATHDataset,
        abbr='GaoKaoMATH',
        path='./data/gaokao_math/test_2k.json',
        reader_cfg=gaokao_math_reader_cfg,
        infer_cfg=gaokao_math_infer_cfg,
        eval_cfg=gaokao_math_eval_cfg)
]
opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_cn_gen_6313aa import humaneval_cn_datasets  # noqa: F401, F403
opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen_6313aa.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='完成以下Python代码任务:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_cn_datasets = [
    dict(
        abbr='openai_humaneval_cn',
        type=HumanevalDataset,
        path='opencompass/humaneval_cn',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval_cn/humaneval_cn_passk_gen_6313aa.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='完成以下Python代码任务:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_cn_datasets = [
    dict(
        abbr='openai_humaneval_cn_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval_cn',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval_cn/humaneval_cn_repeat10_gen_6313aa.py
ADDED
@@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='完成以下Python代码任务:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)
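
# Note on k=[1, 10, 100] above: with num_repeats=10 below, only pass@k for
# k <= 10 can actually be estimated from the samples. The standard unbiased
# estimator from the Codex paper (assumed to be what HumanEvalEvaluator uses) is
#   pass@k = E[1 - C(n - c, k) / C(n, k)]
# where n is the number of samples per problem and c the number that pass.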

humaneval_cn_datasets = [
    dict(
        abbr='openai_humaneval_cn_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval_cn',
        num_repeats=10,
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .jigsawmultilingual_clp_fe50d8 import jigsawmultilingual_datasets  # noqa: F401, F403
opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp_1af0ae.py
ADDED
@@ -0,0 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import CLPInferencer
from opencompass.openicl.icl_evaluator import AUCROCEvaluator
from opencompass.datasets import JigsawMultilingualDataset

jigsawmultilingual_reader_cfg = dict(
    input_columns=['text'],
    output_column='label',
    train_split='test',
    test_split='test')

# The Jigsaw multilingual toxic dataset is not available on Huggingface.
# Please download the test dataset files `test.csv` and `test_labels.csv` from
# the link below and move them to the data/ folder to perform the evaluation.
# https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
jigsawmultilingual_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Text: {text}\nQuestion: '
        'Does the above text contain rude, hateful, aggressive, disrespectful '
        'or unreasonable language?\nAnswer:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=CLPInferencer))

jigsawmultilingual_eval_cfg = dict(evaluator=dict(type=AUCROCEvaluator), )

lang = ['es', 'fr', 'it', 'pt', 'ru', 'tr']
jigsawmultilingual_datasets = []

for _l in lang:
    jigsawmultilingual_datasets.append(
        dict(
            abbr=f'jigsaw_multilingual_{_l}',
            type=JigsawMultilingualDataset,
            path='data/jigsawmultilingual/test.csv',
            label='data/jigsawmultilingual/test_labels.csv',
            lang=_l,
            reader_cfg=jigsawmultilingual_reader_cfg,
            infer_cfg=jigsawmultilingual_infer_cfg,
            eval_cfg=jigsawmultilingual_eval_cfg))

del lang, _l
opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp_fe50d8.py
ADDED
@@ -0,0 +1,47 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import CLPInferencer
from opencompass.openicl.icl_evaluator import AUCROCEvaluator
from opencompass.datasets import JigsawMultilingualDataset

jigsawmultilingual_reader_cfg = dict(
    input_columns=['text'],
    output_column='label',
    train_split='test',
    test_split='test')

# The Jigsaw multilingual toxic dataset is not available on Huggingface.
# Please download the test dataset files `test.csv` and `test_labels.csv` from
# the link below and move them to the data/ folder to perform the evaluation.
# https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
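# A fetch sketch (assumes the Kaggle CLI is installed and authenticated, and
# that the competition rules have been accepted on the website):
#   kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification
#   unzip jigsaw-multilingual-toxic-comment-classification.zip 'test*.csv' -d data/jigsawmultilingual/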
jigsawmultilingual_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Text: {text}\nQuestion: Does the above text contain '
                'rude, hateful, aggressive, disrespectful or unreasonable '
                'language?\nAnswer:')
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=CLPInferencer))

jigsawmultilingual_eval_cfg = dict(evaluator=dict(type=AUCROCEvaluator), )

lang = ['es', 'fr', 'it', 'pt', 'ru', 'tr']
jigsawmultilingual_datasets = []

for _l in lang:
    jigsawmultilingual_datasets.append(
        dict(
            abbr=f'jigsaw_multilingual_{_l}',
            type=JigsawMultilingualDataset,
            path='data/jigsawmultilingual/test.csv',
            label='data/jigsawmultilingual/test_labels.csv',
            lang=_l,
            reader_cfg=jigsawmultilingual_reader_cfg,
            infer_cfg=jigsawmultilingual_infer_cfg,
            eval_cfg=jigsawmultilingual_eval_cfg))

del lang, _l
opencompass/configs/datasets/livecodebench/README.md
ADDED
@@ -0,0 +1,42 @@
# LiveCodeBench

## Dataset

LiveCodeBench provides holistic and contamination-free evaluation of the coding capabilities of LLMs. In particular, LiveCodeBench continuously collects new problems over time from contests on three competition platforms -- LeetCode, AtCoder, and CodeForces. Beyond code generation, it also covers a broader range of code-related capabilities, such as self-repair, code execution, and test output prediction. Currently, LiveCodeBench hosts four hundred high-quality coding problems published between May 2023 and March 2024.

- Origin Project: https://livecodebench.github.io/leaderboard.html

## Setting

| Model Type | Code Generation | Test Output Prediction | Code Execution |
|------------|-----------------|------------------------|----------------|
| Base Model | ❌ | ❌ | ❌ |
| Chat Model | ✅ | ✅ | ✅ |
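
To run all three subtasks together, a minimal config sketch (assuming the standard OpenCompass `read_base` import pattern; `livecodebench_gen` is the entry config added in this change):

```python
from mmengine.config import read_base

with read_base():
    # LCB_datasets bundles code generation, code execution,
    # and test output prediction.
    from opencompass.configs.datasets.livecodebench.livecodebench_gen import LCB_datasets

datasets = [*LCB_datasets]
```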


## Baseline Performance

| Model Type | Code Generation (pass@1) | Test Output Prediction (pass@1) | Code Execution (pass@1) |
|------------|--------------------------|---------------------------------|-------------------------|
| Qwen2.5-7B-Instruct (HF) | 39.25 | 48.64 | 41.96 |
| Meta-Llama-3.1-8B-Instruct (HF) | 20.25 | 24.66 | 17.12 |

## Citation

```bibtex
@article{jain2024livecodebench,
  author  = {Naman Jain and King Han and Alex Gu and Wen-Ding Li and Fanjia Yan and Tianjun Zhang and Sida Wang and Armando Solar-Lezama and Koushik Sen and Ion Stoica},
  title   = {LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code},
  year    = {2024},
  journal = {arXiv preprint},
}
@misc{2023opencompass,
  title        = {OpenCompass: A Universal Evaluation Platform for Foundation Models},
  author       = {OpenCompass Contributors},
  howpublished = {\url{https://github.com/open-compass/opencompass}},
  year         = {2023}
}
```
opencompass/configs/datasets/livecodebench/livecodebench_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .livecodebench_gen_6966bc import LCB_datasets  # noqa: F401, F403
opencompass/configs/datasets/livecodebench/livecodebench_gen_6966bc.py
ADDED
@@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
opencompass/configs/datasets/livecodebench/livecodebench_gen_b2b0fd.py
ADDED
@@ -0,0 +1,163 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
    LCBCodeGenerationDataset,
    LCBCodeExecutionDataset,
    LCBTestOutputPredictionDataset,
    LCBCodeGenerationEvaluator,
    LCBCodeExecutionEvaluator,
    LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
    input_columns=[
        'question_content',
        'format_prompt',
    ],
    # output_column='evaluation_sample',
    output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
    '### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_template
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_generation_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeGenerationEvaluator,
        num_process_evaluate=4,
        timeout=6,
    ),
    pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
    type=LCBCodeGenerationDataset,
    abbr='lcb_code_generation',
    path='opencompass/code_generation_lite',
    reader_cfg=lcb_code_generation_reader_cfg,
    infer_cfg=lcb_code_generation_infer_cfg,
    eval_cfg=lcb_code_generation_eval_cfg
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
                ),
            ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_code_execution_eval_cfg = dict(
    evaluator=dict(
        type=LCBCodeExecutionEvaluator,
    ),
    pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
    type=LCBCodeExecutionDataset,
    abbr='lcb_code_execution',
    path='opencompass/execution-v2',
    reader_cfg=lcb_code_execution_reader_cfg,
    infer_cfg=lcb_code_execution_infer_cfg,
    eval_cfg=lcb_code_execution_eval_cfg,
)

# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
    input_columns=[
        'prompt',
    ],
    output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            # begin=[
            #     dict(
            #         role='SYSTEM',
            #         prompt=system_prompt
            #     ),
            # ],
            round=[
                dict(
                    role='HUMAN',
                    prompt='{prompt}'
                )
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024)
)

lcb_test_output_eval_cfg = dict(
    evaluator=dict(
        type=LCBTestOutputEvaluator,
    ),
    pred_role='BOT',
)

LCBTestOutput_dataset = dict(
    type=LCBTestOutputPredictionDataset,
    abbr='lcb_test_output',
    path='opencompass/test_generation',
    reader_cfg=lcb_test_output_reader_cfg,
    infer_cfg=lcb_test_output_infer_cfg,
    eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
    LCBCodeGeneration_dataset,
    LCBCodeExecution_dataset,
    LCBTestOutput_dataset,
]
opencompass/configs/datasets/mbpp_plus/deprecated_mbpp_plus_gen_94815c.py
ADDED
@@ -0,0 +1,64 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPEvaluator, MBPPPlusDataset

mbpp_plus_reader_cfg = dict(
    input_columns=['text', 'test_list'], output_column='task_id')

mbpp_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='You are an expert Python programmer, and here is your task: Write a function to find the shared elements from the given two lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'
                ),
                dict(
                    role='BOT',
                    prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n return tuple(set(test_tup1) & set(test_tup2))' \n[DONE] \n\n "
                ),
                dict(
                    role='HUMAN',
                    prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'
                ),
                dict(
                    role='BOT',
                    prompt="[BEGIN]\n 'import math\ndef is_not_prime(n):\n if n == 1:\n return True\n for i in range(2, int(math.sqrt(n))+1):\n if n % i == 0:\n return True\n return False' \n[DONE] \n\n "
                ),
                dict(
                    role='HUMAN',
                    prompt='You are an expert Python programmer, and here is your task: Write a function to find the n largest integers from a given list of numbers, returned in descending order. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'
                ),
                dict(
                    role='BOT',
                    prompt="[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums: list,n: int) -> list:\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE] \n\n "
                ),
                dict(
                    role='HUMAN',
                    prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'
                ),
                dict(role='BOT', prompt='[BEGIN]\n'),
            ],
        )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

mbpp_plus_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator, metric='MBPPPlus'), pred_role='BOT')

mbpp_plus_datasets = [
    dict(
        type=MBPPPlusDataset,
        abbr='mbpp_plus',
        path='./data/mbpp_plus/mbpp_plus.jsonl',
        reader_cfg=mbpp_plus_reader_cfg,
        infer_cfg=mbpp_plus_infer_cfg,
        eval_cfg=mbpp_plus_eval_cfg)
]
opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py
ADDED
@@ -0,0 +1,64 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPEvaluator, MBPPPlusDataset

mbpp_plus_reader_cfg = dict(
    input_columns=['text', 'test_list'], output_column='task_id')

mbpp_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='You are an expert Python programmer, and here is your task: Write a function to find the shared elements from the given two lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'
                ),
                dict(
                    role='BOT',
                    prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n return tuple(set(test_tup1) & set(test_tup2))' \n[DONE] \n\n "
                ),
                dict(
                    role='HUMAN',
                    prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n'
                ),
                dict(
                    role='BOT',
                    prompt="[BEGIN]\n 'import math\ndef is_not_prime(n):\n if n == 1:\n return True\n for i in range(2, int(math.sqrt(n))+1):\n if n % i == 0:\n return True\n return False' \n[DONE] \n\n "
                ),
                dict(
                    role='HUMAN',
                    prompt='You are an expert Python programmer, and here is your task: Write a function to find the n largest integers from a given list of numbers, returned in descending order. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'
                ),
                dict(
                    role='BOT',
                    prompt="[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums: list,n: int) -> list:\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE] \n\n "
                ),
                dict(
                    role='HUMAN',
                    prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'
                ),
                dict(role='BOT', prompt='[BEGIN]\n'),
            ],
        )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

mbpp_plus_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator, metric='MBPPPlus'), pred_role='BOT')

mbpp_plus_datasets = [
    dict(
        type=MBPPPlusDataset,
        abbr='mbpp_plus',
        path='./data/mbpp_plus/mbpp_plus.jsonl',
        reader_cfg=mbpp_plus_reader_cfg,
        infer_cfg=mbpp_plus_infer_cfg,
        eval_cfg=mbpp_plus_eval_cfg)
]
opencompass/configs/datasets/nq_cn/nqcn_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .nqcn_gen_141737 import nqcn_datasets  # noqa: F401, F403
opencompass/configs/datasets/nq_cn/nqcn_gen_141737.py
ADDED
@@ -0,0 +1,34 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import NaturalQuestionDatasetCN, NQEvaluatorCN

nqcn_reader_cfg = dict(
    input_columns=['question'], output_column='answer', train_split='test'
)

nqcn_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='问题: {question}?\n答案是:'),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

nqcn_eval_cfg = dict(evaluator=dict(type=NQEvaluatorCN), pred_role='BOT')

nqcn_datasets = [
    dict(
        abbr='nq_cn',
        type=NaturalQuestionDatasetCN,
        path='./data/nq_cn',
        reader_cfg=nqcn_reader_cfg,
        infer_cfg=nqcn_infer_cfg,
        eval_cfg=nqcn_eval_cfg,
    )
]
opencompass/configs/datasets/race/race_cot_gen_d95929.py
ADDED
@@ -0,0 +1,68 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import RaceDataset
from opencompass.utils.text_postprocessors import (
    first_option_postprocess,
)

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

Article: {article}

Q: {question}

A. {A}
B. {B}
C. {C}
D. {D}
""".strip()

race_reader_cfg = dict(
    input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
    output_column='answer',
    train_split='validation',
    test_split='test',
)

race_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

race_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
    pred_role='BOT',
)

race_datasets = [
    dict(
        abbr='race-middle',
        type=RaceDataset,
        path='opencompass/race',
        name='middle',
        reader_cfg=race_reader_cfg,
        infer_cfg=race_infer_cfg,
        eval_cfg=race_eval_cfg,
    ),
    dict(
        abbr='race-high',
        type=RaceDataset,
        path='opencompass/race',
        name='high',
        reader_cfg=race_reader_cfg,
        infer_cfg=race_infer_cfg,
        eval_cfg=race_eval_cfg,
    ),
]