Add files using upload-large-folder tool
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py +4 -0
- opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py +43 -0
- opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py +43 -0
- opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl.py +4 -0
- opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_98dd6e.py +34 -0
- opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_ef69e7.py +50 -0
- opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_fdc6de.py +54 -0
- opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py +4 -0
- opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_28b223.py +51 -0
- opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py +51 -0
- opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_ppl.py +4 -0
- opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_ppl_769f8d.py +45 -0
- opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_ppl_841b62.py +41 -0
- opencompass/configs/datasets/Xsum/Xsum_gen.py +4 -0
- opencompass/configs/datasets/Xsum/Xsum_gen_31397e.py +39 -0
- opencompass/configs/datasets/Xsum/Xsum_gen_8ea5f8.py +30 -0
- opencompass/configs/datasets/calm/README.md +117 -0
- opencompass/configs/datasets/calm/calm.py +160 -0
- opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py +4 -0
- opencompass/configs/datasets/cvalues/cvalues_responsibility_gen_543378.py +37 -0
- opencompass/configs/datasets/dingo/dingo_gen.py +34 -0
- opencompass/configs/datasets/hellaswag/README.md +69 -0
- opencompass/configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py +58 -0
- opencompass/configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py +45 -0
- opencompass/configs/datasets/hellaswag/hellaswag_clean_ppl.py +35 -0
- opencompass/configs/datasets/hellaswag/hellaswag_gen.py +4 -0
- opencompass/configs/datasets/hellaswag/hellaswag_gen_6faab5.py +44 -0
- opencompass/configs/datasets/hellaswag/hellaswag_ppl.py +4 -0
- opencompass/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py +34 -0
- opencompass/configs/datasets/hellaswag/hellaswag_ppl_7d7f2d.py +33 -0
- opencompass/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py +34 -0
- opencompass/configs/datasets/hellaswag/hellaswag_ppl_a6e128.py +41 -0
- opencompass/configs/datasets/inference_ppl/README.md +26 -0
- opencompass/configs/datasets/inference_ppl/inference_ppl.py +38 -0
- opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py +4 -0
- opencompass/configs/datasets/iwslt2017/iwslt2017_gen_69ce16.py +32 -0
- opencompass/configs/datasets/iwslt2017/iwslt2017_gen_b4a814.py +41 -0
- opencompass/configs/datasets/iwslt2017/iwslt2017_gen_d0ebd1.py +39 -0
- opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py +62 -0
- opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py +62 -0
- opencompass/configs/datasets/math/README.md +69 -0
- opencompass/configs/datasets/math/deprecated_math_evaluatorv2_gen_265cce.py +38 -0
- opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py +78 -0
- opencompass/configs/datasets/math/math_agent_evaluatorv2_gen_0c1b4e.py +100 -0
- opencompass/configs/datasets/math/math_agent_gen_0c1b4e.py +99 -0
- opencompass/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py +56 -0
- opencompass/configs/datasets/math/math_gen.py +4 -0
- opencompass/configs/datasets/math/math_gen_0957ff.py +36 -0
- opencompass/configs/datasets/math/math_gen_265cce.py +36 -0
- opencompass/configs/datasets/math/math_gen_736506.py +28 -0
opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .CLUE_cmnli_gen_1abf97 import cmnli_datasets  # noqa: F401, F403
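These four-line `*_gen.py` files are aggregators: they re-export one pinned prompt variant under a stable name. As a minimal sketch of how such a dataset list is typically consumed in a top-level run config — the file name and model import below are hypothetical examples, not part of this diff:

```python
# Hypothetical top-level config (e.g. configs/eval_cmnli_demo.py); the model
# import is an assumed placeholder, only `cmnli_datasets` comes from this diff.
from mmengine.config import read_base

with read_base():
    from .datasets.CLUE_cmnli.CLUE_cmnli_gen import cmnli_datasets
    from .models.hf_internlm.hf_internlm2_chat_7b import models  # assumption

datasets = [*cmnli_datasets]
```

Such a config would then be launched with `python run.py configs/eval_cmnli_demo.py`.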
opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py
ADDED
@@ -0,0 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMNLIDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

cmnli_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

cmnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                '语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?\nA. 蕴含\nB. 矛盾\nC. 无关\n请从“A”,“B”,“C”中进行选择。\n答:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

cmnli_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

cmnli_datasets = [
    dict(
        abbr='cmnli',
        type=CMNLIDatasetV2,
        path='opencompass/cmnli-dev',
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg,
    )
]
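Here `first_capital_postprocess` reduces a free-form generation to a single letter before `AccEvaluator` compares it with the gold label. A minimal re-implementation sketch of that behavior, for illustration only (not the OpenCompass source):

```python
# Illustrative sketch: return the first uppercase ASCII-style letter found in
# the model output, or '' if none. This mirrors what a first-capital
# postprocessor does conceptually; details of the real function may differ.
def first_capital_postprocess_sketch(text: str) -> str:
    for ch in text:
        if ch.isupper():
            return ch
    return ''

assert first_capital_postprocess_sketch('答:A. 蕴含') == 'A'
assert first_capital_postprocess_sketch('the answer is B') == 'B'
```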
opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py
ADDED
@@ -0,0 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMNLIDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

cmnli_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

cmnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                '阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}\nA. 对\nB. 错\nC. 可能\n请从“A”,“B”,“C”中进行选择。\n答:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

cmnli_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

cmnli_datasets = [
    dict(
        abbr='cmnli',
        type=CMNLIDatasetV2,
        path='opencompass/cmnli-dev',
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg,
    )
]
opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .CLUE_cmnli_ppl_fdc6de import cmnli_datasets  # noqa: F401, F403
opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_98dd6e.py
ADDED
@@ -0,0 +1,34 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMNLIDataset

cmnli_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

cmnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:错',
            'entailment': '阅读文章:{sentence1}\n根据上文,回答如下问题: {sentence2}?\n答:对',
            'neutral': '如果{sentence1}为真,那么{sentence2}也为真吗?可能'
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

cmnli_datasets = [
    dict(
        abbr='cmnli',
        type=CMNLIDataset,
        path='opencompass/cmnli-dev',
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg)
]
opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_ef69e7.py
ADDED
@@ -0,0 +1,50 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMNLIDataset

cmnli_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

cmnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
                dict(role='BOT', prompt='错')
            ]),
            'entailment':
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt='阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?'),
                dict(role='BOT', prompt='对')
            ]),
            'neutral':
            dict(round=[
                dict(
                    role='HUMAN', prompt='如果{sentence1}为真,那么{sentence2}也为真吗?'),
                dict(role='BOT', prompt='可能')
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

cmnli_datasets = [
    dict(
        abbr='cmnli',
        type=CMNLIDataset,
        path='opencompass/cmnli-dev',
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg)
]
opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_ppl_fdc6de.py
ADDED
@@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMNLIDataset

cmnli_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

cmnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt='语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?'
                ),
                dict(role='BOT', prompt='矛盾')
            ]),
            'entailment':
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt='语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?'
                ),
                dict(role='BOT', prompt='蕴含')
            ]),
            'neutral':
            dict(round=[
                dict(
                    role='HUMAN',
                    prompt='语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?'
                ),
                dict(role='BOT', prompt='无关')
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

cmnli_datasets = [
    dict(
        abbr='cmnli',
        type=CMNLIDataset,
        path='opencompass/cmnli-dev',
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg)
]
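The `*_ppl_*.py` variants above all share one idea: instead of parsing generated text, `PPLInferencer` renders the prompt once per candidate label and predicts the label whose verbalized answer the model scores as most likely. A sketch of that selection rule, assuming a hypothetical `loss_per_token` helper that returns a model's average negative log-likelihood for a text (not an OpenCompass API):

```python
# Illustrative perplexity-based classification: lower loss on the rendered
# prompt+answer means the model "prefers" that label's verbalization.
def ppl_classify(sentence1: str, sentence2: str, loss_per_token) -> str:
    candidates = {'entailment': '蕴含', 'contradiction': '矛盾', 'neutral': '无关'}
    scores = {}
    for label, answer in candidates.items():
        text = (f'语句一:“{sentence1}”\n语句二:“{sentence2}”\n'
                f'请问这两句话是什么关系?{answer}')
        scores[label] = loss_per_token(text)  # hypothetical scoring helper
    return min(scores, key=scores.get)  # argmin over per-label losses
```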
opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .FewCLUE_csl_gen_28b223 import csl_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_28b223.py
ADDED
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CslDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

csl_reader_cfg = dict(
    input_columns=['abst', 'keywords'],
    output_column='label',
)

csl_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                '摘要是对论文内容不加注释和评论的简短陈述,要求扼要地说明研究工作的目的、研究方法和最终结论等。\n关键词是一篇学术论文的核心词汇,一般由一系列名词组成。关键词在全文中应有较高出现频率,且能起到帮助文献检索的作用。\n摘要:{abst}\n关键词:{keywords}\n请问上述关键词是否匹配摘要且符合要求?\nA. 否\nB. 是\n请从”A“,”B“中进行选择。\n答:'
            )
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

csl_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

csl_datasets = [
    dict(
        abbr='csl_dev',
        type=CslDatasetV2,
        path='./data/FewCLUE/csl/dev_few_all.json',
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg,
    ),
    dict(
        abbr='csl_test',
        type=CslDatasetV2,
        path='./data/FewCLUE/csl/test_public.json',
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg,
    ),
]
opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py
ADDED
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CslDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

csl_reader_cfg = dict(
    input_columns=['abst', 'keywords'],
    output_column='label',
)

csl_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                '摘要:{abst}\n关键词:{keywords}\n上述关键词出现在学术期刊中是否恰当?\nA. 否\nB. 是\n请从”A“,”B“中进行选择。\n答:'
            )
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

csl_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

csl_datasets = [
    dict(
        abbr='csl_dev',
        type=CslDatasetV2,
        path='./data/FewCLUE/csl/dev_few_all.json',
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg,
    ),
    dict(
        abbr='csl_test',
        type=CslDatasetV2,
        path='./data/FewCLUE/csl/test_public.json',
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg,
    ),
]
opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_ppl.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .FewCLUE_csl_ppl_841b62 import csl_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_ppl_769f8d.py
ADDED
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CslDataset

csl_reader_cfg = dict(
    input_columns=['abst', 'keywords'], output_column='label')

csl_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0:
            dict(round=[dict(role='HUMAN', prompt='摘要:{abst}')]),
            1:
            dict(round=[
                dict(role='HUMAN', prompt='摘要:{abst}\n关键词:{keywords}')
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

csl_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

csl_datasets = [
    dict(
        type=CslDataset,
        path='json',
        abbr='csl_dev',
        data_files='./data/FewCLUE/csl/dev_few_all.json',
        split='train',
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg),
    dict(
        type=CslDataset,
        path='json',
        abbr='csl_test',
        data_files='./data/FewCLUE/csl/test_public.json',
        split='train',
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg)
]
opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_ppl_841b62.py
ADDED
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CslDataset

csl_reader_cfg = dict(
    input_columns=['abst', 'keywords'], output_column='label')

csl_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0: '摘要:{abst}',
            1: '摘要:{abst}\n关键词:{keywords}'
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

csl_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

csl_datasets = [
    dict(
        type=CslDataset,
        path='json',
        abbr='csl_dev',
        data_files='./data/FewCLUE/csl/dev_few_all.json',
        split='train',
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg),
    dict(
        type=CslDataset,
        path='json',
        abbr='csl_test',
        data_files='./data/FewCLUE/csl/test_public.json',
        split='train',
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg)
]
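In both `csl_ppl` configs, the `path='json'`, `data_files=...`, and `split='train'` fields mirror the Hugging Face `datasets.load_dataset` call the loader presumably issues under the hood (the integer template keys 0/1 then match the dataset's numeric `label` values). A standalone equivalent of that load:

```python
# Assumes the `datasets` package is installed and the JSON file exists locally.
from datasets import load_dataset

csl_dev = load_dataset(
    'json',
    data_files='./data/FewCLUE/csl/dev_few_all.json',
    split='train',  # a single JSON file is exposed as its 'train' split
)
print(csl_dev.column_names)  # expect fields such as 'abst', 'keywords', 'label'
```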
opencompass/configs/datasets/Xsum/Xsum_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .Xsum_gen_31397e import Xsum_datasets  # noqa: F401, F403
opencompass/configs/datasets/Xsum/Xsum_gen_31397e.py
ADDED
@@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import RougeEvaluator
from opencompass.datasets import XsumDataset

Xsum_reader_cfg = dict(input_columns=['dialogue'], output_column='summary')

Xsum_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                'Document:{dialogue}\nBased on the previous text, provide a brief single summary:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

Xsum_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type='Xsum'),
)

Xsum_datasets = [
    dict(
        type=XsumDataset,
        abbr='Xsum',
        path='opencompass/xsum',
        reader_cfg=Xsum_reader_cfg,
        infer_cfg=Xsum_infer_cfg,
        eval_cfg=Xsum_eval_cfg,
    )
]
opencompass/configs/datasets/Xsum/Xsum_gen_8ea5f8.py
ADDED
@@ -0,0 +1,30 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import RougeEvaluator
from opencompass.datasets import XsumDataset, Xsum_postprocess

Xsum_reader_cfg = dict(input_columns=['dialogue'], output_column='summary')

Xsum_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Document:{dialogue}\n'
        'Based on the previous text, provide a brief single summary:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

Xsum_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_postprocessor=dict(type=Xsum_postprocess),
)

Xsum_datasets = [
    dict(
        type=XsumDataset,
        abbr='Xsum',
        path='opencompass/xsum',
        reader_cfg=Xsum_reader_cfg,
        infer_cfg=Xsum_infer_cfg,
        eval_cfg=Xsum_eval_cfg)
]
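Both Xsum configs score summaries with `RougeEvaluator`. To make concrete what ROUGE-style scoring rewards, here is a toy ROUGE-1 F1 based on unigram overlap; this is illustrative only — the real evaluator wraps a proper ROUGE implementation (stemming, ROUGE-2/L, and so on):

```python
# Toy ROUGE-1 F1: overlap of unigram multisets between prediction and reference.
from collections import Counter

def rouge1_f1(prediction: str, reference: str) -> float:
    pred, ref = prediction.lower().split(), reference.lower().split()
    overlap = sum((Counter(pred) & Counter(ref)).values())
    if not pred or not ref or not overlap:
        return 0.0
    precision, recall = overlap / len(pred), overlap / len(ref)
    return 2 * precision * recall / (precision + recall)

print(rouge1_f1('a brief single summary', 'a single brief summary'))  # 1.0
```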
opencompass/configs/datasets/calm/README.md
ADDED
@@ -0,0 +1,117 @@
# CaLM Lite
**CaLM Lite** is a lightweight version of CaLM.

**Ca**usal evaluation of **L**anguage **M**odels (CaLM), to the best of our knowledge, is the first comprehensive benchmark for evaluating the causal reasoning capabilities of language models. The CaLM framework establishes a foundational taxonomy consisting of four modules: causal target (i.e., what to evaluate), adaptation (i.e., how to obtain the results), metric (i.e., how to measure the results), and error (i.e., how to analyze the bad results).

<div align="center">

[🌐 Website](https://opencausalab.github.io/CaLM) |
[📃 Report](https://arxiv.org/abs/2405.00622) | [🎆 GitHub](https://github.com/OpenCausaLab/CaLM) | 📧 Welcome to join us by email at [email protected]
</div>

## Quick Start
### Data Preparation
Download the dataset to the `data/` folder.
```
wget https://github.com/OpenCausaLab/CaLM/releases/download/v1.0.0.lite/calm.zip
unzip calm.zip
```
### Run Model and Infer
To obtain a concise output with only the average information for all tasks, use:

```
python run.py --models YOUR_MODEL --datasets calm --summarizer calm
```

If you want detailed information for each task, use:

```
python run.py --models YOUR_MODEL --datasets calm
```

The `--summarizer calm` flag in the first command generates a summarized output; omitting it, as in the second command, yields task-specific details.
## Available Causal Tasks
We provide 92 tasks for causal evaluation, stored in the `data/calm` folder. For more information about our causal tasks, refer to [tasks](https://github.com/OpenCausaLab/CaLM/blob/main/documents/tasks.md).
The directory structure is:

```
├── calm
|   ├── association
|   ├── causal_discovery  # Rung of the causal ladder
|   │   ├── abstract_reasoning  # Causal scenario
|   │   │   ├── AR-B_CaLM-AR_CN.json  # Causal task
|   │   |   └── AR-B_CaLM-AR_EN.json  # Causal task
|   │   └── ...
|   └── ...
└── ...
```

## Dataset
- **Dataset size**: CaLM Lite uses a light dataset of **9,200** samples, while CaLM uses a significantly larger dataset of 126,334. The table below details the English dataset composition; the Chinese version is structured identically.
- **Dataset configuration**: We prioritize balance in our dataset for **binary classification** and **choice selection** questions. By ensuring an equal number of each GT label, we minimize the risk of introducing bias into the model's testing. For **probability calculation**, CaLM Lite pays extra attention to balancing the number of problems across different causal reasoning processes. (For more details on how the causal reasoning process is defined, please refer to Section 9.1.6 of the [paper](https://arxiv.org/abs/2405.00622).)
- **Efficient evaluation**: For enhanced evaluation efficiency, OpenCompass offers customizable methods. Refer to the [documentation](https://opencompass.org.cn/doc) for guidance on tailoring these methods to your needs.

| Causal ladder | Causal scenario | Subset | Question type | Mode | CaLM Lite | CaLM |
|---------------|-----------------|--------|---------------|------|-----------|------|
| Causal discovery | PCD | E-CARE | Binary classification | Natural | 100 | 2000 |
| Causal discovery | PCD | E-CARE | Choice selection | Natural | 100 | 1000 |
| Causal discovery | PCD | COPA | Binary classification | Natural | 100 | 2000 |
| Causal discovery | PCD | COPA | Choice selection | Natural | 100 | 1000 |
| Causal discovery | ECI | CTB | Binary classification | Natural | 100 | 596 |
| Causal discovery | ECI | ESC | Binary classification | Natural | 100 | 1000 |
| Causal discovery | ECI | MAVEN-ERE | Binary classification | Natural | 100 | 1000 |
| Causal discovery | AR | CaLM-AR | Binary classification | Symbolic | 100 | 1600 |
| Causal discovery | CA | FP | Binary classification | Symbolic | 100 | 1600 |
| Causal discovery | CA | FA | Binary classification | Symbolic | 100 | 1600 |
| Association | CORR | correlation | Binary classification | Natural | 100 | 1476 |
| Association | EAE | exp-away | Binary classification | Natural | 100 | 168 |
| Intervention | CB | collider-bias | Binary classification | Natural | 100 | 163 |
| Intervention | ATE | ATE-natural | Binary classification | Natural | 100 | 1600 |
| Intervention | ATE | ATE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | ATE | ATE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | CDE | CDE-natural | Binary classification | Natural | 100 | 1600 |
| Intervention | CDE | CDE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | CDE | CDE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Intervention | BAS | backadj | Binary classification | Natural | 100 | 227 |
| Intervention | BAS | max-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | BAS | min-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | BAS | mix-BAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | FAS | FAS | Choice selection | Symbolic | 100 | 1600 |
| Intervention | IV | CaLM-IV | Choice selection | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.2-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.4-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.6-UC | Binary classification | Symbolic | 100 | 1600 |
| Intervention | CEI | 0.8-UC | Binary classification | Symbolic | 100 | 1600 |
| Counterfactuals | ETT | ETT-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | ETT | ETT-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | ETT | ETT-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NDE | NDE-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | NDE | NDE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NDE | NDE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NIE | NIE-natural | Binary classification | Natural | 100 | 1600 |
| Counterfactuals | NIE | NIE-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | NIE | NIE-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PN | PN-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PN | PN-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PS | PS-basic | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | PS | PS-hard | Probability calculation | Mathematical | 100 | 1600 |
| Counterfactuals | AC | causal judgement | Binary classification | Natural | 100 | 187 |
| Counterfactuals | CR | CRASS | Choice selection | Natural | 100 | 274 |
| Counterfactuals | CR | det-counterfactual | Binary classification | Natural | 100 | 1476 |
| Counterfactuals | CEG | E-CARE | Open-ended generation | Natural | 100 | 1000 |
| **Total** | | | | | 4600 | 63167 |

## Available Prompt Styles (Adaptation)
Basic Prompt is our default setting for efficient evaluation of CaLM Lite, but we provide flexibility for exploring additional prompts through CaLM. If you'd like to explore and compare a wider range of prompts, we encourage you to use CaLM. We provide a comprehensive and easy-to-follow guide in our [repository](https://github.com/OpenCausaLab/CaLM).

## Citation
```
@misc{chen2024causal,
      title={Causal Evaluation of Language Models},
      author={Sirui Chen and Bo Peng and Meiqi Chen and Ruiqi Wang and Mengying Xu and Xingyu Zeng and Rui Zhao and Shengjie Zhao and Yu Qiao and Chaochao Lu},
      year={2024},
      eprint={2405.00622},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
```
opencompass/configs/datasets/calm/calm.py
ADDED
@@ -0,0 +1,160 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CaLMDataset, CaLMEvaluator

task_hiearchy_dict = {
    # association/
    # correlation/
    'CORR-B_correlation_CN': 'association/correlation/',
    'CORR-B_correlation_EN': 'association/correlation/',
    # explaining_away_effect/
    'EAE-B_exp-away_CN': 'association/explaining_away_effect/',
    'EAE-B_exp-away_EN': 'association/explaining_away_effect/',
    # causal_discovery/
    # abstract_reasoning/
    'AR-B_CaLM-AR_CN': 'causal_discovery/abstract_reasoning/',
    'AR-B_CaLM-AR_EN': 'causal_discovery/abstract_reasoning/',
    # causal_attribution/
    'CA-B_FA_CN': 'causal_discovery/causal_attribution/',
    'CA-B_FA_EN': 'causal_discovery/causal_attribution/',
    'CA-B_FP_CN': 'causal_discovery/causal_attribution/',
    'CA-B_FP_EN': 'causal_discovery/causal_attribution/',
    # event_causality_identification/
    'ECI-B_CTB_CN': 'causal_discovery/event_causality_identification/',
    'ECI-B_CTB_EN': 'causal_discovery/event_causality_identification/',
    'ECI-B_ESC_CN': 'causal_discovery/event_causality_identification/',
    'ECI-B_ESC_EN': 'causal_discovery/event_causality_identification/',
    'ECI-B_MAVEN-ERE_CN': 'causal_discovery/event_causality_identification/',
    'ECI-B_MAVEN-ERE_EN': 'causal_discovery/event_causality_identification/',
    # pairwise_causal_discovery/
    'PCD-B_COPA_CN': 'causal_discovery/pairwise_causal_discovery/',
    'PCD-B_COPA_EN': 'causal_discovery/pairwise_causal_discovery/',
    'PCD-B_E-CARE_CN': 'causal_discovery/pairwise_causal_discovery/',
    'PCD-B_E-CARE_EN': 'causal_discovery/pairwise_causal_discovery/',
    'PCD-C_COPA_CN': 'causal_discovery/pairwise_causal_discovery/',
    'PCD-C_COPA_EN': 'causal_discovery/pairwise_causal_discovery/',
    'PCD-C_E-CARE_CN': 'causal_discovery/pairwise_causal_discovery/',
    'PCD-C_E-CARE_EN': 'causal_discovery/pairwise_causal_discovery/',
    # counterfactual/
    # actual_causality/
    'AC-B_causal_judgement_CN': 'counterfactual/actual_causality/',
    'AC-B_causal_judgement_EN': 'counterfactual/actual_causality/',
    # causal_explanation_generation/
    'CEG-O_E-CARE_CN': 'counterfactual/causal_explanation_generation/',
    'CEG-O_E-CARE_EN': 'counterfactual/causal_explanation_generation/',
    # counterfactual_reasoning/
    'CR-B_det-counterfactual_CN': 'counterfactual/counterfactual_reasoning/',
    'CR-B_det-counterfactual_EN': 'counterfactual/counterfactual_reasoning/',
    'CR-C_CRASS_CN': 'counterfactual/counterfactual_reasoning/',
    'CR-C_CRASS_EN': 'counterfactual/counterfactual_reasoning/',
    # effect_of_the_treatment_on_the_treated/
    'ETT-B_ETT-natural_CN': 'counterfactual/effect_of_the_treatment_on_the_treated/',
    'ETT-B_ETT-natural_EN': 'counterfactual/effect_of_the_treatment_on_the_treated/',
    'ETT-P_ETT-basic_CN': 'counterfactual/effect_of_the_treatment_on_the_treated/',
    'ETT-P_ETT-basic_EN': 'counterfactual/effect_of_the_treatment_on_the_treated/',
    'ETT-P_ETT-hard_CN': 'counterfactual/effect_of_the_treatment_on_the_treated/',
    'ETT-P_ETT-hard_EN': 'counterfactual/effect_of_the_treatment_on_the_treated/',
    # natural_direct_effect/
    'NDE-B_NDE-natural_CN': 'counterfactual/natural_direct_effect/',
    'NDE-B_NDE-natural_EN': 'counterfactual/natural_direct_effect/',
    'NDE-P_NDE-basic_CN': 'counterfactual/natural_direct_effect/',
    'NDE-P_NDE-basic_EN': 'counterfactual/natural_direct_effect/',
    'NDE-P_NDE-hard_CN': 'counterfactual/natural_direct_effect/',
    'NDE-P_NDE-hard_EN': 'counterfactual/natural_direct_effect/',
    # natural_indirect_effect/
    'NIE-B_NIE-natural_CN': 'counterfactual/natural_indirect_effect/',
    'NIE-B_NIE-natural_EN': 'counterfactual/natural_indirect_effect/',
    'NIE-P_NIE-basic_CN': 'counterfactual/natural_indirect_effect/',
    'NIE-P_NIE-basic_EN': 'counterfactual/natural_indirect_effect/',
    'NIE-P_NIE-hard_CN': 'counterfactual/natural_indirect_effect/',
    'NIE-P_NIE-hard_EN': 'counterfactual/natural_indirect_effect/',
    # probability_of_necessity/
    'PN-P_PN-basic_CN': 'counterfactual/probability_of_necessity/',
    'PN-P_PN-basic_EN': 'counterfactual/probability_of_necessity/',
    'PN-P_PN-hard_CN': 'counterfactual/probability_of_necessity/',
    'PN-P_PN-hard_EN': 'counterfactual/probability_of_necessity/',
    # probability_of_sufficiency/
    'PS-P_PS-basic_CN': 'counterfactual/probability_of_sufficiency/',
    'PS-P_PS-basic_EN': 'counterfactual/probability_of_sufficiency/',
    'PS-P_PS-hard_CN': 'counterfactual/probability_of_sufficiency/',
    'PS-P_PS-hard_EN': 'counterfactual/probability_of_sufficiency/',
    # intervention/
    # average_treatment_effect/
    'ATE-B_ATE-natural_CN': 'intervention/average_treatment_effect/',
    'ATE-B_ATE-natural_EN': 'intervention/average_treatment_effect/',
    'ATE-P_ATE-basic_CN': 'intervention/average_treatment_effect/',
    'ATE-P_ATE-basic_EN': 'intervention/average_treatment_effect/',
    'ATE-P_ATE-hard_CN': 'intervention/average_treatment_effect/',
    'ATE-P_ATE-hard_EN': 'intervention/average_treatment_effect/',
    # backdoor_adjustment_set/
    'BAS-B_backadj_CN': 'intervention/backdoor_adjustment_set/',
    'BAS-B_backadj_EN': 'intervention/backdoor_adjustment_set/',
    'BAS-C_max-BAS_CN': 'intervention/backdoor_adjustment_set/',
    'BAS-C_max-BAS_EN': 'intervention/backdoor_adjustment_set/',
    'BAS-C_min-BAS_CN': 'intervention/backdoor_adjustment_set/',
    'BAS-C_min-BAS_EN': 'intervention/backdoor_adjustment_set/',
    'BAS-C_mix-BAS_CN': 'intervention/backdoor_adjustment_set/',
    'BAS-C_mix-BAS_EN': 'intervention/backdoor_adjustment_set/',
    # causal_effect_identification/
    'CEI-B_0.2-UC_CN': 'intervention/causal_effect_identification/',
    'CEI-B_0.2-UC_EN': 'intervention/causal_effect_identification/',
    'CEI-B_0.4-UC_CN': 'intervention/causal_effect_identification/',
    'CEI-B_0.4-UC_EN': 'intervention/causal_effect_identification/',
    'CEI-B_0.6-UC_CN': 'intervention/causal_effect_identification/',
    'CEI-B_0.6-UC_EN': 'intervention/causal_effect_identification/',
    'CEI-B_0.8-UC_CN': 'intervention/causal_effect_identification/',
    'CEI-B_0.8-UC_EN': 'intervention/causal_effect_identification/',
    # collider_bias/
    'CB-B_collider-bias_CN': 'intervention/collider_bias/',
    'CB-B_collider-bias_EN': 'intervention/collider_bias/',
    # controlled_direct_effect/
    'CDE-B_CDE-natural_CN': 'intervention/controlled_direct_effect/',
    'CDE-B_CDE-natural_EN': 'intervention/controlled_direct_effect/',
    'CDE-P_CDE-basic_CN': 'intervention/controlled_direct_effect/',
    'CDE-P_CDE-basic_EN': 'intervention/controlled_direct_effect/',
    'CDE-P_CDE-hard_CN': 'intervention/controlled_direct_effect/',
    'CDE-P_CDE-hard_EN': 'intervention/controlled_direct_effect/',
    # frontdoor_adjustment_set/
    'FAS-C_FAS_CN': 'intervention/frontdoor_adjustment_set/',
    'FAS-C_FAS_EN': 'intervention/frontdoor_adjustment_set/',
    # instrumental_variable/
    'IV-C_CaLM-IV_CN': 'intervention/instrumental_variable/',
    'IV-C_CaLM-IV_EN': 'intervention/instrumental_variable/',
}

calm_reader_cfg = dict(
    input_columns=['question'],
    output_column='gt_item')

calm_all_sets = list(set(key[:-3] for key in task_hiearchy_dict.keys()))

calm_datasets = []
for _name in calm_all_sets:
    for _prompt_style in ['basic', 'basic-CN']:
        _task_name = _name + ('_CN' if _prompt_style.endswith('-CN') else '_EN')
        _path = f'./data/calm/{task_hiearchy_dict[_task_name]}{_task_name}.json'

        calm_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template='{question}'),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=500))

        calm_eval_cfg = dict(evaluator=dict(
            type=CaLMEvaluator,
            core_metrics=True,
            error_analysis=True,
            prompt_style=_prompt_style,
            task=_task_name))
        calm_datasets.append(
            dict(
                abbr=f'calm_{_task_name}',
                type=CaLMDataset,
                path=_path,
                prompt_style=_prompt_style,
                reader_cfg=calm_reader_cfg,
                infer_cfg=calm_infer_cfg,
                eval_cfg=calm_eval_cfg)
        )
del _prompt_style, _task_name, _path, _name
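A worked example of the `calm_all_sets` derivation above: `key[:-3]` strips the trailing `_CN`/`_EN`, collapsing each bilingual task pair to one base name, which the inner loop then re-suffixes according to the prompt style:

```python
# Same transformation as in calm.py, on a tiny sample of keys.
keys = ['AR-B_CaLM-AR_CN', 'AR-B_CaLM-AR_EN', 'IV-C_CaLM-IV_EN']
assert sorted(set(k[:-3] for k in keys)) == ['AR-B_CaLM-AR', 'IV-C_CaLM-IV']
```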
opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .cvalues_responsibility_gen_543378 import cvalues_datasets  # noqa: F401, F403
opencompass/configs/datasets/cvalues/cvalues_responsibility_gen_543378.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CValuesDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

cvalues_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='label',
    train_split='train',
    test_split='train',
)

cvalues_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[dict(role='HUMAN', prompt='{prompt}请直接给出答案:\n')])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

cvalues_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

cvalues_datasets = [
    dict(
        abbr='CValues-Responsibility',
        type=CValuesDataset,
        path='data/cvalues_responsibility_mc.jsonl',
        reader_cfg=cvalues_reader_cfg,
        infer_cfg=cvalues_infer_cfg,
        eval_cfg=cvalues_eval_cfg)
]
opencompass/configs/datasets/dingo/dingo_gen.py
ADDED
@@ -0,0 +1,34 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DingoDataset, DingoEvaluator


dingo_paths = [
    './data/dingo/en_192.csv',
    './data/dingo/zh_170.csv',
]

dingo_datasets = []
for path in dingo_paths:
    dingo_reader_cfg = dict(input_columns='input', output_column=None)
    dingo_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[dict(role='HUMAN', prompt='{input}')])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )
    dingo_eval_cfg = dict(evaluator=dict(type=DingoEvaluator), pred_role='BOT')

    dingo_datasets.append(
        dict(
            abbr='dingo_' + path.split('/')[-1].split('.csv')[0],
            type=DingoDataset,
            path=path,
            reader_cfg=dingo_reader_cfg,
            infer_cfg=dingo_infer_cfg,
            eval_cfg=dingo_eval_cfg,
        ))

datasets = dingo_datasets
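The `abbr` of each dingo entry is derived from the CSV file name; a quick check of that expression:

```python
# Same abbr derivation as in dingo_gen.py, verified on the first path.
path = './data/dingo/en_192.csv'
assert 'dingo_' + path.split('/')[-1].split('.csv')[0] == 'dingo_en_192'
```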
opencompass/configs/datasets/hellaswag/README.md
ADDED
@@ -0,0 +1,69 @@
# HellaSwag

```bash
python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug
```

## Base Models

| model | hellaswag |
|:------------------------:|------------:|
| llama-7b-turbomind | 26.99 |
| llama-13b-turbomind | 34.21 |
| llama-30b-turbomind | 35.65 |
| llama-65b-turbomind | 44.63 |
| llama-2-7b-turbomind | 29.29 |
| llama-2-13b-turbomind | 45.06 |
| llama-2-70b-turbomind | 55.91 |
| llama-3-8b-turbomind | 50.86 |
| llama-3-70b-turbomind | 80.60 |
| internlm2-1.8b-turbomind | 44.86 |
| internlm2-7b-turbomind | 89.52 |
| internlm2-20b-turbomind | 91.41 |
| qwen-1.8b-turbomind | 38.04 |
| qwen-7b-turbomind | 64.62 |
| qwen-14b-turbomind | 85.88 |
| qwen-72b-turbomind | 90.40 |
| qwen1.5-0.5b-hf | 29.19 |
| qwen1.5-1.8b-hf | 42.32 |
| qwen1.5-4b-hf | 55.89 |
| qwen1.5-7b-hf | 68.51 |
| qwen1.5-14b-hf | 83.86 |
| qwen1.5-32b-hf | 87.28 |
| qwen1.5-72b-hf | 90.41 |
| qwen1.5-moe-a2-7b-hf | 72.42 |
| mistral-7b-v0.1-hf | 42.04 |
| mistral-7b-v0.2-hf | 46.24 |
| mixtral-8x7b-v0.1-hf | 66.22 |
| mixtral-8x22b-v0.1-hf | 79.66 |
| yi-6b-hf | 66.83 |
| yi-34b-hf | 83.83 |
| deepseek-7b-base-hf | 30.42 |
| deepseek-67b-base-hf | 70.75 |

## Chat Models

| model | hellaswag |
|:-----------------------------:|------------:|
| qwen1.5-0.5b-chat-hf | 29.60 |
| qwen1.5-1.8b-chat-hf | 41.71 |
| qwen1.5-4b-chat-hf | 60.45 |
| qwen1.5-7b-chat-hf | 71.58 |
| qwen1.5-14b-chat-hf | 79.70 |
| qwen1.5-32b-chat-hf | 88.56 |
| qwen1.5-72b-chat-hf | 89.37 |
| qwen1.5-110b-chat-hf | 91.11 |
| internlm2-chat-1.8b-hf | 60.47 |
| internlm2-chat-1.8b-sft-hf | 61.58 |
| internlm2-chat-7b-hf | 84.80 |
| internlm2-chat-7b-sft-hf | 85.21 |
| internlm2-chat-20b-hf | 88.48 |
| internlm2-chat-20b-sft-hf | 88.95 |
| llama-3-8b-instruct-hf | 74.39 |
| llama-3-70b-instruct-hf | 89.07 |
| llama-3-8b-instruct-lmdeploy | 73.31 |
| llama-3-70b-instruct-lmdeploy | 87.28 |
| mistral-7b-instruct-v0.1-hf | 53.00 |
| mistral-7b-instruct-v0.2-hf | 65.72 |
| mixtral-8x7b-instruct-v0.1-hf | 76.16 |
opencompass/configs/datasets/hellaswag/hellaswag_10shot_gen_e42710.py
ADDED
@@ -0,0 +1,58 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import HellaswagDatasetwithICE
from opencompass.utils.text_postprocessors import first_option_postprocess

hellaswag_reader_cfg = dict(
    input_columns=['ctx', 'A', 'B', 'C', 'D'],
    output_column='label',
    train_split='train',
    test_split='val',
)

hellaswag_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=f'{{ctx}}\nA) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}\nWhat is the right option?'),
                dict(role='BOT', prompt='{label}\n'),
            ]
        ),
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(role='HUMAN', prompt='Continue the following text without adding any additional information or formatting:\n'),
                '</E>',
            ],
            round=[
                dict(role='HUMAN', prompt=f'{{ctx}}\nA) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}\nWhat is the right option?'),
                dict(role='BOT', prompt='{label}\n'),
            ],
        ),
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=list(range(10))),
    inferencer=dict(type=GenInferencer),
)

hellaswag_eval_cfg = dict(
    evaluator=dict(type=AccwithDetailsEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

hellaswag_datasets = [
    dict(
        abbr='hellaswag',
        type=HellaswagDatasetwithICE,
        path='opencompass/hellaswag_ice',
        reader_cfg=hellaswag_reader_cfg,
        infer_cfg=hellaswag_infer_cfg,
        eval_cfg=hellaswag_eval_cfg,
    )
]
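The `</E>` marker is the `ice_token`: `FixKRetriever` renders the ten training examples named in `fix_id_list` with the `ice_template` and splices the result in where the token sits. An illustrative string-level sketch of that substitution (not the inferencer's actual code):

```python
# Toy illustration of ice_token splicing; the rendered_ice string is a
# stand-in for what the retriever would produce from ten fixed examples.
ice_token = '</E>'
prompt_template = ('Continue the following text without adding any additional '
                   'information or formatting:\n' + ice_token +
                   '{ctx}\nA) {A}\nB) {B}\nC) {C}\nD) {D}\nWhat is the right option?')

rendered_ice = 'Example ctx 1 ...\nA\nExample ctx 2 ...\nC\n'  # stand-in for 10 shots
final_prompt = prompt_template.replace(ice_token, rendered_ice)
print(final_prompt.splitlines()[0])  # instruction line, then in-context examples
```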
opencompass/configs/datasets/hellaswag/hellaswag_10shot_ppl_59c85e.py
ADDED
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import HellaswagDatasetwithICE
from opencompass.utils.text_postprocessors import first_capital_postprocess

hellaswag_reader_cfg = dict(
    input_columns=['ctx', 'A', 'B', 'C', 'D'],
    output_column='label',
    train_split='train',
    test_split='val',
)

hint = 'Continue the following text without adding any additional information or formatting:'
question_and_options = '{ctx}\nA) {A}\nB) {B}\nC) {C}\nD) {D}\nWhat is the right option?'
hellaswag_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template={answer: f'{question_and_options}\n{answer}\n' for answer in ['A', 'B', 'C', 'D']},
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template={answer: f'{hint}\n</E>{question_and_options}\n{answer}' for answer in ['A', 'B', 'C', 'D']},
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=list(range(10))),
    inferencer=dict(type=PPLInferencer),
)

hellaswag_eval_cfg = dict(
    evaluator=dict(type=AccwithDetailsEvaluator),
    pred_postprocessor=dict(type=first_capital_postprocess),
)

hellaswag_datasets = [
    dict(
        abbr='hellaswag',
        type=HellaswagDatasetwithICE,
        path='opencompass/hellaswag_ice',
        reader_cfg=hellaswag_reader_cfg,
        infer_cfg=hellaswag_infer_cfg,
        eval_cfg=hellaswag_eval_cfg,
    )
]
opencompass/configs/datasets/hellaswag/hellaswag_clean_ppl.py
ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import HellaswagDatasetClean as HellaswagDataset

hellaswag_reader_cfg = dict(
    input_columns=['ctx', 'A', 'B', 'C', 'D'],
    output_column='label')

hellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            i: dict(round=[
                dict(role='HUMAN', prompt='{ctx}'),
                dict(role='BOT', prompt=f"{{{chr(ord('A') + i)}}}"),
            ])
            for i in range(4)
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

hellaswag_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
                          analyze_contamination=True)

hellaswag_datasets = [
    dict(
        abbr='hellaswag',
        type=HellaswagDataset,
        path='opencompass/hellaswag',
        reader_cfg=hellaswag_reader_cfg,
        infer_cfg=hellaswag_infer_cfg,
        eval_cfg=hellaswag_eval_cfg)
]
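The `_clean_ppl` variant adds `analyze_contamination=True`, splitting accuracy by whether a test item is judged contaminated. As a toy illustration of the underlying idea only — the real `AccContaminationEvaluator` works from per-sample contamination annotations rather than an ad-hoc check like this:

```python
# Toy contamination heuristic: fraction of a sample's n-grams that also
# appear verbatim in a (pre)training corpus string. Illustrative only.
def ngram_overlap(sample: str, corpus: str, n: int = 8) -> float:
    tokens = sample.split()
    ngrams = [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    if not ngrams:
        return 0.0
    return sum(g in corpus for g in ngrams) / len(ngrams)
```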
opencompass/configs/datasets/hellaswag/hellaswag_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .hellaswag_gen_6faab5 import hellaswag_datasets  # noqa: F401, F403
opencompass/configs/datasets/hellaswag/hellaswag_gen_6faab5.py
ADDED
@@ -0,0 +1,44 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HellaswagDataset_V2
from opencompass.utils.text_postprocessors import first_option_postprocess

hellaswag_reader_cfg = dict(
    input_columns=['ctx', 'A', 'B', 'C', 'D'],
    output_column='label',
)

hellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=('{ctx}\nQuestion: Which ending makes the most sense?\n'
                        'A. {A}\nB. {B}\nC. {C}\nD. {D}\n'
                        "You may choose from 'A', 'B', 'C', 'D'.\n"
                        'Answer:'),
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

hellaswag_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

hellaswag_datasets = [
    dict(
        abbr='hellaswag',
        type=HellaswagDataset_V2,
        path='opencompass/hellaswag',
        reader_cfg=hellaswag_reader_cfg,
        infer_cfg=hellaswag_infer_cfg,
        eval_cfg=hellaswag_eval_cfg)
]
opencompass/configs/datasets/hellaswag/hellaswag_ppl.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .hellaswag_ppl_47bff9 import hellaswag_datasets  # noqa: F401, F403
opencompass/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py
ADDED
@@ -0,0 +1,34 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HellaswagDataset

hellaswag_reader_cfg = dict(
    input_columns=['ctx', 'A', 'B', 'C', 'D'],
    output_column='label')

hellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            i: dict(round=[
                dict(role='HUMAN', prompt='{ctx}'),
                dict(role='BOT', prompt=f"{{{chr(ord('A') + i)}}}"),
            ])
            for i in range(4)
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

hellaswag_datasets = [
    dict(
        abbr='hellaswag',
        type=HellaswagDataset,
        path='opencompass/hellaswag',
        reader_cfg=hellaswag_reader_cfg,
        infer_cfg=hellaswag_infer_cfg,
        eval_cfg=hellaswag_eval_cfg)
]
opencompass/configs/datasets/hellaswag/hellaswag_ppl_7d7f2d.py
ADDED
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HellaswagDataset_V2

hellaswag_reader_cfg = dict(
    input_columns=['ctx', 'A', 'B', 'C', 'D'],  # the prompt below consumes 'ctx', not 'query'
    output_column='label')

hellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            ans: dict(round=[
                dict(role='HUMAN', prompt='{ctx}\nQuestion: Which ending makes the most sense?\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '),
                dict(role='BOT', prompt=f'{ans}'),
            ]) for ans in ['A', 'B', 'C', 'D']
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

hellaswag_datasets = [
    dict(
        abbr='hellaswag',
        type=HellaswagDataset_V2,
        path='opencompass/hellaswag',
        reader_cfg=hellaswag_reader_cfg,
        infer_cfg=hellaswag_infer_cfg,
        eval_cfg=hellaswag_eval_cfg)
]
opencompass/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py
ADDED
@@ -0,0 +1,34 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HellaswagDataset

hellaswag_reader_cfg = dict(
    input_columns=['ctx', 'A', 'B', 'C', 'D'],
    output_column='label'
)

hellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0: '{ctx} {A}',
            1: '{ctx} {B}',
            2: '{ctx} {C}',
            3: '{ctx} {D}',
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

hellaswag_datasets = [
    dict(
        abbr='hellaswag',
        type=HellaswagDataset,
        path='opencompass/hellaswag',
        reader_cfg=hellaswag_reader_cfg,
        infer_cfg=hellaswag_infer_cfg,
        eval_cfg=hellaswag_eval_cfg)
]
opencompass/configs/datasets/hellaswag/hellaswag_ppl_a6e128.py
ADDED
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HellaswagDataset_V3

hellaswag_reader_cfg = dict(
    input_columns=['query', 'A', 'B', 'C', 'D'],
    output_column='gold')

hellaswag_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            '0': dict(
                round=[dict(role='HUMAN', prompt='{query} {A}')]
            ),
            '1': dict(
                round=[dict(role='HUMAN', prompt='{query} {B}')]
            ),
            '2': dict(
                round=[dict(role='HUMAN', prompt='{query} {C}')]
            ),
            '3': dict(
                round=[dict(role='HUMAN', prompt='{query} {D}')]
            ),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

hellaswag_datasets = [
    dict(
        abbr='hellaswag',
        type=HellaswagDataset_V3,
        path='opencompass/hellaswag',
        reader_cfg=hellaswag_reader_cfg,
        infer_cfg=hellaswag_infer_cfg,
        eval_cfg=hellaswag_eval_cfg)
]
opencompass/configs/datasets/inference_ppl/README.md
ADDED
@@ -0,0 +1,26 @@
# Inference-PPL Datasets

- **Description**: Computes the loss only on pre-labeled positions; especially useful for reasoning corpora.
- **Datasets**: cn-reasoning-val.jsonl (an example dataset; inference-ppl generalizes to other corpora).

# PPL Computation

$$ \text{ppl} = - \frac{1}{n} \sum_{i=1}^{n} \sum_{c=1}^{V} y_{i,c} \log p_{i,c} \tag{1} $$

where Eq. (1) is the standard mean PPL score over $n$ tokens with vocabulary size $V$; for inference-ppl, the average is taken only over the pre-labeled positions.
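To make the masked averaging concrete, here is a minimal PyTorch-style sketch (an illustration only, not the OpenCompass implementation; the function name and tensor shapes are assumptions):

```python
import torch
import torch.nn.functional as F

def inference_ppl(logits: torch.Tensor, targets: torch.Tensor,
                  label_mask: torch.Tensor) -> torch.Tensor:
    """Mean negative log-likelihood over pre-labeled positions only.

    logits:     (seq_len, vocab_size) model outputs
    targets:    (seq_len,) gold token ids
    label_mask: (seq_len,) 1.0 at labeled positions, 0.0 elsewhere
    """
    # Per-token cross-entropy, kept unreduced so it can be masked
    nll = F.cross_entropy(logits, targets, reduction='none')
    # Average only where the mask selects labeled positions
    return (nll * label_mask).sum() / label_mask.sum()
```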
# Quick Start

```shell
cd opencompass
python run.py configs/eval_inference_ppl.py
```

# Some results

| Model | Result |
| ----------- | ----------- |
| Qwen1.5-7b | 0.59 |
| Qwen1.5-14b | 0.54 |
| Llama2-7b | 0.49 |
| Llama2-13b | 0.43 |
opencompass/configs/datasets/inference_ppl/inference_ppl.py
ADDED
@@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import InferencePPLOnlyInferencer
from opencompass.openicl.icl_evaluator import AverageInferencePPLEvaluator

from opencompass.datasets import InferencePPLDataset

# Build InferencePPLDataset
inference_ppl_datasets = []

llm_cmp_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{text}',
    ),
    # No in-context example, using ZeroRetriever
    retriever=dict(type=ZeroRetriever),
    # compute inference-ppl
    inferencer=dict(type=InferencePPLOnlyInferencer),
)

# Average the inference-ppl scores
llm_cmp_eval_cfg = dict(evaluator=dict(type=AverageInferencePPLEvaluator))

inference_ppl_datasets.append(
    dict(
        abbr='inference-ppl',
        type=InferencePPLDataset,
        path='./data/inference_ppl',
        name='cn-reasoning-val',
        samples=None,  # None evaluates the full set; set a small number for quick testing
        reader_cfg=dict(
            input_columns=['text'],
            output_column=None,
        ),
        infer_cfg=llm_cmp_infer_cfg,
        eval_cfg=llm_cmp_eval_cfg,
    ))
opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .iwslt2017_gen_d0ebd1 import iwslt2017_datasets  # noqa: F401, F403
opencompass/configs/datasets/iwslt2017/iwslt2017_gen_69ce16.py
ADDED
@@ -0,0 +1,32 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import BM25Retriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import BleuEvaluator
from opencompass.datasets import IWSLT2017Dataset
from opencompass.utils.text_postprocessors import general_cn_postprocess

iwslt2017_reader_cfg = dict(
    input_columns='en', output_column='de', train_split='validation')

iwslt2017_infer_cfg = dict(
    ice_template=dict(type=PromptTemplate,
                      template='</E>{en} = {de}',
                      ice_token='</E>'),
    retriever=dict(type=BM25Retriever, ice_num=1),
    inferencer=dict(type=GenInferencer))

iwslt2017_eval_cfg = dict(
    evaluator=dict(type=BleuEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=general_cn_postprocess),
    dataset_postprocessor=dict(type=general_cn_postprocess))

iwslt2017_datasets = [
    dict(
        type=IWSLT2017Dataset,
        path='iwslt2017',
        name='iwslt2017-en-de',
        reader_cfg=iwslt2017_reader_cfg,
        infer_cfg=iwslt2017_infer_cfg,
        eval_cfg=iwslt2017_eval_cfg)
]
opencompass/configs/datasets/iwslt2017/iwslt2017_gen_b4a814.py
ADDED
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import BM25Retriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import BleuEvaluator
from opencompass.datasets import IWSLT2017Dataset
from opencompass.utils.text_postprocessors import general_cn_postprocess

iwslt2017_reader_cfg = dict(
    input_columns='en', output_column='de', train_split='validation')

iwslt2017_infer_cfg = dict(
    ice_template=dict(type=PromptTemplate,
                      template=dict(
                          begin=[
                              dict(role='SYSTEM', fallback_role='HUMAN', prompt='Please translate the following English statements to German:'),
                              '</E>',
                          ],
                          round=[
                              dict(role='HUMAN', prompt='{en}'),
                              dict(role='BOT', prompt='{de}'),
                          ]
                      ),
                      ice_token='</E>'),
    retriever=dict(type=BM25Retriever, ice_num=1),
    inferencer=dict(type=GenInferencer))

iwslt2017_eval_cfg = dict(
    evaluator=dict(type=BleuEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=general_cn_postprocess),
    dataset_postprocessor=dict(type=general_cn_postprocess))

iwslt2017_datasets = [
    dict(
        type=IWSLT2017Dataset,
        path='iwslt2017',
        name='iwslt2017-en-de',
        reader_cfg=iwslt2017_reader_cfg,
        infer_cfg=iwslt2017_infer_cfg,
        eval_cfg=iwslt2017_eval_cfg)
]
opencompass/configs/datasets/iwslt2017/iwslt2017_gen_d0ebd1.py
ADDED
@@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import BM25Retriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import BleuEvaluator
from opencompass.datasets import IWSLT2017Dataset
from opencompass.utils.text_postprocessors import general_cn_postprocess

iwslt2017_reader_cfg = dict(
    input_columns='en', output_column='de', train_split='validation')

iwslt2017_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(role='HUMAN', prompt='Please translate the following English statements to German:\n{en}'),
                dict(role='BOT', prompt='{de}'),
            ]
        ),
        ice_token='</E>'),
    retriever=dict(type=BM25Retriever, ice_num=1),
    inferencer=dict(type=GenInferencer))

iwslt2017_eval_cfg = dict(
    evaluator=dict(type=BleuEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=general_cn_postprocess),
    dataset_postprocessor=dict(type=general_cn_postprocess))

iwslt2017_datasets = [
    dict(
        type=IWSLT2017Dataset,
        path='iwslt2017',
        name='iwslt2017-en-de',
        reader_cfg=iwslt2017_reader_cfg,
        infer_cfg=iwslt2017_infer_cfg,
        eval_cfg=iwslt2017_eval_cfg)
]
opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py
ADDED
@@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LawBenchDataset

names = [
    ['1-1', 'article_recitation'],
    ['1-2', 'knowledge_question_answering'],
    ['2-1', 'document_proofreading'],
    ['2-2', 'dispute_focus_identification'],
    ['2-3', 'marital_disputes_identification'],
    ['2-4', 'issue_topic_identification'],
    ['2-5', 'reading_comprehension'],
    ['2-6', 'named_entity_recognition'],
    ['2-7', 'opinion_summarization'],
    ['2-8', 'argument_mining'],
    ['2-9', 'event_detection'],
    ['2-10', 'trigger_word_extraction'],
    ['3-1', 'fact_based_article_prediction'],
    ['3-2', 'scene_based_article_prediction'],
    ['3-3', 'charge_prediction'],
    ['3-4', 'prison_term_prediction_wo_article'],
    ['3-5', 'prison_term_prediction_w_article'],
    ['3-6', 'case_analysis'],
    ['3-7', 'criminal_damages_calculation'],
    ['3-8', 'consultation'],
]

lawbench_datasets = []
for index, name in names:
    lawbench_reader_cfg = dict(
        input_columns=['instruction', 'question'],
        output_column='answer')

    lawbench_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt='{instruction}\n{question}'),
                ]
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024)
    )

    lawbench_eval_cfg = dict(
        evaluator=dict(type='LawBenchEvaluator_' + index.replace('-', '_'))
    )

    lawbench_datasets.append(
        dict(
            abbr='lawbench-' + index + '-' + name + '-1-shot',
            type=LawBenchDataset,
            path='./data/lawbench/one_shot',
            index=index,
            reader_cfg=lawbench_reader_cfg,
            infer_cfg=lawbench_infer_cfg,
            eval_cfg=lawbench_eval_cfg
        )
    )
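For reference, the loop above expands into one dataset entry per LawBench task; a quick sketch (plain Python, mirroring the string-building in the config above) of the names generated for the first task:

```python
# Sketch: values produced for the first entry of `names` in the config above
index, name = '1-1', 'article_recitation'
abbr = 'lawbench-' + index + '-' + name + '-1-shot'
evaluator_type = 'LawBenchEvaluator_' + index.replace('-', '_')
assert abbr == 'lawbench-1-1-article_recitation-1-shot'
assert evaluator_type == 'LawBenchEvaluator_1_1'
```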
opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py
ADDED
@@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import LawBenchDataset

names = [
    ['1-1', 'article_recitation'],
    ['1-2', 'knowledge_question_answering'],
    ['2-1', 'document_proofreading'],
    ['2-2', 'dispute_focus_identification'],
    ['2-3', 'marital_disputes_identification'],
    ['2-4', 'issue_topic_identification'],
    ['2-5', 'reading_comprehension'],
    ['2-6', 'named_entity_recognition'],
    ['2-7', 'opinion_summarization'],
    ['2-8', 'argument_mining'],
    ['2-9', 'event_detection'],
    ['2-10', 'trigger_word_extraction'],
    ['3-1', 'fact_based_article_prediction'],
    ['3-2', 'scene_based_article_prediction'],
    ['3-3', 'charge_prediction'],
    ['3-4', 'prison_term_prediction_wo_article'],
    ['3-5', 'prison_term_prediction_w_article'],
    ['3-6', 'case_analysis'],
    ['3-7', 'criminal_damages_calculation'],
    ['3-8', 'consultation'],
]

lawbench_datasets = []
for index, name in names:
    lawbench_reader_cfg = dict(
        input_columns=['instruction', 'question'],
        output_column='answer')

    lawbench_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt='{instruction}\n{question}'),
                ]
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024)
    )

    lawbench_eval_cfg = dict(
        evaluator=dict(type='LawBenchEvaluator_' + index.replace('-', '_'))
    )

    lawbench_datasets.append(
        dict(
            abbr='lawbench-' + index + '-' + name + '-0-shot',
            type=LawBenchDataset,
            path='./data/lawbench/zero_shot',
            index=index,
            reader_cfg=lawbench_reader_cfg,
            infer_cfg=lawbench_infer_cfg,
            eval_cfg=lawbench_eval_cfg
        )
    )
opencompass/configs/datasets/math/README.md
ADDED
@@ -0,0 +1,69 @@
# MATH

```bash
python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug
python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug
```

## Base Models

|          model           |  math  |
|:------------------------:|-------:|
| llama-7b-turbomind       |   2.94 |
| llama-13b-turbomind      |   3.84 |
| llama-30b-turbomind      |   6.54 |
| llama-65b-turbomind      |  10.66 |
| llama-2-7b-turbomind     |   3.58 |
| llama-2-13b-turbomind    |   5.30 |
| llama-2-70b-turbomind    |  13.26 |
| llama-3-8b-turbomind     |  16.42 |
| llama-3-70b-turbomind    |  39.64 |
| internlm2-1.8b-turbomind |   9.42 |
| internlm2-7b-turbomind   |  25.16 |
| internlm2-20b-turbomind  |  32.24 |
| qwen-1.8b-turbomind      |   6.30 |
| qwen-7b-turbomind        |  15.56 |
| qwen-14b-turbomind       |  30.38 |
| qwen-72b-turbomind       |  44.18 |
| qwen1.5-0.5b-hf          |   4.16 |
| qwen1.5-1.8b-hf          |  11.32 |
| qwen1.5-4b-hf            |  17.50 |
| qwen1.5-7b-hf            |  17.34 |
| qwen1.5-14b-hf           |  36.18 |
| qwen1.5-32b-hf           |  45.74 |
| qwen1.5-72b-hf           |  41.56 |
| qwen1.5-moe-a2-7b-hf     |  27.96 |
| mistral-7b-v0.1-hf       |  13.44 |
| mistral-7b-v0.2-hf       |  12.74 |
| mixtral-8x7b-v0.1-hf     |  29.46 |
| mixtral-8x22b-v0.1-hf    |  41.82 |
| yi-6b-hf                 |   6.60 |
| yi-34b-hf                |  18.80 |
| deepseek-7b-base-hf      |   4.66 |
| deepseek-67b-base-hf     |  18.76 |

## Chat Models

|             model             |  math  |
|:-----------------------------:|-------:|
| qwen1.5-0.5b-chat-hf          |   0.56 |
| qwen1.5-1.8b-chat-hf          |   4.94 |
| qwen1.5-4b-chat-hf            |   7.34 |
| qwen1.5-7b-chat-hf            |  22.14 |
| qwen1.5-14b-chat-hf           |  32.22 |
| qwen1.5-32b-chat-hf           |  41.80 |
| qwen1.5-72b-chat-hf           |  45.22 |
| qwen1.5-110b-chat-hf          |  54.38 |
| internlm2-chat-1.8b-hf        |  14.06 |
| internlm2-chat-1.8b-sft-hf    |  13.10 |
| internlm2-chat-7b-hf          |  28.08 |
| internlm2-chat-7b-sft-hf      |  27.60 |
| internlm2-chat-20b-hf         |  34.68 |
| internlm2-chat-20b-sft-hf     |  32.54 |
| llama-3-8b-instruct-hf        |  27.50 |
| llama-3-70b-instruct-hf       |  47.52 |
| llama-3-8b-instruct-lmdeploy  |  27.42 |
| llama-3-70b-instruct-lmdeploy |  46.90 |
| mistral-7b-instruct-v0.1-hf   |   8.48 |
| mistral-7b-instruct-v0.2-hf   |  10.82 |
| mixtral-8x7b-instruct-v0.1-hf |  27.02 |
opencompass/configs/datasets/math/deprecated_math_evaluatorv2_gen_265cce.py
ADDED
@@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'),
            dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'),
            dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'),
            dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'),
            dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='./data/math/math.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
opencompass/configs/datasets/math/math_0shot_llm_judge_gen_393424.py
ADDED
@@ -0,0 +1,78 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator
from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess
from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE

# ----------------------------- Eval Parameters -----------------------------
## Postprocess function
post_func = 're'  # 're', 'xfinder_model', 'naive_model'

## Evaluate function
eval_func = 'naive_model'  # 're', 'naive_model'

## Model API url
xfinder_url = 'http://0.0.0.0:23333/v1'  # for 'xFinder-qwen1505' if post_func is 'xfinder_model'
naive_model_name = 'Qwen/Qwen2.5-72B-Instruct'  # replace with your model name
naive_model_url = ['http://22.8.6.22:23333/v1', 'http://22.8.67.84:23333/v1', 'http://22.8.72.81:23333/v1', 'http://22.9.42.143:23333/v1']  # multiple APIs for acceleration

# ----------------------------- Detailed Config -----------------------------

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

if post_func == 're':
    pred_postprocessor = dict(type=math_postprocess_v2)
elif post_func == 'xfinder_model':
    pred_postprocessor = dict(
        type=xfinder_postprocess,
        question_type='math',
        model_name='xFinder-qwen1505',
        num_processes=128,
        api_url=xfinder_url,
    )
elif post_func == 'naive_model':
    pred_postprocessor = dict(
        type=naive_model_postprocess,
        custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE,
        model_name=naive_model_name,
        num_processes=64,
        api_url=naive_model_url,
    )

if eval_func == 're':
    evaluator = dict(type=MATHEvaluator, version='v2')
elif eval_func == 'naive_model':
    evaluator = dict(
        type=GaoKaoMATHEvaluator,
        model_name=naive_model_name,
        url=naive_model_url,
    )

math_eval_cfg = dict(
    evaluator=evaluator, pred_postprocessor=pred_postprocessor,
)

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
opencompass/configs/datasets/math/math_agent_evaluatorv2_gen_0c1b4e.py
ADDED
@@ -0,0 +1,100 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
    MATHDataset, MATHAgentEvaluator, math_postprocess_v2
)

# uses the PAL format, but does not perform well
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # ################################### NEW SHOT ###################################
                dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
from sympy import symbols, simplify

def solution():
    x = symbols('x')
    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
    simplified_expr = simplify(expr)

    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
    result = x3_coefficient
    return result
```"""),
                dict(role='SYSTEM', prompt='Response:26'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
                dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
import math

def solution():
    radius = 6

    # Surface area of the hemisphere
    hemisphere_area = 2 * math.pi * radius**2

    # Area of the circular base
    base_area = math.pi * radius**2

    # Total surface area
    total_surface_area = hemisphere_area + base_area

    # Formatting the result in LaTeX
    result = r'{}\pi'.format(total_surface_area / math.pi)
    return result
```"""),
                dict(role='SYSTEM', prompt='Response:108.0\\pi'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
                dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
def solution():
    # Probabilities of each outcome
    prime_prob = 1 / 6
    composite_prob = 1 / 3
    otherwise_prob = 1 / 6

    # Expected value of each outcome
    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
    composite_expected_value = 0 * composite_prob
    otherwise_expected_value = -3 * otherwise_prob

    # Total expected value
    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value

    # Dollar value to the nearest cent
    result = "{:.2f}".format(total_expected_value)
    return result
```"""),
                dict(role='SYSTEM', prompt='Response:1.17'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
                dict(role='HUMAN', prompt='{problem}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer),
)

math_eval_cfg = dict(
    evaluator=dict(
        type=MATHAgentEvaluator,
        version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        abbr='math-agent',
        type=MATHDataset,
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
opencompass/configs/datasets/math/math_agent_gen_0c1b4e.py
ADDED
@@ -0,0 +1,99 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
    MATHDataset, MATHAgentEvaluator, math_postprocess
)

# uses the PAL format, but does not perform well
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # ################################### NEW SHOT ###################################
                dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplifie.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
from sympy import symbols, simplify

def solution():
    x = symbols('x')
    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
    simplified_expr = simplify(expr)

    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
    result = x3_coefficient
    return result
```"""),
                dict(role='SYSTEM', prompt='Response:26'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
                dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
import math

def solution():
    radius = 6

    # Surface area of the hemisphere
    hemisphere_area = 2 * math.pi * radius**2

    # Area of the circular base
    base_area = math.pi * radius**2

    # Total surface area
    total_surface_area = hemisphere_area + base_area

    # Formatting the result in LaTeX
    result = r'{}\pi'.format(total_surface_area / math.pi)
    return result
```"""),
                dict(role='SYSTEM', prompt='Response:108.0\\pi'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
                dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
def solution():
    # Probabilities of each outcome
    prime_prob = 1 / 6
    composite_prob = 1 / 3
    otherwise_prob = 1 / 6

    # Expected value of each outcome
    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
    composite_expected_value = 0 * composite_prob
    otherwise_expected_value = -3 * otherwise_prob

    # Total expected value
    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value

    # Dollar value to the nearest cent
    result = "{:.2f}".format(total_expected_value)
    return result
```"""),
                dict(role='SYSTEM', prompt='Response:1.17'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
                dict(role='HUMAN', prompt='{problem}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer),
)

math_eval_cfg = dict(
    evaluator=dict(type=MATHAgentEvaluator),
    pred_postprocessor=dict(type=math_postprocess),
)

math_datasets = [
    dict(
        abbr='math-agent',
        type=MATHDataset,
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
opencompass/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py
ADDED
@@ -0,0 +1,56 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""\
Problem:
Find the domain of the expression $\\frac{{\\sqrt{{x-2}}}}{{\\sqrt{{5-x}}}}$.
Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.

Problem:
If $\\det \\mathbf{{A}} = 2$ and $\\det \\mathbf{{B}} = 12,$ then find $\\det (\\mathbf{{A}} \\mathbf{{B}}).$
Solution:
We have that $\\det (\\mathbf{{A}} \\mathbf{{B}}) = (\\det \\mathbf{{A}})(\\det \\mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$
Final Answer: The final answer is $24$. I hope it is correct.

Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \\Rightarrow\\qquad n&=480/30=\\boxed{{16}} \\end{{align*}}
Final Answer: The final answer is $16$. I hope it is correct.

Problem:
If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.
Solution:
If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$
Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.

Problem:
{problem}
Solution:"""
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Problem']))

# postprocess v2
math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
opencompass/configs/datasets/math/math_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .math_gen_265cce import math_datasets  # noqa: F401, F403
opencompass/configs/datasets/math/math_gen_0957ff.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{\sqrt{x-2}}{\sqrt{5-x}}$.}\nSolution:'),
            dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12,$ then find $\det (\mathbf{A} \mathbf{B}).$\nSolution:'),
            dict(role='BOT', prompt='We have that $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'),
            dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{align*} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{16} \end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{align*} 6x-4y&=a,\\\\ 6y-9x &=b. \end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.\nSolution:'),
            dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
opencompass/configs/datasets/math/math_gen_265cce.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'),
            dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'),
            dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'),
            dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'),
            dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'),
            dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
opencompass/configs/datasets/math/math_gen_736506.py
ADDED
@@ -0,0 +1,28 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHInternDataset, MATHInternEvaluator, math_intern_postprocess

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt="Question: {problem}\nLet's think step by step\nAnswer:")
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

math_eval_cfg = dict(
    evaluator=dict(type=MATHInternEvaluator), pred_postprocessor=dict(type=math_intern_postprocess))

math_datasets = [
    dict(
        type=MATHInternDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]