Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- opencompass/configs/datasets/ARC_c/ARC_c_clean_ppl.py +55 -0
- opencompass/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py +53 -0
- opencompass/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py +48 -0
- opencompass/configs/datasets/ARC_c/ARC_c_few_shot_ppl.py +63 -0
- opencompass/configs/datasets/ARC_c/ARC_c_gen.py +4 -0
- opencompass/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py +44 -0
- opencompass/configs/datasets/ARC_c/ARC_c_ppl.py +4 -0
- opencompass/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py +37 -0
- opencompass/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py +54 -0
- opencompass/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py +36 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py +4 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py +51 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl.py +4 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py +45 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_acccb5.py +39 -0
- opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py +4 -0
- opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py +29 -0
- opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_30dea0.py +42 -0
- opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py +35 -0
- opencompass/configs/datasets/humaneval/README.md +69 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py +36 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py +36 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py +36 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py +33 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py +31 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py +41 -0
- opencompass/configs/datasets/humaneval/humaneval_gen.py +4 -0
- opencompass/configs/datasets/humaneval/humaneval_gen_66a7f4.py +35 -0
- opencompass/configs/datasets/humaneval/humaneval_gen_8e312c.py +37 -0
- opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py +36 -0
- opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py +36 -0
- opencompass/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py +37 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py +4 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py +35 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py +37 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py +36 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py +37 -0
- opencompass/configs/datasets/mmlu/README.md +368 -0
- opencompass/configs/datasets/mmlu/mmlu_all_sets.py +59 -0
- opencompass/configs/datasets/mmlu/mmlu_clean_ppl.py +114 -0
- opencompass/configs/datasets/mmlu/mmlu_gen.py +4 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_23a9a9.py +124 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_4d595a.py +123 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_5d1409.py +124 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_79e572.py +110 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_a484b3.py +124 -0
- opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py +141 -0
- opencompass/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py +59 -0
- opencompass/configs/datasets/mmlu/mmlu_ppl.py +4 -0
- opencompass/configs/datasets/mmlu/mmlu_ppl_ac766d.py +106 -0
opencompass/configs/datasets/ARC_c/ARC_c_clean_ppl.py
ADDED
@@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import ARCDatasetClean as ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textA}')
                ], ),
            'B': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textB}')
                ], ),
            'C': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textC}')
                ], ),
            'D': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textD}')
                ], ),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
                      analyze_contamination=True)

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c-test',
        path='opencompass/ai2_arc-test',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
opencompass/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py
ADDED
@@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess, match_answer_pattern

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{question}

A. {textA}
B. {textB}
C. {textC}
D. {textD}
""".strip()

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE)
            ], ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

ARC_c_datasets = [
    dict(
        abbr='ARC-c',
        type=ARCDataset,
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py
ADDED
@@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey',
)

ARC_c_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:',
                ),
                dict(role='BOT', prompt='{answerKey}'),
            ],
        ),
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer, max_out_len=50),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

ARC_c_datasets = [
    dict(
        abbr='ARC-c',
        type=ARCDataset,
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_few_shot_ppl.py
ADDED
@@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey',
)

ARC_c_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template={
            'A': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textA}'),
                ],
            ),
            'B': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textB}'),
                ],
            ),
            'C': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textC}'),
                ],
            ),
            'D': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textD}'),
                ],
            ),
        },
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=PPLInferencer),
)

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ARC_c_gen_1e0de5 import ARC_c_datasets  # noqa: F401, F403
opencompass/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py
ADDED
@@ -0,0 +1,44 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:'
                )
            ], ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

ARC_c_datasets = [
    dict(
        abbr='ARC-c',
        type=ARCDataset,
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_ppl.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ARC_c_ppl_a450bd import ARC_c_datasets  # noqa: F401, F403
opencompass/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            opt: dict(
                round=[
                    dict(role='HUMAN', prompt=f'{{question}}\nA. {{textA}}\nB. {{textB}}\nC. {{textC}}\nD. {{textD}}'),
                    dict(role='BOT', prompt=f'Answer: {opt}'),
                ]
            ) for opt in ['A', 'B', 'C', 'D']
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
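For orientation only (this sketch is not part of the committed file): the option-keyed comprehension above expands to one candidate dialogue per answer letter, which a perplexity-based inferencer such as `PPLInferencer` then scores to pick the most likely completion.

```python
# Illustrative expansion of the comprehension-built template above; the escaped
# f-string braces produce literal '{question}'/'{textA}' placeholders.
question_block = '{question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}'
template = {
    'A': dict(round=[dict(role='HUMAN', prompt=question_block),
                     dict(role='BOT', prompt='Answer: A')]),
    'B': dict(round=[dict(role='HUMAN', prompt=question_block),
                     dict(role='BOT', prompt='Answer: B')]),
    'C': dict(round=[dict(role='HUMAN', prompt=question_block),
                     dict(role='BOT', prompt='Answer: C')]),
    'D': dict(round=[dict(role='HUMAN', prompt=question_block),
                     dict(role='BOT', prompt='Answer: D')]),
}
```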
opencompass/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py
ADDED
@@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textA}')
                ], ),
            'B': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textB}')
                ], ),
            'C': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textC}')
                ], ),
            'D': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textD}')
                ], ),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
opencompass/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py
ADDED
@@ -0,0 +1,36 @@
from mmengine.config import read_base
# with read_base():
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A': 'Question: {question}\nAnswer: {textA}',
            'B': 'Question: {question}\nAnswer: {textB}',
            'C': 'Question: {question}\nAnswer: {textC}',
            'D': 'Question: {question}\nAnswer: {textD}'
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .FewCLUE_chid_gen_0a29a2 import chid_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py
ADDED
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

chid_reader_cfg = dict(
    input_columns=['content','A','B','C','D','E','F','G'],
    output_column='answer',
)

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{content}\n请选择______处所填的词\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nF. {F}\nG. {G}\n请从”A“,”B“,”C“,”D“,”E“,”F“,”G“中进行选择。答:',
                ),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

chid_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

chid_datasets = [
    dict(
        abbr='chid-dev',
        type=CHIDDatasetV2,
        path='./data/FewCLUE/chid/dev_few_all.json',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg,
    ),
    dict(
        abbr='chid-test',
        type=CHIDDatasetV2,
        path='./data/FewCLUE/chid/test_public.json',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg,
    ),
]
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .FewCLUE_chid_ppl_8f2872 import chid_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py
ADDED
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDataset

chid_reader_cfg = dict(
    input_columns=[f'content{i}' for i in range(7)], output_column='answer')

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            i: dict(
                round=[
                    dict(role='HUMAN', prompt=f'以下句子是否通顺?\n{{content{i}}}'),
                    dict(role='BOT', prompt='这个句子是通顺的。'),
                ], )
            for i in range(7)
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

chid_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

chid_datasets = [
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-dev',
        data_files='./data/FewCLUE/chid/dev_few_all.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-test',
        data_files='./data/FewCLUE/chid/test_public.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
]
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_acccb5.py
ADDED
@@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDataset

chid_reader_cfg = dict(
    input_columns=[f'content{i}' for i in range(7)], output_column='answer')

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={i: f'以下句子是否通顺?\n{{content{i}}}\n这个句子是通顺的。'
                  for i in range(7)}),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

chid_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

chid_datasets = [
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-dev',
        data_files='./data/FewCLUE/chid/dev_few_all.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-test',
        data_files='./data/FewCLUE/chid/test_public.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
]
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py
ADDED
@@ -0,0 +1,29 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset, ReCoRD_postprocess

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Passage:{text}\nResult:{question}\nQuestion: What entity does ____ refer to in the result?Give me the entity name:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator), pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_30dea0.py
ADDED
@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'],
    output_column='answers',
)

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Passage: {text}\nResult: {question}\nQuestion: What entity does ____ refer to in the result? Give me the entity name:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type='ReCoRD'),
)

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg,
    )
]
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py
ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDatasetV2, ReCoRD_postprocess

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Passage:\n{text}\nResult:\n{question}\nQuestion:\nWhat entity does ____ refer to in the Result?\nAnswer:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDatasetV2,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
opencompass/configs/datasets/humaneval/README.md
ADDED
@@ -0,0 +1,69 @@
# HumanEval

```bash
python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
```

## Base Models

| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |

## Chat Models

| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |
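Besides the CLI invocations shown in the README above, these dataset fragments can also be pulled into a run config through mmengine's `read_base`, the same pattern the `*_gen.py` aggregators in this commit use. The sketch below is illustrative only: the dataset import mirrors a file added in this commit, while the file name and the model config are placeholder assumptions.

```python
# Minimal sketch of a run config (hypothetical file, e.g. eval_humaneval.py,
# passed as `python run.py eval_humaneval.py`).
from mmengine.config import read_base

with read_base():
    # Dataset list defined in this commit:
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    # Placeholder: additionally import a `models` list from a model config of your choice.

datasets = humaneval_datasets
```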
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py
ADDED
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Complete the following python code:\n{prompt}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py
ADDED
@@ -0,0 +1,31 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py
ADDED
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='Complete the following python code:'),
            ],
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, F403
opencompass/configs/datasets/humaneval/humaneval_gen_66a7f4.py
ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')

HUMANEVAL_TEMPLATE = dict(
    round=[
        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
    ]
)

humaneval_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg,
    )
]
opencompass/configs/datasets/humaneval/humaneval_gen_8e312c.py
ADDED
@@ -0,0 +1,37 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        num_repeats=10,
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_plus_gen_8e312c import humaneval_plus_datasets  # noqa: F401, F403
opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py
ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')

HUMANEVAL_TEMPLATE = dict(
    round=[
        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
    ]
)

humaneval_plus_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalPlusEvaluator),
    k=[1, 10, 100],
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg,
    )
]
opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py
ADDED
@@ -0,0 +1,37 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalPlusEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
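The `k=[1, 10, 100]` field tells the evaluator which pass@k values to report once multiple samples per problem are available. For background (this is the well-known estimator from the original HumanEval paper, not code taken from this diff), the unbiased pass@k computation looks roughly like this:

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k given n samples per problem, of which c pass the tests."""
    if n - c < k:
        return 1.0
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

# e.g. 10 samples, 3 of them correct
print(pass_at_k(n=10, c=3, k=1))   # probability that a single draw passes
print(pass_at_k(n=10, c=3, k=10))  # with all 10 draws this is 1.0
```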
opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        num_repeats=10,
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
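The only difference from the pass@k config above is `num_repeats=10`, which, as the name suggests, repeats every problem ten times so that ten completions are collected per task; those repeated samples are what the pass@k estimator sketched above is computed over. With ten samples per problem, pass@1 and pass@10 are the meaningful numbers, while a pass@100 estimate is not informative for this config.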
opencompass/configs/datasets/mmlu/README.md
ADDED
@@ -0,0 +1,368 @@
# MMLU

```bash
python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
```

## Base Models

| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|
11 |
+
|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
|
12 |
+
| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 |
|
13 |
+
| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 |
|
14 |
+
| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 |
|
15 |
+
| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 |
|
16 |
+
| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 |
|
17 |
+
| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 |
|
18 |
+
| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 |
|
19 |
+
| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 |
|
20 |
+
| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 |
|
21 |
+
| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 |
|
22 |
+
| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 |
|
23 |
+
| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 |
|
24 |
+
| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 |
|
25 |
+
| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 |
|
26 |
+
| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 |
|
27 |
+
| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 |
|
28 |
+
| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 |
|
29 |
+
| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 |
|
30 |
+
| qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 |
|
31 |
+
| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 |
|
32 |
+
| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 |
|
33 |
+
| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 |
|
34 |
+
| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 |
|
35 |
+
| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 |
|
36 |
+
| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 |
|
37 |
+
| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 |
|
38 |
+
| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 |
|
39 |
+
| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 |
|
40 |
+
| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 |
|
41 |
+
| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 |
|
42 |
+
| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 |
|
43 |
+
| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 |
|
44 |
+
|
45 |
+
### Details
|
46 |
+
|
47 |
+
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|
48 |
+
|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
|
49 |
+
| llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 |
|
50 |
+
| llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 |
|
51 |
+
| llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 |
|
52 |
+
| llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 |
|
53 |
+
| llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 |
|
54 |
+
| llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 |
|
55 |
+
| llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 |
|
56 |
+
| llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 |
|
57 |
+
| llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 |
|
58 |
+
| internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 |
|
59 |
+
| internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 |
|
60 |
+
| internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 |
|
61 |
+
| qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 |
|
62 |
+
| qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 |
|
63 |
+
| qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 |
|
64 |
+
| qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 |
|
65 |
+
| qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 |
|
66 |
+
| qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 | 31.25 | 53.96 | 37.00 |
|
67 |
+
| qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 |
|
68 |
+
| qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 |
|
69 |
+
| qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 |
|
70 |
+
| qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 |
|
71 |
+
| qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 |
|
72 |
+
| qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 |
|
73 |
+
| mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 |
|
74 |
+
| mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 |
|
75 |
+
| mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 |
|
76 |
+
| mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 |
|
77 |
+
| yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 |
|
78 |
+
| yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 |
|
79 |
+
| deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 |
|
80 |
+
| deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 |
|
81 |
+
|
82 |
+
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|
83 |
+
|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
|
84 |
+
| llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 |
|
85 |
+
| llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 |
|
86 |
+
| llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 |
|
87 |
+
| llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 |
|
88 |
+
| llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 |
|
89 |
+
| llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 |
|
90 |
+
| llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 |
|
91 |
+
| llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 |
|
92 |
+
| llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 |
|
93 |
+
| internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 |
|
94 |
+
| internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 | 65.03 |
|
95 |
+
| internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 |
|
96 |
+
| qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 |
|
97 |
+
| qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 |
|
98 |
+
| qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 |
|
99 |
+
| qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 |
|
100 |
+
| qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 |
|
101 |
+
| qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 |
|
102 |
+
| qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 |
|
103 |
+
| qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 |
|
104 |
+
| qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 |
|
105 |
+
| qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 |
|
106 |
+
| qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 |
|
107 |
+
| qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 |
|
108 |
+
| mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 |
|
109 |
+
| mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 |
|
110 |
+
| mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 |
|
111 |
+
| mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 |
|
112 |
+
| yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 |
|
113 |
+
| yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 |
|
114 |
+
| deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 |
|
115 |
+
| deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 |
|
116 |
+
|
117 |
+
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|
118 |
+
|:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
|
119 |
+
| llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 |
|
120 |
+
| llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 |
|
121 |
+
| llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 |
|
122 |
+
| llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 76.36 |
|
123 |
+
| llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 |
|
124 |
+
| llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 |
|
125 |
+
| llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 |
|
126 |
+
| llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 |
|
127 |
+
| llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 |
|
128 |
+
| internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 |
|
129 |
+
| internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 |
|
130 |
+
| internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 |
|
131 |
+
| qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 |
|
132 |
+
| qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 |
|
133 |
+
| qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 |
|
134 |
+
| qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 |
|
135 |
+
| qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 |
|
136 |
+
| qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 |
|
137 |
+
| qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 |
|
138 |
+
| qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 |
|
139 |
+
| qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 |
|
140 |
+
| qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 |
|
141 |
+
| qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 |
|
142 |
+
| qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 |
|
143 |
+
| mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 |
|
144 |
+
| mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 |
|
145 |
+
| mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 |
|
146 |
+
| mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 |
|
147 |
+
| yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 |
|
148 |
+
| yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 |
|
149 |
+
| deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 |
|
150 |
+
| deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 |
|
151 |
+
|
152 |
+
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|
153 |
+
|:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
|
154 |
+
| llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 |
|
155 |
+
| llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 |
|
156 |
+
| llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 |
|
157 |
+
| llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 |
|
158 |
+
| llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 |
|
159 |
+
| llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 |
|
160 |
+
| llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 |
|
161 |
+
| llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 |
|
162 |
+
| llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 |
|
163 |
+
| internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 |
|
164 |
+
| internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 |
|
165 |
+
| internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 |
|
166 |
+
| qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 |
|
167 |
+
| qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 |
|
168 |
+
| qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 |
|
169 |
+
| qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 |
|
170 |
+
| qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 |
|
171 |
+
| qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 |
|
172 |
+
| qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 |
|
173 |
+
| qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 |
|
174 |
+
| qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 |
|
175 |
+
| qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 |
|
176 |
+
| qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 |
|
177 |
+
| qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 |
|
178 |
+
| mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 |
|
179 |
+
| mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 |
|
180 |
+
| mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 | 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 |
|
181 |
+
| mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 |
|
182 |
+
| yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 |
|
183 |
+
| yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 |
|
184 |
+
| deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 |
|
185 |
+
| deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 |
|
186 |
+
|
187 |
+
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|
188 |
+
|:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
|
189 |
+
| llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 |
|
190 |
+
| llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 |
|
191 |
+
| llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 |
|
192 |
+
| llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 |
|
193 |
+
| llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 |
|
194 |
+
| llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 |
|
195 |
+
| llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 |
|
196 |
+
| llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 |
|
197 |
+
| llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 |
|
198 |
+
| internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 |
|
199 |
+
| internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 |
|
200 |
+
| internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 |
|
201 |
+
| qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 |
|
202 |
+
| qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 |
|
203 |
+
| qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 |
|
204 |
+
| qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 |
|
205 |
+
| qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 |
|
206 |
+
| qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 |
|
207 |
+
| qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 |
|
208 |
+
| qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 |
|
209 |
+
| qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 |
|
210 |
+
| qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 |
|
211 |
+
| qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 |
|
212 |
+
| qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 |
|
213 |
+
| mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 |
|
214 |
+
| mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 |
|
215 |
+
| mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 |
|
216 |
+
| mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 |
|
217 |
+
| yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 35.96 | 82.94 | 67.26 | 86.00 |
|
218 |
+
| yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 |
|
219 |
+
| deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 |
|
220 |
+
| deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 |
|
221 |
+
|
222 |
+
## Chat Models
|
223 |
+
|
224 |
+
| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|
225 |
+
|:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
|
226 |
+
| qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 |
|
227 |
+
| qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 |
|
228 |
+
| qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 |
|
229 |
+
| qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 |
|
230 |
+
| qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 |
|
231 |
+
| qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 |
|
232 |
+
| qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 |
|
233 |
+
| qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 |
|
234 |
+
| internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 |
|
235 |
+
| internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 |
|
236 |
+
| internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 |
|
237 |
+
| internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 |
|
238 |
+
| internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 |
|
239 |
+
| internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 |
|
240 |
+
| llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 |
|
241 |
+
| llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 |
|
242 |
+
| llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 |
|
243 |
+
| llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 |
|
244 |
+
| mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 |
|
245 |
+
| mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 |
|
246 |
+
| mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 |
|
247 |
+
|
248 |
+
### Details
|
249 |
+
|
250 |
+
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|
251 |
+
|:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
|
252 |
+
| qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 |
|
253 |
+
| qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 |
|
254 |
+
| qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 |
|
255 |
+
| qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 |
|
256 |
+
| qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 |
|
257 |
+
| qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 |
|
258 |
+
| qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 |
|
259 |
+
| qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 |
|
260 |
+
| internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 30.00 | 33.93 | 53.58 | 28.00 |
|
261 |
+
| internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 |
|
262 |
+
| internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 |
|
263 |
+
| internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 |
|
264 |
+
| internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 |
|
265 |
+
| internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 |
|
266 |
+
| llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 |
|
267 |
+
| llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 |
|
268 |
+
| llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 |
|
269 |
+
| llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 |
|
270 |
+
| mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 |
|
271 |
+
| mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 |
|
272 |
+
| mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 |
|
273 |
+
|
274 |
+
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|
275 |
+
|:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
|
276 |
+
| qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 |
|
277 |
+
| qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 |
|
278 |
+
| qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 |
|
279 |
+
| qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 |
|
280 |
+
| qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 |
|
281 |
+
| qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 |
|
282 |
+
| qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 |
|
283 |
+
| qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 |
|
284 |
+
| internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 |
|
285 |
+
| internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 |
|
286 |
+
| internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 |
|
287 |
+
| internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 45.83 | 70.00 | 65.36 |
|
288 |
+
| internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 |
|
289 |
+
| internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 |
|
290 |
+
| llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 |
|
291 |
+
| llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 |
|
292 |
+
| llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 |
|
293 |
+
| llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 |
|
294 |
+
| mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 |
|
295 |
+
| mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 |
|
296 |
+
| mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 |
|
297 |
+
|
298 |
+
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|
299 |
+
|:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
|
300 |
+
| qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 |
|
301 |
+
| qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 |
|
302 |
+
| qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 |
|
303 |
+
| qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 |
|
304 |
+
| qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 |
|
305 |
+
| qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 |
|
306 |
+
| qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 |
|
307 |
+
| qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 |
|
308 |
+
| internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 |
|
309 |
+
| internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 |
|
310 |
+
| internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 |
|
311 |
+
| internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 |
|
312 |
+
| internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 |
|
313 |
+
| internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 |
|
314 |
+
| llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 |
|
315 |
+
| llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 |
|
316 |
+
| llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 |
|
317 |
+
| llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 |
|
318 |
+
| mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 |
|
319 |
+
| mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 |
|
320 |
+
| mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 |
|
321 |
+
|
322 |
+
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|
323 |
+
|:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
|
324 |
+
| qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 |
|
325 |
+
| qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 |
|
326 |
+
| qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 |
|
327 |
+
| qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 |
|
328 |
+
| qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 |
|
329 |
+
| qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 |
|
330 |
+
| qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 |
|
331 |
+
| qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 |
|
332 |
+
| internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 |
|
333 |
+
| internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 |
|
334 |
+
| internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 |
|
335 |
+
| internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 |
|
336 |
+
| internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 |
|
337 |
+
| internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 |
|
338 |
+
| llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 |
|
339 |
+
| llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 |
|
340 |
+
| llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 |
|
341 |
+
| llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 |
|
342 |
+
| mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 |
|
343 |
+
| mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 |
|
344 |
+
| mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 |
|
345 |
+
|
346 |
+
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|
347 |
+
|:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
|
348 |
+
| qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 |
|
349 |
+
| qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 |
|
350 |
+
| qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 |
|
351 |
+
| qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 |
|
352 |
+
| qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 |
|
353 |
+
| qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 |
|
354 |
+
| qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 |
|
355 |
+
| qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 |
|
356 |
+
| internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 |
|
357 |
+
| internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 |
|
358 |
+
| internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 |
|
359 |
+
| internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 |
|
360 |
+
| internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 |
|
361 |
+
| internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 |
|
362 |
+
| llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 |
|
363 |
+
| llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 |
|
364 |
+
| llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 |
|
365 |
+
| llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 |
|
366 |
+
| mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 |
|
367 |
+
| mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 |
|
368 |
+
| mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 |
|
opencompass/configs/datasets/mmlu/mmlu_all_sets.py
ADDED
@@ -0,0 +1,59 @@
mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]
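This shared subject list is what the various MMLU configs iterate over. When only a handful of subjects is needed (for example, for a quick smoke test), the resulting dataset list can simply be filtered in the user config; a minimal sketch, with an arbitrary illustrative subject selection:

```python
from mmengine.config import read_base

with read_base():
    from .mmlu_gen_4d595a import mmlu_datasets

# keep only a couple of subjects; the choice here is purely illustrative
wanted = {'lukaemon_mmlu_abstract_algebra', 'lukaemon_mmlu_anatomy'}
mmlu_datasets = [d for d in mmlu_datasets if d['abbr'] in wanted]
```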
opencompass/configs/datasets/mmlu/mmlu_clean_ppl.py
ADDED
@@ -0,0 +1,114 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import MMLUDatasetClean as MMLUDataset

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template={
                opt:
                f'{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}\n'
                for opt in ['A', 'B', 'C', 'D']
            },
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template={
                opt:
                f'{_hint}</E>{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}'
                for opt in ['A', 'B', 'C', 'D']
            },
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=PPLInferencer),
    )

    mmlu_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
                         analyze_contamination=True)

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
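Unlike the generation-based MMLU configs, this one is scored by perplexity: the per-option templates above yield four candidate prompts per question (one ending in each answer letter), and the option whose completed prompt the model scores best is taken as the prediction. A conceptual sketch of that final selection step, not the actual PPLInferencer implementation:

```python
# given a per-option score for the filled-in templates (e.g. total
# log-likelihood under the model), the predicted answer is the best-scoring
# option letter; contamination analysis is layered on top of this accuracy
def pick_option(option_scores: dict) -> str:
    return max(option_scores, key=option_scores.get)

print(pick_option({'A': -41.2, 'B': -37.9, 'C': -44.0, 'D': -39.5}))  # -> 'B'
```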
opencompass/configs/datasets/mmlu/mmlu_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mmlu_gen_4d595a import mmlu_datasets  # noqa: F401, F403
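This four-line alias simply pins the default MMLU variant to `mmlu_gen_4d595a`. The same `read_base` mechanism is how several dataset groups get combined in one run config; a minimal sketch, where the second import is an assumed example drawn from elsewhere in this diff:

```python
from mmengine.config import read_base

with read_base():
    from .mmlu_gen_4d595a import mmlu_datasets
    # assumed example: pull in another dataset group added in this diff
    from ..humaneval_plus.humaneval_plus_gen_8e312c import humaneval_plus_datasets

datasets = [*mmlu_datasets, *humaneval_plus_datasets]
```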
opencompass/configs/datasets/mmlu/mmlu_gen_23a9a9.py
ADDED
@@ -0,0 +1,124 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_prompt_template = dict(
    type='PromptTemplate',
    template=None,
    ice_token='</E>')

mmlu_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '
            ),
            dict(role='BOT', prompt='{target}\n')
        ])),
    prompt_template=mmlu_prompt_template,
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=GenInferencer))

mmlu_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=first_capital_postprocess))

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg.copy(),
            eval_cfg=mmlu_eval_cfg))

    mmlu_datasets[-1]['infer_cfg'][
        'prompt_template'] = mmlu_prompt_template.copy()
    mmlu_datasets[-1]['infer_cfg']['prompt_template']['template'] = \
        dict(
            begin=[
                dict(role='SYSTEM', fallback_role='HUMAN', prompt=f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.'),
                '</E>',
            ],
            round=[
                dict(role='HUMAN', prompt='{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '),
            ]
        )

del _name
opencompass/configs/datasets/mmlu/mmlu_gen_4d595a.py
ADDED
@@ -0,0 +1,123 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_gen_5d1409.py
ADDED
@@ -0,0 +1,124 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=
                        f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess))

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_gen_79e572.py
ADDED
@@ -0,0 +1,110 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=
            '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: {target}\n',
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=
            f'{_hint}</E>{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:',
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess),
    )

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_gen_a484b3.py
ADDED
@@ -0,0 +1,124 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=
                        f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py
ADDED
@@ -0,0 +1,141 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
from opencompass.utils.model_postprocessors import navie_model_postprocess
from opencompass.utils.postprocessors.naive import OPTION_NAVIE_PROMPT_TEMPLATE


# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    # # You can write your own postprocess prompt like:
    # MMLU_NAVIE_PROMPT_TEMPLATE = """
    # There is a detailed explanation of the final answer you should extract:
    # 1. ...
    # 2. ...
    # ...
    # """

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
        model_postprocessor=dict(
            type=navie_model_postprocess,
            custom_instruction=OPTION_NAVIE_PROMPT_TEMPLATE,
            model_name='',
            api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
    )


    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py
ADDED
@@ -0,0 +1,59 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern

with read_base():
    from .mmlu_all_sets import mmlu_all_sets

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{input}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_datasets = []
for name in mmlu_all_sets:
    mmlu_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=QUERY_TEMPLATE),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'))

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))
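The `match_answer_pattern` post-processor above keys off the 'ANSWER: $LETTER' line that QUERY_TEMPLATE asks the model to emit. A standalone sketch of what that regex extracts (plain Python for illustration, not part of the config files in this change):

import re

# Illustrative only: apply the same answer pattern used in mmlu_eval_cfg above.
answer_pattern = r'(?i)ANSWER\s*:\s*([A-D])'
response = 'Let us think step by step...\nANSWER: C'
match = re.search(answer_pattern, response)
print(match.group(1) if match else None)  # prints: C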
opencompass/configs/datasets/mmlu/mmlu_ppl.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mmlu_ppl_ac766d import mmlu_datasets  # noqa: F401, F403
opencompass/configs/datasets/mmlu/mmlu_ppl_ac766d.py
ADDED
@@ -0,0 +1,106 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUDataset

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
    question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template={opt: f'{question_overall}\nAnswer: {opt}\n' for opt in ['A', 'B', 'C', 'D']},
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template={opt: f'{_hint}</E>{question_overall}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=PPLInferencer),
    )

    mmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator), )

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint