tuandunghcmut committed
Commit cc8629b · verified · 1 Parent(s): 56c27a3

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. opencompass/configs/datasets/ARC_c/ARC_c_clean_ppl.py +55 -0
  2. opencompass/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py +53 -0
  3. opencompass/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py +48 -0
  4. opencompass/configs/datasets/ARC_c/ARC_c_few_shot_ppl.py +63 -0
  5. opencompass/configs/datasets/ARC_c/ARC_c_gen.py +4 -0
  6. opencompass/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py +44 -0
  7. opencompass/configs/datasets/ARC_c/ARC_c_ppl.py +4 -0
  8. opencompass/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py +37 -0
  9. opencompass/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py +54 -0
  10. opencompass/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py +36 -0
  11. opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py +4 -0
  12. opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py +51 -0
  13. opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl.py +4 -0
  14. opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py +45 -0
  15. opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_acccb5.py +39 -0
  16. opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py +4 -0
  17. opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py +29 -0
  18. opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_30dea0.py +42 -0
  19. opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py +35 -0
  20. opencompass/configs/datasets/humaneval/README.md +69 -0
  21. opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py +36 -0
  22. opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py +36 -0
  23. opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py +36 -0
  24. opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py +33 -0
  25. opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py +31 -0
  26. opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py +41 -0
  27. opencompass/configs/datasets/humaneval/humaneval_gen.py +4 -0
  28. opencompass/configs/datasets/humaneval/humaneval_gen_66a7f4.py +35 -0
  29. opencompass/configs/datasets/humaneval/humaneval_gen_8e312c.py +37 -0
  30. opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py +36 -0
  31. opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py +36 -0
  32. opencompass/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py +37 -0
  33. opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py +4 -0
  34. opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py +35 -0
  35. opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py +37 -0
  36. opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py +36 -0
  37. opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py +37 -0
  38. opencompass/configs/datasets/mmlu/README.md +368 -0
  39. opencompass/configs/datasets/mmlu/mmlu_all_sets.py +59 -0
  40. opencompass/configs/datasets/mmlu/mmlu_clean_ppl.py +114 -0
  41. opencompass/configs/datasets/mmlu/mmlu_gen.py +4 -0
  42. opencompass/configs/datasets/mmlu/mmlu_gen_23a9a9.py +124 -0
  43. opencompass/configs/datasets/mmlu/mmlu_gen_4d595a.py +123 -0
  44. opencompass/configs/datasets/mmlu/mmlu_gen_5d1409.py +124 -0
  45. opencompass/configs/datasets/mmlu/mmlu_gen_79e572.py +110 -0
  46. opencompass/configs/datasets/mmlu/mmlu_gen_a484b3.py +124 -0
  47. opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py +141 -0
  48. opencompass/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py +59 -0
  49. opencompass/configs/datasets/mmlu/mmlu_ppl.py +4 -0
  50. opencompass/configs/datasets/mmlu/mmlu_ppl_ac766d.py +106 -0
opencompass/configs/datasets/ARC_c/ARC_c_clean_ppl.py ADDED
@@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import ARCDatasetClean as ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A':
            dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textA}')
                ], ),
            'B':
            dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textB}')
                ], ),
            'C':
            dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textC}')
                ], ),
            'D':
            dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textD}')
                ], ),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
                      analyze_contamination=True)

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c-test',
        path='opencompass/ai2_arc-test',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
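The fragments in this commit only define `*_datasets` lists; they become runnable once a top-level experiment config pulls them in with mmengine's `read_base()`, exactly as the small `*_gen.py`/`*_ppl.py` entry files below do. The sketch that follows is illustrative only (it is not a file in this commit) and assumes it lives next to the `datasets/` directory added here.

```python
# Minimal sketch of a top-level OpenCompass experiment config (not part of
# this commit). It only reuses the read_base() pattern shown in the entry
# files below; paths are relative to opencompass/configs/.
from mmengine.config import read_base

with read_base():
    # dataset fragments added in this commit
    from .datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets

# OpenCompass reads the `datasets` variable from the config it is given.
datasets = [*ARC_c_datasets, *humaneval_datasets]
```

The READMEs in this commit instead use the CLI shortcut, e.g. `python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug`, which resolves the same dataset config by name.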
opencompass/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py ADDED
@@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess, match_answer_pattern

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{question}

A. {textA}
B. {textB}
C. {textC}
D. {textD}
""".strip()

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=QUERY_TEMPLATE)
            ], ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

ARC_c_datasets = [
    dict(
        abbr='ARC-c',
        type=ARCDataset,
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py ADDED
@@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey',
)

ARC_c_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:',
                ),
                dict(role='BOT', prompt='{answerKey}'),
            ],
        ),
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer, max_out_len=50),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

ARC_c_datasets = [
    dict(
        abbr='ARC-c',
        type=ARCDataset,
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
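In the few-shot configs above, `ice_token='</E>'` marks where the in-context examples are spliced into the prompt, and `FixKRetriever(fix_id_list=[0, 2, 4, 6, 8])` pins the same five training items for every test question. The snippet below is a framework-independent sketch of that assembly, not OpenCompass internals; the helper name and data layout are assumptions for illustration.

```python
# Conceptual sketch of ice_token-based few-shot prompt assembly (illustrative
# only; not the OpenCompass implementation).
ICE_TOKEN = '</E>'
TEMPLATE = (ICE_TOKEN + 'Question: {question}\nA. {textA}\nB. {textB}\n'
            'C. {textC}\nD. {textD}\nAnswer:')

def build_prompt(test_item: dict, train_items: list, fix_id_list=(0, 2, 4, 6, 8)) -> str:
    """Concatenate the fixed in-context examples, then append the test question."""
    ice = ''
    for idx in fix_id_list:  # the same items are reused for every test question
        ex = train_items[idx]
        ice += TEMPLATE.replace(ICE_TOKEN, '').format(**ex) + ' ' + ex['answerKey'] + '\n'
    return TEMPLATE.replace(ICE_TOKEN, ice).format(**test_item)
```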
opencompass/configs/datasets/ARC_c/ARC_c_few_shot_ppl.py ADDED
@@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey',
)

ARC_c_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template={
            'A': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textA}'),
                ],
            ),
            'B': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textB}'),
                ],
            ),
            'C': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textC}'),
                ],
            ),
            'D': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textD}'),
                ],
            ),
        },
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=PPLInferencer),
)

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_gen.py ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ARC_c_gen_1e0de5 import ARC_c_datasets  # noqa: F401, F403
opencompass/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py ADDED
@@ -0,0 +1,44 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=
                    'Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:'
                )
            ], ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

ARC_c_datasets = [
    dict(
        abbr='ARC-c',
        type=ARCDataset,
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_ppl.py ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ARC_c_ppl_a450bd import ARC_c_datasets  # noqa: F401, F403
opencompass/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            opt: dict(
                round=[
                    dict(role='HUMAN', prompt=f'{{question}}\nA. {{textA}}\nB. {{textB}}\nC. {{textC}}\nD. {{textD}}'),
                    dict(role='BOT', prompt=f'Answer: {opt}'),
                ]
            ) for opt in ['A', 'B', 'C', 'D']
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
opencompass/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py ADDED
@@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A':
            dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textA}')
                ], ),
            'B':
            dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textB}')
                ], ),
            'C':
            dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textC}')
                ], ),
            'D':
            dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textD}')
                ], ),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
opencompass/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py ADDED
@@ -0,0 +1,36 @@
from mmengine.config import read_base
# with read_base():
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A': 'Question: {question}\nAnswer: {textA}',
            'B': 'Question: {question}\nAnswer: {textB}',
            'C': 'Question: {question}\nAnswer: {textC}',
            'D': 'Question: {question}\nAnswer: {textD}'
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
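Unlike the `*_gen_*` configs, the `*_ppl_*` configs above never ask the model to emit a letter: each answer option is rendered as a full "Question ... Answer: <option text>" continuation, and the PPLInferencer selects the option the model assigns the highest likelihood (lowest perplexity). The following is a minimal conceptual sketch of that scoring idea with a generic Hugging Face causal LM; it is not the OpenCompass implementation, and the model name is a placeholder.

```python
# Sketch of perplexity-style option scoring (conceptual, not OpenCompass code).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')           # placeholder model
model = AutoModelForCausalLM.from_pretrained('gpt2')
model.eval()

def pick_option(question: str, options: dict) -> str:
    """Return the label whose full question/answer text has the lowest loss."""
    losses = {}
    for label, text in options.items():
        enc = tok(f'Question: {question}\nAnswer: {text}', return_tensors='pt')
        with torch.no_grad():
            out = model(**enc, labels=enc['input_ids'])
        losses[label] = out.loss.item()                # mean token cross-entropy
    return min(losses, key=losses.get)

print(pick_option('Which gas do plants absorb for photosynthesis?',
                  {'A': 'carbon dioxide', 'B': 'oxygen', 'C': 'helium', 'D': 'argon'}))
```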
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .FewCLUE_chid_gen_0a29a2 import chid_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py ADDED
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

chid_reader_cfg = dict(
    input_columns=['content','A','B','C','D','E','F','G'],
    output_column='answer',
)

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt=
                    '{content}\n请选择______处所填的词\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nF. {F}\nG. {G}\n请从”A“,”B“,”C“,”D“,”E“,”F“,”G“中进行选择。答:',
                ),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

chid_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

chid_datasets = [
    dict(
        abbr='chid-dev',
        type=CHIDDatasetV2,
        path='./data/FewCLUE/chid/dev_few_all.json',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg,
    ),
    dict(
        abbr='chid-test',
        type=CHIDDatasetV2,
        path='./data/FewCLUE/chid/test_public.json',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg,
    ),
]
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl.py ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .FewCLUE_chid_ppl_8f2872 import chid_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py ADDED
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDataset

chid_reader_cfg = dict(
    input_columns=[f'content{i}' for i in range(7)], output_column='answer')

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            i: dict(
                round=[
                    dict(role='HUMAN', prompt=f'以下句子是否通顺?\n{{content{i}}}'),
                    dict(role='BOT', prompt='这个句子是通顺的。'),
                ], )
            for i in range(7)
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

chid_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

chid_datasets = [
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-dev',
        data_files='./data/FewCLUE/chid/dev_few_all.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-test',
        data_files='./data/FewCLUE/chid/test_public.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
]
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_acccb5.py ADDED
@@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDataset

chid_reader_cfg = dict(
    input_columns=[f'content{i}' for i in range(7)], output_column='answer')

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={i: f'以下句子是否通顺?\n{{content{i}}}\n这个句子是通顺的。'
                  for i in range(7)}),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

chid_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

chid_datasets = [
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-dev',
        data_files='./data/FewCLUE/chid/dev_few_all.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-test',
        data_files='./data/FewCLUE/chid/test_public.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
]
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py ADDED
@@ -0,0 +1,29 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset, ReCoRD_postprocess

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=
        'Passage:{text}\nResult:{question}\nQuestion: What entity does ____ refer to in the result?Give me the entity name:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator), pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_30dea0.py ADDED
@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'],
    output_column='answers',
)

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt=
                'Passage: {text}\nResult: {question}\nQuestion: What entity does ____ refer to in the result? Give me the entity name:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type='ReCoRD'),
)

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg,
    )
]
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDatasetV2, ReCoRD_postprocess

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN', prompt='Passage:\n{text}\nResult:\n{question}\nQuestion:\nWhat entity does ____ refer to in the Result?\nAnswer:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDatasetV2,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
opencompass/configs/datasets/humaneval/README.md ADDED
@@ -0,0 +1,69 @@
# HumanEval

```bash
python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
```

## Base Models

| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |

## Chat Models

| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess
5
+
6
+ humaneval_reader_cfg = dict(
7
+ input_columns=['prompt'], output_column='task_id', train_split='test')
8
+
9
+ # TODO: allow empty output-column
10
+ humaneval_infer_cfg = dict(
11
+ prompt_template=dict(
12
+ type=PromptTemplate,
13
+ template=dict(round=[
14
+ dict(
15
+ role='HUMAN',
16
+ prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'),
17
+ ])),
18
+ retriever=dict(type=ZeroRetriever),
19
+ inferencer=dict(type=GenInferencer, max_out_len=512))
20
+
21
+ humaneval_eval_cfg = dict(
22
+ evaluator=dict(type=HumanEvalEvaluator),
23
+ pred_role='BOT',
24
+ k=[1, 10, 100], # the parameter only for humaneval
25
+ pred_postprocessor=dict(type=humaneval_postprocess),
26
+ )
27
+
28
+ humaneval_datasets = [
29
+ dict(
30
+ abbr='openai_humaneval',
31
+ type=HumanevalDataset,
32
+ path='opencompass/humaneval',
33
+ reader_cfg=humaneval_reader_cfg,
34
+ infer_cfg=humaneval_infer_cfg,
35
+ eval_cfg=humaneval_eval_cfg)
36
+ ]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py ADDED
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Complete the following python code:\n{prompt}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py ADDED
@@ -0,0 +1,31 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py ADDED
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='Complete the following python code:'),
            ],
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_gen.py ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, F403
opencompass/configs/datasets/humaneval/humaneval_gen_66a7f4.py ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')

HUMANEVAL_TEMPLATE = dict(
    round=[
        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
    ]
)

humaneval_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg,
    )
]
opencompass/configs/datasets/humaneval/humaneval_gen_8e312c.py ADDED
@@ -0,0 +1,37 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        num_repeats=10,
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
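The `repeat10` variant above draws `num_repeats=10` samples per problem, and the eval configs request `k=[1, 10, 100]`. For reference, the standard unbiased pass@k estimator from the HumanEval paper (Chen et al., 2021) is sketched below; this is a reference formula, not the evaluator code shipped in this commit.

```python
# Unbiased pass@k estimator (Chen et al., 2021), shown for reference only.
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """n: samples drawn per problem, c: samples that pass, k: evaluation budget."""
    if n - c < k:
        return 1.0
    # 1 - C(n-c, k) / C(n, k), computed as a numerically stable running product
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# e.g. 10 samples per problem, 3 of them correct:
print(pass_at_k(10, 3, 1))   # 0.3
print(pass_at_k(10, 3, 10))  # 1.0
```

The per-benchmark score is the mean of this quantity over all problems, which is why pass@10 and pass@100 only become meaningful when enough repeats are sampled.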
opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_plus_gen_8e312c import humaneval_plus_datasets  # noqa: F401, F403
opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')

HUMANEVAL_TEMPLATE = dict(
    round=[
        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
    ]
)

humaneval_plus_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalPlusEvaluator),
    k=[1, 10, 100],
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg,
    )
]
opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py ADDED
@@ -0,0 +1,37 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalPlusEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        num_repeats=10,
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
opencompass/configs/datasets/mmlu/README.md ADDED
@@ -0,0 +1,368 @@
# MMLU

```bash
python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
```

## Base Models

| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 |
| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 |
| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 |
| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 |
| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 |
| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 |
| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 |
| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 |
| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 |
| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 |
| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 |
| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 |
| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 |
| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 |
| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 |
| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 |
| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 |
| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 |
| qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 |
| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 |
| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 |
| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 |
| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 |
| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 |
| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 |
| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 |
| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 |
| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 |
| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 |
| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 |
| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 |
| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 |

### Details

| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
| llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 |
| llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 |
| llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 |
| llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 |
| llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 |
| llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 |
| llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 |
| llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 |
| llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 |
| internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 |
| internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 |
| internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 |
| qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 |
| qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 |
| qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 |
| qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 |
| qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 |
| qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 | 31.25 | 53.96 | 37.00 |
| qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 |
| qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 |
| qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 |
| qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 |
| qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 |
| qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 |
| mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 |
| mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 |
| mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 |
| mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 |
| yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 |
| yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 |
| deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 |
| deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 |

| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
84
+ | llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 |
85
+ | llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 |
86
+ | llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 |
87
+ | llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 |
88
+ | llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 |
89
+ | llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 |
90
+ | llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 |
91
+ | llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 |
92
+ | llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 |
93
+ | internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 |
94
+ | internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 | 65.03 |
95
+ | internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 |
96
+ | qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 |
97
+ | qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 |
98
+ | qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 |
99
+ | qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 |
100
+ | qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 |
101
+ | qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 |
102
+ | qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 |
103
+ | qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 |
104
+ | qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 |
105
+ | qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 |
106
+ | qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 |
107
+ | qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 |
108
+ | mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 |
109
+ | mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 |
110
+ | mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 |
111
+ | mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 |
112
+ | yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 |
113
+ | yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 |
114
+ | deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 |
115
+ | deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 |
116
+
117
+ | model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
118
+ |:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
119
+ | llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 |
120
+ | llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 |
121
+ | llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 |
122
+ | llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 76.36 |
123
+ | llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 |
124
+ | llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 |
125
+ | llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 |
126
+ | llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 |
127
+ | llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 |
128
+ | internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 |
129
+ | internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 |
130
+ | internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 |
131
+ | qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 |
132
+ | qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 |
133
+ | qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 |
134
+ | qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 |
135
+ | qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 |
136
+ | qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 |
137
+ | qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 |
138
+ | qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 |
139
+ | qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 |
140
+ | qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 |
141
+ | qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 |
142
+ | qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 |
143
+ | mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 |
144
+ | mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 |
145
+ | mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 |
146
+ | mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 |
147
+ | yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 |
148
+ | yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 |
149
+ | deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 |
150
+ | deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 |
151
+
152
+ | model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
153
+ |:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
154
+ | llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 |
155
+ | llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 |
156
+ | llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 |
157
+ | llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 |
158
+ | llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 |
159
+ | llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 |
160
+ | llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 |
161
+ | llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 |
162
+ | llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 |
163
+ | internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 |
164
+ | internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 |
165
+ | internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 |
166
+ | qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 |
167
+ | qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 |
168
+ | qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 |
169
+ | qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 |
170
+ | qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 |
171
+ | qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 |
172
+ | qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 |
173
+ | qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 |
174
+ | qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 |
175
+ | qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 |
176
+ | qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 |
177
+ | qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 |
178
+ | mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 |
179
+ | mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 |
180
+ | mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 | 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 |
181
+ | mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 |
182
+ | yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 |
183
+ | yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 |
184
+ | deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 |
185
+ | deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 |
186
+
187
+ | model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
188
+ |:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
189
+ | llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 |
190
+ | llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 |
191
+ | llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 |
192
+ | llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 |
193
+ | llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 |
194
+ | llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 |
195
+ | llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 |
196
+ | llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 |
197
+ | llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 |
198
+ | internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 |
199
+ | internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 |
200
+ | internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 |
201
+ | qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 |
202
+ | qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 |
203
+ | qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 |
204
+ | qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 |
205
+ | qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 |
206
+ | qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 |
207
+ | qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 |
208
+ | qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 |
209
+ | qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 |
210
+ | qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 |
211
+ | qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 |
212
+ | qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 |
213
+ | mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 |
214
+ | mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 |
215
+ | mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 |
216
+ | mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 |
217
+ | yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 35.96 | 82.94 | 67.26 | 86.00 |
218
+ | yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 |
219
+ | deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 |
220
+ | deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 |
221
+
222
+ ## Chat Models
223
+
224
+ | model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
225
+ |:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
226
+ | qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 |
227
+ | qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 |
228
+ | qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 |
229
+ | qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 |
230
+ | qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 |
231
+ | qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 |
232
+ | qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 |
233
+ | qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 |
234
+ | internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 |
235
+ | internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 |
236
+ | internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 |
237
+ | internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 |
238
+ | internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 |
239
+ | internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 |
240
+ | llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 |
241
+ | llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 |
242
+ | llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 |
243
+ | llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 |
244
+ | mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 |
245
+ | mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 |
246
+ | mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 |
247
+
248
+ ### Details
249
+
250
+ | model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
251
+ |:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
252
+ | qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 |
253
+ | qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 |
254
+ | qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 |
255
+ | qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 |
256
+ | qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 |
257
+ | qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 |
258
+ | qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 |
259
+ | qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 |
260
+ | internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 30.00 | 33.93 | 53.58 | 28.00 |
261
+ | internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 |
262
+ | internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 |
263
+ | internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 |
264
+ | internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 |
265
+ | internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 |
266
+ | llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 |
267
+ | llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 |
268
+ | llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 |
269
+ | llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 |
270
+ | mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 |
271
+ | mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 |
272
+ | mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 |
273
+
274
+ | model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
275
+ |:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
276
+ | qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 |
277
+ | qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 |
278
+ | qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 |
279
+ | qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 |
280
+ | qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 |
281
+ | qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 |
282
+ | qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 |
283
+ | qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 |
284
+ | internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 |
285
+ | internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 |
286
+ | internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 |
287
+ | internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 45.83 | 70.00 | 65.36 |
288
+ | internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 |
289
+ | internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 |
290
+ | llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 |
291
+ | llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 |
292
+ | llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 |
293
+ | llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 |
294
+ | mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 |
295
+ | mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 |
296
+ | mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 |
297
+
298
+ | model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
299
+ |:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
300
+ | qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 |
301
+ | qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 |
302
+ | qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 |
303
+ | qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 |
304
+ | qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 |
305
+ | qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 |
306
+ | qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 |
307
+ | qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 |
308
+ | internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 |
309
+ | internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 |
310
+ | internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 |
311
+ | internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 |
312
+ | internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 |
313
+ | internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 |
314
+ | llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 |
315
+ | llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 |
316
+ | llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 |
317
+ | llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 |
318
+ | mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 |
319
+ | mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 |
320
+ | mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 |
321
+
322
+ | model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
323
+ |:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
324
+ | qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 |
325
+ | qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 |
326
+ | qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 |
327
+ | qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 |
328
+ | qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 |
329
+ | qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 |
330
+ | qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 |
331
+ | qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 |
332
+ | internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 |
333
+ | internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 |
334
+ | internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 |
335
+ | internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 |
336
+ | internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 |
337
+ | internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 |
338
+ | llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 |
339
+ | llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 |
340
+ | llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 |
341
+ | llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 |
342
+ | mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 |
343
+ | mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 |
344
+ | mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 |
345
+
346
+ | model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
347
+ |:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
348
+ | qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 |
349
+ | qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 |
350
+ | qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 |
351
+ | qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 |
352
+ | qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 |
353
+ | qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 |
354
+ | qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 |
355
+ | qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 |
356
+ | internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 |
357
+ | internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 |
358
+ | internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 |
359
+ | internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 |
360
+ | internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 |
361
+ | internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 |
362
+ | llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 |
363
+ | llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 |
364
+ | llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 |
365
+ | llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 |
366
+ | mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 |
367
+ | mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 |
368
+ | mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 |
opencompass/configs/datasets/mmlu/mmlu_all_sets.py ADDED
@@ -0,0 +1,59 @@
+ mmlu_all_sets = [
+     'college_biology',
+     'college_chemistry',
+     'college_computer_science',
+     'college_mathematics',
+     'college_physics',
+     'electrical_engineering',
+     'astronomy',
+     'anatomy',
+     'abstract_algebra',
+     'machine_learning',
+     'clinical_knowledge',
+     'global_facts',
+     'management',
+     'nutrition',
+     'marketing',
+     'professional_accounting',
+     'high_school_geography',
+     'international_law',
+     'moral_scenarios',
+     'computer_security',
+     'high_school_microeconomics',
+     'professional_law',
+     'medical_genetics',
+     'professional_psychology',
+     'jurisprudence',
+     'world_religions',
+     'philosophy',
+     'virology',
+     'high_school_chemistry',
+     'public_relations',
+     'high_school_macroeconomics',
+     'human_sexuality',
+     'elementary_mathematics',
+     'high_school_physics',
+     'high_school_computer_science',
+     'high_school_european_history',
+     'business_ethics',
+     'moral_disputes',
+     'high_school_statistics',
+     'miscellaneous',
+     'formal_logic',
+     'high_school_government_and_politics',
+     'prehistory',
+     'security_studies',
+     'high_school_biology',
+     'logical_fallacies',
+     'high_school_world_history',
+     'professional_medicine',
+     'high_school_mathematics',
+     'college_medicine',
+     'high_school_us_history',
+     'sociology',
+     'econometrics',
+     'high_school_psychology',
+     'human_aging',
+     'us_foreign_policy',
+     'conceptual_physics',
+ ]
opencompass/configs/datasets/mmlu/mmlu_clean_ppl.py ADDED
@@ -0,0 +1,114 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import FixKRetriever
3
+ from opencompass.openicl.icl_inferencer import PPLInferencer
4
+ from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
5
+ from opencompass.datasets import MMLUDatasetClean as MMLUDataset
6
+
7
+ # None of the mmlu datasets in huggingface are correctly parsed, so we use our own dataset reader
8
+ # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
9
+
10
+ mmlu_reader_cfg = dict(
11
+ input_columns=['input', 'A', 'B', 'C', 'D'],
12
+ output_column='target',
13
+ train_split='dev')
14
+
15
+ mmlu_all_sets = [
16
+ 'college_biology',
17
+ 'college_chemistry',
18
+ 'college_computer_science',
19
+ 'college_mathematics',
20
+ 'college_physics',
21
+ 'electrical_engineering',
22
+ 'astronomy',
23
+ 'anatomy',
24
+ 'abstract_algebra',
25
+ 'machine_learning',
26
+ 'clinical_knowledge',
27
+ 'global_facts',
28
+ 'management',
29
+ 'nutrition',
30
+ 'marketing',
31
+ 'professional_accounting',
32
+ 'high_school_geography',
33
+ 'international_law',
34
+ 'moral_scenarios',
35
+ 'computer_security',
36
+ 'high_school_microeconomics',
37
+ 'professional_law',
38
+ 'medical_genetics',
39
+ 'professional_psychology',
40
+ 'jurisprudence',
41
+ 'world_religions',
42
+ 'philosophy',
43
+ 'virology',
44
+ 'high_school_chemistry',
45
+ 'public_relations',
46
+ 'high_school_macroeconomics',
47
+ 'human_sexuality',
48
+ 'elementary_mathematics',
49
+ 'high_school_physics',
50
+ 'high_school_computer_science',
51
+ 'high_school_european_history',
52
+ 'business_ethics',
53
+ 'moral_disputes',
54
+ 'high_school_statistics',
55
+ 'miscellaneous',
56
+ 'formal_logic',
57
+ 'high_school_government_and_politics',
58
+ 'prehistory',
59
+ 'security_studies',
60
+ 'high_school_biology',
61
+ 'logical_fallacies',
62
+ 'high_school_world_history',
63
+ 'professional_medicine',
64
+ 'high_school_mathematics',
65
+ 'college_medicine',
66
+ 'high_school_us_history',
67
+ 'sociology',
68
+ 'econometrics',
69
+ 'high_school_psychology',
70
+ 'human_aging',
71
+ 'us_foreign_policy',
72
+ 'conceptual_physics',
73
+ ]
74
+
75
+ mmlu_datasets = []
76
+ for _name in mmlu_all_sets:
77
+ _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
78
+ mmlu_infer_cfg = dict(
79
+ ice_template=dict(
80
+ type=PromptTemplate,
81
+ template={
82
+ opt:
83
+ f'{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}\n'
84
+ for opt in ['A', 'B', 'C', 'D']
85
+ },
86
+ ),
87
+ prompt_template=dict(
88
+ type=PromptTemplate,
89
+ template={
90
+ opt:
91
+ f'{_hint}</E>{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}'
92
+ for opt in ['A', 'B', 'C', 'D']
93
+ },
94
+ ice_token='</E>',
95
+ ),
96
+ retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
97
+ inferencer=dict(type=PPLInferencer),
98
+ )
99
+
100
+ mmlu_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
101
+ analyze_contamination=True)
102
+
103
+ mmlu_datasets.append(
104
+ dict(
105
+ abbr=f'lukaemon_mmlu_{_name}',
106
+ type=MMLUDataset,
107
+ path='opencompass/mmlu',
108
+ name=_name,
109
+ reader_cfg=mmlu_reader_cfg,
110
+ infer_cfg=mmlu_infer_cfg,
111
+ eval_cfg=mmlu_eval_cfg,
112
+ ))
113
+
114
+ del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .mmlu_gen_4d595a import mmlu_datasets  # noqa: F401, F403
opencompass/configs/datasets/mmlu/mmlu_gen_23a9a9.py ADDED
@@ -0,0 +1,124 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import FixKRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import MMLUDataset
6
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
7
+
8
+ # None of the mmlu datasets in huggingface are correctly parsed, so we use our own dataset reader
9
+ # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
10
+
11
+ mmlu_reader_cfg = dict(
12
+ input_columns=['input', 'A', 'B', 'C', 'D'],
13
+ output_column='target',
14
+ train_split='dev')
15
+
16
+ mmlu_prompt_template = dict(
17
+ type='PromptTemplate',
18
+ template=None,
19
+ ice_token='</E>')
20
+
21
+ mmlu_infer_cfg = dict(
22
+ ice_template=dict(
23
+ type=PromptTemplate,
24
+ template=dict(round=[
25
+ dict(
26
+ role='HUMAN',
27
+ prompt='{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '
28
+ ),
29
+ dict(role='BOT', prompt='{target}\n')
30
+ ])),
31
+ prompt_template=mmlu_prompt_template,
32
+ retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
33
+ inferencer=dict(type=GenInferencer))
34
+
35
+ mmlu_eval_cfg = dict(
36
+ evaluator=dict(type=AccEvaluator),
37
+ pred_postprocessor=dict(type=first_capital_postprocess))
38
+
39
+ mmlu_all_sets = [
40
+ 'college_biology',
41
+ 'college_chemistry',
42
+ 'college_computer_science',
43
+ 'college_mathematics',
44
+ 'college_physics',
45
+ 'electrical_engineering',
46
+ 'astronomy',
47
+ 'anatomy',
48
+ 'abstract_algebra',
49
+ 'machine_learning',
50
+ 'clinical_knowledge',
51
+ 'global_facts',
52
+ 'management',
53
+ 'nutrition',
54
+ 'marketing',
55
+ 'professional_accounting',
56
+ 'high_school_geography',
57
+ 'international_law',
58
+ 'moral_scenarios',
59
+ 'computer_security',
60
+ 'high_school_microeconomics',
61
+ 'professional_law',
62
+ 'medical_genetics',
63
+ 'professional_psychology',
64
+ 'jurisprudence',
65
+ 'world_religions',
66
+ 'philosophy',
67
+ 'virology',
68
+ 'high_school_chemistry',
69
+ 'public_relations',
70
+ 'high_school_macroeconomics',
71
+ 'human_sexuality',
72
+ 'elementary_mathematics',
73
+ 'high_school_physics',
74
+ 'high_school_computer_science',
75
+ 'high_school_european_history',
76
+ 'business_ethics',
77
+ 'moral_disputes',
78
+ 'high_school_statistics',
79
+ 'miscellaneous',
80
+ 'formal_logic',
81
+ 'high_school_government_and_politics',
82
+ 'prehistory',
83
+ 'security_studies',
84
+ 'high_school_biology',
85
+ 'logical_fallacies',
86
+ 'high_school_world_history',
87
+ 'professional_medicine',
88
+ 'high_school_mathematics',
89
+ 'college_medicine',
90
+ 'high_school_us_history',
91
+ 'sociology',
92
+ 'econometrics',
93
+ 'high_school_psychology',
94
+ 'human_aging',
95
+ 'us_foreign_policy',
96
+ 'conceptual_physics',
97
+ ]
98
+
99
+ mmlu_datasets = []
100
+ for _name in mmlu_all_sets:
101
+ mmlu_datasets.append(
102
+ dict(
103
+ abbr=f'lukaemon_mmlu_{_name}',
104
+ type=MMLUDataset,
105
+ path='opencompass/mmlu',
106
+ name=_name,
107
+ reader_cfg=mmlu_reader_cfg,
108
+ infer_cfg=mmlu_infer_cfg.copy(),
109
+ eval_cfg=mmlu_eval_cfg))
110
+
111
+ mmlu_datasets[-1]['infer_cfg'][
112
+ 'prompt_template'] = mmlu_prompt_template.copy()
113
+ mmlu_datasets[-1]['infer_cfg']['prompt_template']['template'] = \
114
+ dict(
115
+ begin=[
116
+ dict(role='SYSTEM', fallback_role='HUMAN', prompt=f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.'),
117
+ '</E>',
118
+ ],
119
+ round=[
120
+ dict(role='HUMAN', prompt='{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '),
121
+ ]
122
+ )
123
+
124
+ del _name
opencompass/configs/datasets/mmlu/mmlu_gen_4d595a.py ADDED
@@ -0,0 +1,123 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import FixKRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
5
+ from opencompass.datasets import MMLUDataset
6
+ from opencompass.utils.text_postprocessors import first_option_postprocess
7
+
8
+ # None of the mmlu datasets in huggingface are correctly parsed, so we use our own dataset reader
9
+ # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
10
+
11
+ mmlu_reader_cfg = dict(
12
+ input_columns=['input', 'A', 'B', 'C', 'D'],
13
+ output_column='target',
14
+ train_split='dev')
15
+
16
+ mmlu_all_sets = [
17
+ 'college_biology',
18
+ 'college_chemistry',
19
+ 'college_computer_science',
20
+ 'college_mathematics',
21
+ 'college_physics',
22
+ 'electrical_engineering',
23
+ 'astronomy',
24
+ 'anatomy',
25
+ 'abstract_algebra',
26
+ 'machine_learning',
27
+ 'clinical_knowledge',
28
+ 'global_facts',
29
+ 'management',
30
+ 'nutrition',
31
+ 'marketing',
32
+ 'professional_accounting',
33
+ 'high_school_geography',
34
+ 'international_law',
35
+ 'moral_scenarios',
36
+ 'computer_security',
37
+ 'high_school_microeconomics',
38
+ 'professional_law',
39
+ 'medical_genetics',
40
+ 'professional_psychology',
41
+ 'jurisprudence',
42
+ 'world_religions',
43
+ 'philosophy',
44
+ 'virology',
45
+ 'high_school_chemistry',
46
+ 'public_relations',
47
+ 'high_school_macroeconomics',
48
+ 'human_sexuality',
49
+ 'elementary_mathematics',
50
+ 'high_school_physics',
51
+ 'high_school_computer_science',
52
+ 'high_school_european_history',
53
+ 'business_ethics',
54
+ 'moral_disputes',
55
+ 'high_school_statistics',
56
+ 'miscellaneous',
57
+ 'formal_logic',
58
+ 'high_school_government_and_politics',
59
+ 'prehistory',
60
+ 'security_studies',
61
+ 'high_school_biology',
62
+ 'logical_fallacies',
63
+ 'high_school_world_history',
64
+ 'professional_medicine',
65
+ 'high_school_mathematics',
66
+ 'college_medicine',
67
+ 'high_school_us_history',
68
+ 'sociology',
69
+ 'econometrics',
70
+ 'high_school_psychology',
71
+ 'human_aging',
72
+ 'us_foreign_policy',
73
+ 'conceptual_physics',
74
+ ]
75
+
76
+ mmlu_datasets = []
77
+ for _name in mmlu_all_sets:
78
+ _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
79
+ mmlu_infer_cfg = dict(
80
+ ice_template=dict(
81
+ type=PromptTemplate,
82
+ template=dict(round=[
83
+ dict(
84
+ role='HUMAN',
85
+ prompt=
86
+ f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
87
+ ),
88
+ dict(role='BOT', prompt='{target}\n')
89
+ ]),
90
+ ),
91
+ prompt_template=dict(
92
+ type=PromptTemplate,
93
+ template=dict(
94
+ begin='</E>',
95
+ round=[
96
+ dict(
97
+ role='HUMAN',
98
+ prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
99
+ ),
100
+ ],
101
+ ),
102
+ ice_token='</E>',
103
+ ),
104
+ retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
105
+ inferencer=dict(type=GenInferencer),
106
+ )
107
+
108
+ mmlu_eval_cfg = dict(
109
+ evaluator=dict(type=AccwithDetailsEvaluator),
110
+ pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
111
+
112
+ mmlu_datasets.append(
113
+ dict(
114
+ abbr=f'lukaemon_mmlu_{_name}',
115
+ type=MMLUDataset,
116
+ path='opencompass/mmlu',
117
+ name=_name,
118
+ reader_cfg=mmlu_reader_cfg,
119
+ infer_cfg=mmlu_infer_cfg,
120
+ eval_cfg=mmlu_eval_cfg,
121
+ ))
122
+
123
+ del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_gen_5d1409.py ADDED
@@ -0,0 +1,124 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import FixKRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import MMLUDataset
6
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
7
+
8
+ # None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
9
+ # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
10
+
11
+ mmlu_reader_cfg = dict(
12
+ input_columns=['input', 'A', 'B', 'C', 'D'],
13
+ output_column='target',
14
+ train_split='dev')
15
+
16
+ mmlu_all_sets = [
17
+ 'college_biology',
18
+ 'college_chemistry',
19
+ 'college_computer_science',
20
+ 'college_mathematics',
21
+ 'college_physics',
22
+ 'electrical_engineering',
23
+ 'astronomy',
24
+ 'anatomy',
25
+ 'abstract_algebra',
26
+ 'machine_learning',
27
+ 'clinical_knowledge',
28
+ 'global_facts',
29
+ 'management',
30
+ 'nutrition',
31
+ 'marketing',
32
+ 'professional_accounting',
33
+ 'high_school_geography',
34
+ 'international_law',
35
+ 'moral_scenarios',
36
+ 'computer_security',
37
+ 'high_school_microeconomics',
38
+ 'professional_law',
39
+ 'medical_genetics',
40
+ 'professional_psychology',
41
+ 'jurisprudence',
42
+ 'world_religions',
43
+ 'philosophy',
44
+ 'virology',
45
+ 'high_school_chemistry',
46
+ 'public_relations',
47
+ 'high_school_macroeconomics',
48
+ 'human_sexuality',
49
+ 'elementary_mathematics',
50
+ 'high_school_physics',
51
+ 'high_school_computer_science',
52
+ 'high_school_european_history',
53
+ 'business_ethics',
54
+ 'moral_disputes',
55
+ 'high_school_statistics',
56
+ 'miscellaneous',
57
+ 'formal_logic',
58
+ 'high_school_government_and_politics',
59
+ 'prehistory',
60
+ 'security_studies',
61
+ 'high_school_biology',
62
+ 'logical_fallacies',
63
+ 'high_school_world_history',
64
+ 'professional_medicine',
65
+ 'high_school_mathematics',
66
+ 'college_medicine',
67
+ 'high_school_us_history',
68
+ 'sociology',
69
+ 'econometrics',
70
+ 'high_school_psychology',
71
+ 'human_aging',
72
+ 'us_foreign_policy',
73
+ 'conceptual_physics',
74
+ ]
75
+
76
+ mmlu_datasets = []
77
+ for _name in mmlu_all_sets:
78
+ _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
79
+ mmlu_infer_cfg = dict(
80
+ ice_template=dict(
81
+ type=PromptTemplate,
82
+ template=dict(round=[
83
+ dict(
84
+ role='HUMAN',
85
+ prompt=
86
+ f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
87
+ ),
88
+ dict(role='BOT', prompt='{target}\n')
89
+ ]),
90
+ ),
91
+ prompt_template=dict(
92
+ type=PromptTemplate,
93
+ template=dict(
94
+ begin='</E>',
95
+ round=[
96
+ dict(
97
+ role='HUMAN',
98
+ prompt=
99
+ f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
100
+ ),
101
+ ],
102
+ ),
103
+ ice_token='</E>',
104
+ ),
105
+ retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
106
+ inferencer=dict(type=GenInferencer),
107
+ )
108
+
109
+ mmlu_eval_cfg = dict(
110
+ evaluator=dict(type=AccEvaluator),
111
+ pred_postprocessor=dict(type=first_capital_postprocess))
112
+
113
+ mmlu_datasets.append(
114
+ dict(
115
+ abbr=f'lukaemon_mmlu_{_name}',
116
+ type=MMLUDataset,
117
+ path='opencompass/mmlu',
118
+ name=_name,
119
+ reader_cfg=mmlu_reader_cfg,
120
+ infer_cfg=mmlu_infer_cfg,
121
+ eval_cfg=mmlu_eval_cfg,
122
+ ))
123
+
124
+ del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_gen_79e572.py ADDED
@@ -0,0 +1,110 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import FixKRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import MMLUDataset
6
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
7
+
8
+ # None of the mmlu datasets in huggingface are correctly parsed, so we use our own dataset reader
9
+ # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
10
+ mmlu_reader_cfg = dict(
11
+ input_columns=['input', 'A', 'B', 'C', 'D'],
12
+ output_column='target',
13
+ train_split='dev')
14
+
15
+ mmlu_all_sets = [
16
+ 'college_biology',
17
+ 'college_chemistry',
18
+ 'college_computer_science',
19
+ 'college_mathematics',
20
+ 'college_physics',
21
+ 'electrical_engineering',
22
+ 'astronomy',
23
+ 'anatomy',
24
+ 'abstract_algebra',
25
+ 'machine_learning',
26
+ 'clinical_knowledge',
27
+ 'global_facts',
28
+ 'management',
29
+ 'nutrition',
30
+ 'marketing',
31
+ 'professional_accounting',
32
+ 'high_school_geography',
33
+ 'international_law',
34
+ 'moral_scenarios',
35
+ 'computer_security',
36
+ 'high_school_microeconomics',
37
+ 'professional_law',
38
+ 'medical_genetics',
39
+ 'professional_psychology',
40
+ 'jurisprudence',
41
+ 'world_religions',
42
+ 'philosophy',
43
+ 'virology',
44
+ 'high_school_chemistry',
45
+ 'public_relations',
46
+ 'high_school_macroeconomics',
47
+ 'human_sexuality',
48
+ 'elementary_mathematics',
49
+ 'high_school_physics',
50
+ 'high_school_computer_science',
51
+ 'high_school_european_history',
52
+ 'business_ethics',
53
+ 'moral_disputes',
54
+ 'high_school_statistics',
55
+ 'miscellaneous',
56
+ 'formal_logic',
57
+ 'high_school_government_and_politics',
58
+ 'prehistory',
59
+ 'security_studies',
60
+ 'high_school_biology',
61
+ 'logical_fallacies',
62
+ 'high_school_world_history',
63
+ 'professional_medicine',
64
+ 'high_school_mathematics',
65
+ 'college_medicine',
66
+ 'high_school_us_history',
67
+ 'sociology',
68
+ 'econometrics',
69
+ 'high_school_psychology',
70
+ 'human_aging',
71
+ 'us_foreign_policy',
72
+ 'conceptual_physics',
73
+ ]
74
+
75
+ mmlu_datasets = []
76
+ for _name in mmlu_all_sets:
77
+ _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
78
+ mmlu_infer_cfg = dict(
79
+ ice_template=dict(
80
+ type=PromptTemplate,
81
+ template=
82
+ '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: {target}\n',
83
+ ),
84
+ prompt_template=dict(
85
+ type=PromptTemplate,
86
+ template=
87
+ f'{_hint}</E>{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:',
88
+ ice_token='</E>',
89
+ ),
90
+ retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
91
+ inferencer=dict(type=GenInferencer),
92
+ )
93
+
94
+ mmlu_eval_cfg = dict(
95
+ evaluator=dict(type=AccEvaluator),
96
+ pred_postprocessor=dict(type=first_capital_postprocess),
97
+ )
98
+
99
+ mmlu_datasets.append(
100
+ dict(
101
+ abbr=f'lukaemon_mmlu_{_name}',
102
+ type=MMLUDataset,
103
+ path='opencompass/mmlu',
104
+ name=_name,
105
+ reader_cfg=mmlu_reader_cfg,
106
+ infer_cfg=mmlu_infer_cfg,
107
+ eval_cfg=mmlu_eval_cfg,
108
+ ))
109
+
110
+ del _name, _hint
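For the completion-style variant above, the `</E>` ice_token marks where the five fixed dev-split examples (FixKRetriever with fix_id_list=[0, 1, 2, 3, 4]) are spliced in. A conceptual sketch of that assembly, using dummy rows and the "anatomy" hint as placeholders (not the OpenICL implementation):

    # Conceptual illustration only: render each fixed dev example with the
    # ice_template, concatenate them, and substitute the result for '</E>'.
    ice_template = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: {target}\n'
    prompt_template = (
        'The following are multiple choice questions (with answers) about anatomy.\n\n'
        '</E>{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer:'
    )

    # dummy rows standing in for the dev and test splits
    dev_rows = [dict(input=f'dev question {i}?', A='a', B='b', C='c', D='d', target='A')
                for i in range(5)]
    test_row = dict(input='test question?', A='a', B='b', C='c', D='d')

    ice = ''.join(ice_template.format(**row) for row in dev_rows)
    print(prompt_template.replace('</E>', ice).format(**test_row))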
opencompass/configs/datasets/mmlu/mmlu_gen_a484b3.py ADDED
@@ -0,0 +1,124 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import MMLUDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
+# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
+
+mmlu_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+mmlu_all_sets = [
+    'college_biology',
+    'college_chemistry',
+    'college_computer_science',
+    'college_mathematics',
+    'college_physics',
+    'electrical_engineering',
+    'astronomy',
+    'anatomy',
+    'abstract_algebra',
+    'machine_learning',
+    'clinical_knowledge',
+    'global_facts',
+    'management',
+    'nutrition',
+    'marketing',
+    'professional_accounting',
+    'high_school_geography',
+    'international_law',
+    'moral_scenarios',
+    'computer_security',
+    'high_school_microeconomics',
+    'professional_law',
+    'medical_genetics',
+    'professional_psychology',
+    'jurisprudence',
+    'world_religions',
+    'philosophy',
+    'virology',
+    'high_school_chemistry',
+    'public_relations',
+    'high_school_macroeconomics',
+    'human_sexuality',
+    'elementary_mathematics',
+    'high_school_physics',
+    'high_school_computer_science',
+    'high_school_european_history',
+    'business_ethics',
+    'moral_disputes',
+    'high_school_statistics',
+    'miscellaneous',
+    'formal_logic',
+    'high_school_government_and_politics',
+    'prehistory',
+    'security_studies',
+    'high_school_biology',
+    'logical_fallacies',
+    'high_school_world_history',
+    'professional_medicine',
+    'high_school_mathematics',
+    'college_medicine',
+    'high_school_us_history',
+    'sociology',
+    'econometrics',
+    'high_school_psychology',
+    'human_aging',
+    'us_foreign_policy',
+    'conceptual_physics',
+]
+
+mmlu_datasets = []
+for _name in mmlu_all_sets:
+    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
+    mmlu_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{target}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=
+                        f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    mmlu_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+    mmlu_datasets.append(
+        dict(
+            abbr=f'lukaemon_mmlu_{_name}',
+            type=MMLUDataset,
+            path='opencompass/mmlu',
+            name=_name,
+            reader_cfg=mmlu_reader_cfg,
+            infer_cfg=mmlu_infer_cfg,
+            eval_cfg=mmlu_eval_cfg,
+        ))
+
+del _name, _hint
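The variant above differs from the Q/A-style config mainly in its evaluation step: `first_option_postprocess` scans the model reply for the first standalone option letter rather than just the first capital. A rough, illustrative approximation of that behaviour (the real implementation lives in opencompass.utils.text_postprocessors):

    # Illustration only -- a crude stand-in for 'first option' style postprocessing:
    # take the first standalone A/B/C/D that appears in a free-form reply.
    import re

    def first_option_approx(text: str, options: str = 'ABCD') -> str:
        match = re.search(rf'\b([{options}])\b', text)
        return match.group(1) if match else ''

    print(first_option_approx('The correct answer is B, because ...'))  # -> 'B'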
opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py ADDED
@@ -0,0 +1,141 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import MMLUDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+from opencompass.utils.model_postprocessors import navie_model_postprocess
+from opencompass.utils.postprocessors.naive import OPTION_NAVIE_PROMPT_TEMPLATE
+
+
+# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
+# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
+
+mmlu_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+mmlu_all_sets = [
+    'college_biology',
+    'college_chemistry',
+    'college_computer_science',
+    'college_mathematics',
+    'college_physics',
+    'electrical_engineering',
+    'astronomy',
+    'anatomy',
+    'abstract_algebra',
+    'machine_learning',
+    'clinical_knowledge',
+    'global_facts',
+    'management',
+    'nutrition',
+    'marketing',
+    'professional_accounting',
+    'high_school_geography',
+    'international_law',
+    'moral_scenarios',
+    'computer_security',
+    'high_school_microeconomics',
+    'professional_law',
+    'medical_genetics',
+    'professional_psychology',
+    'jurisprudence',
+    'world_religions',
+    'philosophy',
+    'virology',
+    'high_school_chemistry',
+    'public_relations',
+    'high_school_macroeconomics',
+    'human_sexuality',
+    'elementary_mathematics',
+    'high_school_physics',
+    'high_school_computer_science',
+    'high_school_european_history',
+    'business_ethics',
+    'moral_disputes',
+    'high_school_statistics',
+    'miscellaneous',
+    'formal_logic',
+    'high_school_government_and_politics',
+    'prehistory',
+    'security_studies',
+    'high_school_biology',
+    'logical_fallacies',
+    'high_school_world_history',
+    'professional_medicine',
+    'high_school_mathematics',
+    'college_medicine',
+    'high_school_us_history',
+    'sociology',
+    'econometrics',
+    'high_school_psychology',
+    'human_aging',
+    'us_foreign_policy',
+    'conceptual_physics',
+]
+
+mmlu_datasets = []
+for _name in mmlu_all_sets:
+    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
+    mmlu_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{target}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    # # You can write your own postprocess prompt like:
+    # MMLU_NAVIE_PROMPT_TEMPLATE = """
+    # There is a detailed explanation of the final answer you should extract:
+    # 1. ...
+    # 2. ...
+    # ...
+    # """
+
+    mmlu_eval_cfg = dict(
+        evaluator=dict(type=AccwithDetailsEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
+        model_postprocessor=dict(
+            type=navie_model_postprocess,
+            custom_instruction=OPTION_NAVIE_PROMPT_TEMPLATE,
+            model_name='',
+            api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
+    )
+
+
+    mmlu_datasets.append(
+        dict(
+            abbr=f'lukaemon_mmlu_{_name}',
+            type=MMLUDataset,
+            path='opencompass/mmlu',
+            name=_name,
+            reader_cfg=mmlu_reader_cfg,
+            infer_cfg=mmlu_infer_cfg,
+            eval_cfg=mmlu_eval_cfg,
+        ))
+
+del _name, _hint
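The model-postprocess variant above additionally routes raw predictions through an OpenAI-compatible endpoint (`navie_model_postprocess`) to extract the chosen option; `model_name` is left empty and `api_url` points at 0.0.0.0 placeholders, so both are typically overridden before a run. A minimal sketch of such an override, appended after the config (the model name and URL below are assumptions, not values from this commit):

    # Illustration only: point the model postprocessor at a reachable
    # OpenAI-compatible server before launching an evaluation.
    for _d in mmlu_datasets:
        _d['eval_cfg']['model_postprocessor'].update(
            model_name='your-extractor-model',    # hypothetical judge/extractor model
            api_url='http://127.0.0.1:23333/v1',  # hypothetical local endpoint
        )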
opencompass/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py ADDED
@@ -0,0 +1,59 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import MMLUDataset
+from opencompass.utils.text_postprocessors import match_answer_pattern
+
+with read_base():
+    from .mmlu_all_sets import mmlu_all_sets
+
+# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
+# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
+
+{input}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+mmlu_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+mmlu_datasets = []
+for name in mmlu_all_sets:
+    mmlu_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+                ],
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    mmlu_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'))
+
+    mmlu_datasets.append(
+        dict(
+            abbr=f'lukaemon_mmlu_{name}',
+            type=MMLUDataset,
+            path='opencompass/mmlu',
+            name=name,
+            reader_cfg=mmlu_reader_cfg,
+            infer_cfg=mmlu_infer_cfg,
+            eval_cfg=mmlu_eval_cfg,
+        ))
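The simple-evals style config above asks the model for a final 'ANSWER: $LETTER' line and scores it with `match_answer_pattern`. A quick standalone check (plain `re`) that the answer_pattern it uses pulls the letter out of a chain-of-thought style reply:

    import re

    reply = 'Let us think step by step... the other options fail, so\nANSWER: C'
    match = re.search(r'(?i)ANSWER\s*:\s*([A-D])', reply)
    print(match.group(1) if match else 'no match')  # -> 'C'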
opencompass/configs/datasets/mmlu/mmlu_ppl.py ADDED
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .mmlu_ppl_ac766d import mmlu_datasets  # noqa: F401, F403
opencompass/configs/datasets/mmlu/mmlu_ppl_ac766d.py ADDED
@@ -0,0 +1,106 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import MMLUDataset
+
+# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
+# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
+
+mmlu_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+mmlu_all_sets = [
+    'college_biology',
+    'college_chemistry',
+    'college_computer_science',
+    'college_mathematics',
+    'college_physics',
+    'electrical_engineering',
+    'astronomy',
+    'anatomy',
+    'abstract_algebra',
+    'machine_learning',
+    'clinical_knowledge',
+    'global_facts',
+    'management',
+    'nutrition',
+    'marketing',
+    'professional_accounting',
+    'high_school_geography',
+    'international_law',
+    'moral_scenarios',
+    'computer_security',
+    'high_school_microeconomics',
+    'professional_law',
+    'medical_genetics',
+    'professional_psychology',
+    'jurisprudence',
+    'world_religions',
+    'philosophy',
+    'virology',
+    'high_school_chemistry',
+    'public_relations',
+    'high_school_macroeconomics',
+    'human_sexuality',
+    'elementary_mathematics',
+    'high_school_physics',
+    'high_school_computer_science',
+    'high_school_european_history',
+    'business_ethics',
+    'moral_disputes',
+    'high_school_statistics',
+    'miscellaneous',
+    'formal_logic',
+    'high_school_government_and_politics',
+    'prehistory',
+    'security_studies',
+    'high_school_biology',
+    'logical_fallacies',
+    'high_school_world_history',
+    'professional_medicine',
+    'high_school_mathematics',
+    'college_medicine',
+    'high_school_us_history',
+    'sociology',
+    'econometrics',
+    'high_school_psychology',
+    'human_aging',
+    'us_foreign_policy',
+    'conceptual_physics',
+]
+
+mmlu_datasets = []
+for _name in mmlu_all_sets:
+    _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
+    question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
+    mmlu_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template={opt: f'{question_overall}\nAnswer: {opt}\n' for opt in ['A', 'B', 'C', 'D']},
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template={opt: f'{_hint}</E>{question_overall}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=PPLInferencer),
+    )
+
+    mmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator), )
+
+    mmlu_datasets.append(
+        dict(
+            abbr=f'lukaemon_mmlu_{_name}',
+            type=MMLUDataset,
+            path='opencompass/mmlu',
+            name=_name,
+            reader_cfg=mmlu_reader_cfg,
+            infer_cfg=mmlu_infer_cfg,
+            eval_cfg=mmlu_eval_cfg,
+        ))
+
+del _name, _hint
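Unlike the generation-based configs, the PPL config above renders one candidate text per option and lets `PPLInferencer` pick the option the model scores as most likely. A conceptual sketch of that selection rule, with a toy scorer standing in for a real language model (not the PPLInferencer implementation):

    # Conceptual sketch: choose the candidate with the lowest average
    # negative log-likelihood (i.e. lowest perplexity).
    def pick_by_ppl(candidates, score_fn):
        # score_fn(text) -> average negative log-likelihood; lower means more likely
        return min(candidates, key=lambda opt: score_fn(candidates[opt]))

    toy_nll = {'A': 2.31, 'B': 1.87, 'C': 2.90, 'D': 2.45}  # dummy scores
    candidates = {opt: f'...question text...\nAnswer: {opt}' for opt in 'ABCD'}
    print(pick_by_ppl(candidates, lambda text: toy_nll[text[-1]]))  # -> 'B'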