Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- opencompass/configs/datasets/ARC_c/ARC_c_clean_ppl.py +55 -0
- opencompass/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py +53 -0
- opencompass/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py +48 -0
- opencompass/configs/datasets/ARC_c/ARC_c_few_shot_ppl.py +63 -0
- opencompass/configs/datasets/ARC_c/ARC_c_gen.py +4 -0
- opencompass/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py +44 -0
- opencompass/configs/datasets/ARC_c/ARC_c_ppl.py +4 -0
- opencompass/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py +37 -0
- opencompass/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py +54 -0
- opencompass/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py +36 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py +4 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py +51 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl.py +4 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py +45 -0
- opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_acccb5.py +39 -0
- opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py +4 -0
- opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py +29 -0
- opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_30dea0.py +42 -0
- opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py +35 -0
- opencompass/configs/datasets/humaneval/README.md +69 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py +36 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py +36 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py +36 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py +33 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py +31 -0
- opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py +41 -0
- opencompass/configs/datasets/humaneval/humaneval_gen.py +4 -0
- opencompass/configs/datasets/humaneval/humaneval_gen_66a7f4.py +35 -0
- opencompass/configs/datasets/humaneval/humaneval_gen_8e312c.py +37 -0
- opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py +36 -0
- opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py +36 -0
- opencompass/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py +37 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py +4 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py +35 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py +37 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py +36 -0
- opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py +37 -0
- opencompass/configs/datasets/mmlu/README.md +368 -0
- opencompass/configs/datasets/mmlu/mmlu_all_sets.py +59 -0
- opencompass/configs/datasets/mmlu/mmlu_clean_ppl.py +114 -0
- opencompass/configs/datasets/mmlu/mmlu_gen.py +4 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_23a9a9.py +124 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_4d595a.py +123 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_5d1409.py +124 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_79e572.py +110 -0
- opencompass/configs/datasets/mmlu/mmlu_gen_a484b3.py +124 -0
- opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py +141 -0
- opencompass/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py +59 -0
- opencompass/configs/datasets/mmlu/mmlu_ppl.py +4 -0
- opencompass/configs/datasets/mmlu/mmlu_ppl_ac766d.py +106 -0
opencompass/configs/datasets/ARC_c/ARC_c_clean_ppl.py
ADDED
@@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import ARCDatasetClean as ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textA}')
                ], ),
            'B': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textB}')
                ], ),
            'C': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textC}')
                ], ),
            'D': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textD}')
                ], ),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
                      analyze_contamination=True)

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c-test',
        path='opencompass/ai2_arc-test',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
opencompass/configs/datasets/ARC_c/ARC_c_cot_gen_926652.py
ADDED
@@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess, match_answer_pattern

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{question}

A. {textA}
B. {textB}
C. {textC}
D. {textD}
""".strip()

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE)
            ], ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

ARC_c_datasets = [
    dict(
        abbr='ARC-c',
        type=ARCDataset,
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_few_shot_gen_e9b043.py
ADDED
@@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey',
)

ARC_c_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin='</E>',
            round=[
                dict(
                    role='HUMAN',
                    prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:',
                ),
                dict(role='BOT', prompt='{answerKey}'),
            ],
        ),
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer, max_out_len=50),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

ARC_c_datasets = [
    dict(
        abbr='ARC-c',
        type=ARCDataset,
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_few_shot_ppl.py
ADDED
@@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey',
)

ARC_c_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template={
            'A': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textA}'),
                ],
            ),
            'B': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textB}'),
                ],
            ),
            'C': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textC}'),
                ],
            ),
            'D': dict(
                begin='</E>',
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textD}'),
                ],
            ),
        },
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=PPLInferencer),
)

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ARC_c_gen_1e0de5 import ARC_c_datasets  # noqa: F401, F403
opencompass/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py
ADDED
@@ -0,0 +1,44 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:'
                )
            ], ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

ARC_c_datasets = [
    dict(
        abbr='ARC-c',
        type=ARCDataset,
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
opencompass/configs/datasets/ARC_c/ARC_c_ppl.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ARC_c_ppl_a450bd import ARC_c_datasets  # noqa: F401, F403
opencompass/configs/datasets/ARC_c/ARC_c_ppl_2ef631.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            opt: dict(
                round=[
                    dict(role='HUMAN', prompt=f'{{question}}\nA. {{textA}}\nB. {{textB}}\nC. {{textC}}\nD. {{textD}}'),
                    dict(role='BOT', prompt=f'Answer: {opt}'),
                ]
            ) for opt in ['A', 'B', 'C', 'D']
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
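For orientation only (this sketch is not part of the committed file): the option-keyed comprehension above expands to one candidate dialogue per answer letter, which a perplexity-based inferencer such as `PPLInferencer` then scores to pick the most likely completion.

```python
# Illustrative expansion of the comprehension-built template above; the escaped
# f-string braces produce literal '{question}'/'{textA}' placeholders.
question_block = '{question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}'
template = {
    'A': dict(round=[dict(role='HUMAN', prompt=question_block),
                     dict(role='BOT', prompt='Answer: A')]),
    'B': dict(round=[dict(role='HUMAN', prompt=question_block),
                     dict(role='BOT', prompt='Answer: B')]),
    'C': dict(round=[dict(role='HUMAN', prompt=question_block),
                     dict(role='BOT', prompt='Answer: C')]),
    'D': dict(round=[dict(role='HUMAN', prompt=question_block),
                     dict(role='BOT', prompt='Answer: D')]),
}
```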
opencompass/configs/datasets/ARC_c/ARC_c_ppl_a450bd.py
ADDED
@@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textA}')
                ], ),
            'B': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textB}')
                ], ),
            'C': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textC}')
                ], ),
            'D': dict(
                round=[
                    dict(role='HUMAN', prompt='Question: {question}\nAnswer: '),
                    dict(role='BOT', prompt='{textD}')
                ], ),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
opencompass/configs/datasets/ARC_c/ARC_c_ppl_d52a21.py
ADDED
@@ -0,0 +1,36 @@
from mmengine.config import read_base
# with read_base():
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'A': 'Question: {question}\nAnswer: {textA}',
            'B': 'Question: {question}\nAnswer: {textB}',
            'C': 'Question: {question}\nAnswer: {textC}',
            'D': 'Question: {question}\nAnswer: {textD}'
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='opencompass/ai2_arc-dev',
        name='ARC-Challenge',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .FewCLUE_chid_gen_0a29a2 import chid_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py
ADDED
@@ -0,0 +1,51 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess

chid_reader_cfg = dict(
    input_columns=['content','A','B','C','D','E','F','G'],
    output_column='answer',
)

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{content}\n请选择______处所填的词\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nF. {F}\nG. {G}\n请从”A“,”B“,”C“,”D“,”E“,”F“,”G“中进行选择。答:',
                ),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

chid_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=first_capital_postprocess),
)

chid_datasets = [
    dict(
        abbr='chid-dev',
        type=CHIDDatasetV2,
        path='./data/FewCLUE/chid/dev_few_all.json',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg,
    ),
    dict(
        abbr='chid-test',
        type=CHIDDatasetV2,
        path='./data/FewCLUE/chid/test_public.json',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg,
    ),
]
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .FewCLUE_chid_ppl_8f2872 import chid_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_8f2872.py
ADDED
@@ -0,0 +1,45 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDataset

chid_reader_cfg = dict(
    input_columns=[f'content{i}' for i in range(7)], output_column='answer')

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            i: dict(
                round=[
                    dict(role='HUMAN', prompt=f'以下句子是否通顺?\n{{content{i}}}'),
                    dict(role='BOT', prompt='这个句子是通顺的。'),
                ], )
            for i in range(7)
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

chid_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

chid_datasets = [
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-dev',
        data_files='./data/FewCLUE/chid/dev_few_all.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-test',
        data_files='./data/FewCLUE/chid/test_public.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
]
opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_acccb5.py
ADDED
@@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDataset

chid_reader_cfg = dict(
    input_columns=[f'content{i}' for i in range(7)], output_column='answer')

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={i: f'以下句子是否通顺?\n{{content{i}}}\n这个句子是通顺的。'
                  for i in range(7)}),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

chid_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

chid_datasets = [
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-dev',
        data_files='./data/FewCLUE/chid/dev_few_all.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
    dict(
        type=CHIDDataset,
        path='json',
        abbr='chid-test',
        data_files='./data/FewCLUE/chid/test_public.json',
        split='train',
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg),
]
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets  # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py
ADDED
@@ -0,0 +1,29 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset, ReCoRD_postprocess

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Passage:{text}\nResult:{question}\nQuestion: What entity does ____ refer to in the result?Give me the entity name:'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator), pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_30dea0.py
ADDED
@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'],
    output_column='answers',
)

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Passage: {text}\nResult: {question}\nQuestion: What entity does ____ refer to in the result? Give me the entity name:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type='ReCoRD'),
)

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg,
    )
]
opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py
ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDatasetV2, ReCoRD_postprocess

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Passage:\n{text}\nResult:\n{question}\nQuestion:\nWhat entity does ____ refer to in the Result?\nAnswer:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDatasetV2,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
opencompass/configs/datasets/humaneval/README.md
ADDED
@@ -0,0 +1,69 @@
# HumanEval

```bash
python3 run.py --models hf_internlm2_7b --datasets deprecated_humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
```

## Base Models

| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |

## Chat Models

| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |
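Besides the CLI invocations shown in the README above, these dataset fragments can also be pulled into a run config through mmengine's `read_base`, the same pattern the `*_gen.py` aggregators in this commit use. The sketch below is illustrative only: the dataset import mirrors a file added in this commit, while the file name and the model config are placeholder assumptions.

```python
# Minimal sketch of a run config (hypothetical file, e.g. eval_humaneval.py,
# passed as `python run.py eval_humaneval.py`).
from mmengine.config import read_base

with read_base():
    # Dataset list defined in this commit:
    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
        humaneval_datasets
    # Placeholder: additionally import a `models` list from a model config of your choice.

datasets = humaneval_datasets
```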
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_4a6eef.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a Python script for this problem:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_6d1cc2.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following python function.:\n{prompt}\n\n### Response:\n'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_a82cae.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_d2537e.py
ADDED
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='Complete the following python code:\n{prompt}',
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_fd5822.py
ADDED
@@ -0,0 +1,31 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{prompt}'),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/deprecated_humaneval_gen_ff7054.py
ADDED
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            begin=[
                dict(
                    role='SYSTEM',
                    fallback_role='HUMAN',
                    prompt='Complete the following python code:'),
            ],
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, F403
opencompass/configs/datasets/humaneval/humaneval_gen_66a7f4.py
ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')

HUMANEVAL_TEMPLATE = dict(
    round=[
        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
    ]
)

humaneval_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    k=[1, 10, 100],
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg,
    )
]
opencompass/configs/datasets/humaneval/humaneval_gen_8e312c.py
ADDED
@@ -0,0 +1,37 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_gen_159614.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_passk_gen_8e312c.py
ADDED
@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval/humaneval_repeat10_gen_8e312c.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_datasets = [
    dict(
        abbr='openai_humaneval_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        num_repeats=10,
        reader_cfg=humaneval_reader_cfg,
        infer_cfg=humaneval_infer_cfg,
        eval_cfg=humaneval_eval_cfg)
]
opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .humaneval_plus_gen_8e312c import humaneval_plus_datasets  # noqa: F401, F403
opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_66a7f4.py
ADDED
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')

HUMANEVAL_TEMPLATE = dict(
    round=[
        dict(role='HUMAN', prompt='You are an intelligent programming assistant to produce Python algorithmic solutions.\nCan you complete the following Python function?\n```python\n{prompt}\n```'),
    ]
)

humaneval_plus_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template=HUMANEVAL_TEMPLATE),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalPlusEvaluator),
    k=[1, 10, 100],
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg,
    )
]
opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen_8e312c.py
ADDED
@@ -0,0 +1,37 @@
# THIS SHALL ALSO BE DEPRECATED
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalPlusEvaluator),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
opencompass/configs/datasets/humaneval_plus/humaneval_plus_passk_gen_8e312c.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus_passk',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
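The `k=[1, 10, 100]` field tells the evaluator which pass@k values to report once multiple samples per problem are available. For background (this is the well-known estimator from the original HumanEval paper, not code taken from this diff), the unbiased pass@k computation looks roughly like this:

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k given n samples per problem, of which c pass the tests."""
    if n - c < k:
        return 1.0
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

# e.g. 10 samples, 3 of them correct
print(pass_at_k(n=10, c=3, k=1))   # probability that a single draw passes
print(pass_at_k(n=10, c=3, k=10))  # with all 10 draws this is 1.0
```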
opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat10_gen_8e312c.py
ADDED
@@ -0,0 +1,37 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2

humaneval_plus_reader_cfg = dict(
    input_columns=['prompt'], output_column='task_id', train_split='test')

# TODO: allow empty output-column
humaneval_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='Complete the following python code:\n{prompt}'),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

humaneval_plus_eval_cfg = dict(
    evaluator=dict(type=HumanEvalEvaluator, metric='EvalPlus'),
    pred_role='BOT',
    k=[1, 10, 100],  # the parameter only for humaneval
    pred_postprocessor=dict(type=humaneval_postprocess_v2),
)

humaneval_plus_datasets = [
    dict(
        abbr='humaneval_plus_repeat10',
        type=HumanevalDataset,
        path='opencompass/humaneval',
        num_repeats=10,
        reader_cfg=humaneval_plus_reader_cfg,
        infer_cfg=humaneval_plus_infer_cfg,
        eval_cfg=humaneval_plus_eval_cfg)
]
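The only difference from the pass@k config above is `num_repeats=10`, which, as the name suggests, repeats every problem ten times so that ten completions are collected per task; those repeated samples are what the pass@k estimator sketched above is computed over. With ten samples per problem, pass@1 and pass@10 are the meaningful numbers, while a pass@100 estimate is not informative for this config.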
opencompass/configs/datasets/mmlu/README.md
ADDED
@@ -0,0 +1,368 @@
# MMLU

```bash
python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
```

## Base Models

| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|
11 |
+
|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
|
12 |
+
| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 |
|
13 |
+
| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 |
|
14 |
+
| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 |
|
15 |
+
| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 |
|
16 |
+
| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 |
|
17 |
+
| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 |
|
18 |
+
| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 |
|
19 |
+
| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 |
|
20 |
+
| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 |
|
21 |
+
| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 |
|
22 |
+
| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 |
|
23 |
+
| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 |
|
24 |
+
| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 |
|
25 |
+
| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 |
|
26 |
+
| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 |
|
27 |
+
| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 |
|
28 |
+
| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 |
|
29 |
+
| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 |
|
30 |
+
| qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 |
|
31 |
+
| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 |
|
32 |
+
| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 |
|
33 |
+
| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 |
|
34 |
+
| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 |
|
35 |
+
| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 |
|
36 |
+
| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 |
|
37 |
+
| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 |
|
38 |
+
| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 |
|
39 |
+
| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 |
|
40 |
+
| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 |
|
41 |
+
| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 |
|
42 |
+
| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 |
|
43 |
+
| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 |
|
44 |
+
|
45 |
+
### Details
|
46 |
+
|
47 |
+
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|
48 |
+
|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
|
49 |
+
| llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 |
|
50 |
+
| llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 |
|
51 |
+
| llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 |
|
52 |
+
| llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 |
|
53 |
+
| llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 |
|
54 |
+
| llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 |
|
55 |
+
| llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 |
|
56 |
+
| llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 |
|
57 |
+
| llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 |
|
58 |
+
| internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 |
|
59 |
+
| internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 |
|
60 |
+
| internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 |
|
61 |
+
| qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 |
|
62 |
+
| qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 |
|
63 |
+
| qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 |
|
64 |
+
| qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 |
|
65 |
+
| qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 |
|
66 |
+
| qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 | 31.25 | 53.96 | 37.00 |
|
67 |
+
| qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 |
|
68 |
+
| qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 |
|
69 |
+
| qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 |
|
70 |
+
| qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 |
|
71 |
+
| qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 |
|
72 |
+
| qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 |
|
73 |
+
| mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 |
|
74 |
+
| mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 |
|
75 |
+
| mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 |
|
76 |
+
| mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 |
|
77 |
+
| yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 |
|
78 |
+
| yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 |
|
79 |
+
| deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 |
|
80 |
+
| deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 |
|
81 |
+
|
82 |
+
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|
83 |
+
|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
|
84 |
+
| llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 |
|
85 |
+
| llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 |
|
86 |
+
| llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 |
|
87 |
+
| llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 |
|
88 |
+
| llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 |
|
89 |
+
| llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 |
|
90 |
+
| llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 |
|
91 |
+
| llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 |
|
92 |
+
| llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 |
|
93 |
+
| internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 |
|
94 |
+
| internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 | 65.03 |
|
95 |
+
| internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 |
|
96 |
+
| qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 |
|
97 |
+
| qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 |
|
98 |
+
| qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 |
|
99 |
+
| qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 |
|
100 |
+
| qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 |
|
101 |
+
| qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 |
|
102 |
+
| qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 |
|
103 |
+
| qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 |
|
104 |
+
| qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 |
|
105 |
+
| qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 |
|
106 |
+
| qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 |
|
107 |
+
| qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 |
|
108 |
+
| mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 |
|
109 |
+
| mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 |
|
110 |
+
| mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 |
|
111 |
+
| mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 |
|
112 |
+
| yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 |
|
113 |
+
| yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 |
|
114 |
+
| deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 |
|
115 |
+
| deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 |
|
116 |
+
|
117 |
+
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|
118 |
+
|:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
|
119 |
+
| llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 |
|
120 |
+
| llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 |
|
121 |
+
| llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 |
|
122 |
+
| llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 76.36 |
|
123 |
+
| llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 |
|
124 |
+
| llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 |
|
125 |
+
| llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 |
|
126 |
+
| llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 |
|
127 |
+
| llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 |
|
128 |
+
| internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 |
|
129 |
+
| internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 |
|
130 |
+
| internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 |
|
131 |
+
| qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 |
|
132 |
+
| qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 |
|
133 |
+
| qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 |
|
134 |
+
| qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 |
|
135 |
+
| qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 |
|
136 |
+
| qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 |
|
137 |
+
| qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 |
|
138 |
+
| qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 |
|
139 |
+
| qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 |
|
140 |
+
| qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 |
|
141 |
+
| qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 |
|
142 |
+
| qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 |
|
143 |
+
| mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 |
|
144 |
+
| mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 |
|
145 |
+
| mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 |
|
146 |
+
| mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 |
|
147 |
+
| yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 |
|
148 |
+
| yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 |
|
149 |
+
| deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 |
|
150 |
+
| deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 |
|
151 |
+
|
152 |
+
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|
153 |
+
|:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
|
154 |
+
| llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 |
|
155 |
+
| llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 |
|
156 |
+
| llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 |
|
157 |
+
| llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 |
|
158 |
+
| llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 |
|
159 |
+
| llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 |
|
160 |
+
| llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 |
|
161 |
+
| llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 |
|
162 |
+
| llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 |
|
163 |
+
| internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 |
|
164 |
+
| internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 |
|
165 |
+
| internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 |
|
166 |
+
| qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 |
|
167 |
+
| qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 |
|
168 |
+
| qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 |
|
169 |
+
| qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 |
|
170 |
+
| qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 |
|
171 |
+
| qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 |
|
172 |
+
| qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 |
|
173 |
+
| qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 |
|
174 |
+
| qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 |
|
175 |
+
| qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 |
|
176 |
+
| qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 |
|
177 |
+
| qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 |
|
178 |
+
| mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 |
|
179 |
+
| mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 |
|
180 |
+
| mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 | 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 |
|
181 |
+
| mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 |
|
182 |
+
| yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 |
|
183 |
+
| yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 |
|
184 |
+
| deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 |
|
185 |
+
| deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 |
|
186 |
+
|
187 |
+
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|
188 |
+
|:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
|
189 |
+
| llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 |
|
190 |
+
| llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 |
|
191 |
+
| llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 |
|
192 |
+
| llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 |
|
193 |
+
| llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 |
|
194 |
+
| llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 |
|
195 |
+
| llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 |
|
196 |
+
| llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 |
|
197 |
+
| llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 |
|
198 |
+
| internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 |
|
199 |
+
| internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 |
|
200 |
+
| internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 |
|
201 |
+
| qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 |
|
202 |
+
| qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 |
|
203 |
+
| qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 |
|
204 |
+
| qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 |
|
205 |
+
| qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 |
|
206 |
+
| qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 |
|
207 |
+
| qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 |
|
208 |
+
| qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 |
|
209 |
+
| qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 |
|
210 |
+
| qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 |
|
211 |
+
| qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 |
|
212 |
+
| qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 |
|
213 |
+
| mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 |
|
214 |
+
| mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 |
|
215 |
+
| mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 |
|
216 |
+
| mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 |
|
217 |
+
| yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 35.96 | 82.94 | 67.26 | 86.00 |
|
218 |
+
| yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 |
|
219 |
+
| deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 |
|
220 |
+
| deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 |
|
221 |
+
|
222 |
+
## Chat Models
|
223 |
+
|
224 |
+
| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|
225 |
+
|:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
|
226 |
+
| qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 |
|
227 |
+
| qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 |
|
228 |
+
| qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 |
|
229 |
+
| qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 |
|
230 |
+
| qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 |
|
231 |
+
| qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 |
|
232 |
+
| qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 |
|
233 |
+
| qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 |
|
234 |
+
| internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 |
|
235 |
+
| internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 |
|
236 |
+
| internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 |
|
237 |
+
| internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 |
|
238 |
+
| internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 |
|
239 |
+
| internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 |
|
240 |
+
| llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 |
|
241 |
+
| llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 |
|
242 |
+
| llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 |
|
243 |
+
| llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 |
|
244 |
+
| mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 |
|
245 |
+
| mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 |
|
246 |
+
| mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 |
|
247 |
+
|
248 |
+
### Details
|
249 |
+
|
250 |
+
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|
251 |
+
|:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
|
252 |
+
| qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 |
|
253 |
+
| qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 |
|
254 |
+
| qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 |
|
255 |
+
| qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 |
|
256 |
+
| qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 |
|
257 |
+
| qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 |
|
258 |
+
| qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 |
|
259 |
+
| qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 |
|
260 |
+
| internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 30.00 | 33.93 | 53.58 | 28.00 |
|
261 |
+
| internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 |
|
262 |
+
| internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 |
|
263 |
+
| internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 |
|
264 |
+
| internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 |
|
265 |
+
| internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 |
|
266 |
+
| llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 |
|
267 |
+
| llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 |
|
268 |
+
| llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 |
|
269 |
+
| llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 |
|
270 |
+
| mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 |
|
271 |
+
| mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 |
|
272 |
+
| mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 |
|
273 |
+
|
274 |
+
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|
275 |
+
|:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
|
276 |
+
| qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 |
|
277 |
+
| qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 |
|
278 |
+
| qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 |
|
279 |
+
| qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 |
|
280 |
+
| qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 |
|
281 |
+
| qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 |
|
282 |
+
| qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 |
|
283 |
+
| qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 |
|
284 |
+
| internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 |
|
285 |
+
| internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 |
|
286 |
+
| internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 |
|
287 |
+
| internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 45.83 | 70.00 | 65.36 |
|
288 |
+
| internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 |
|
289 |
+
| internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 |
|
290 |
+
| llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 |
|
291 |
+
| llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 |
|
292 |
+
| llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 |
|
293 |
+
| llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 |
|
294 |
+
| mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 |
|
295 |
+
| mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 |
|
296 |
+
| mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 |
|
297 |
+
|
298 |
+
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|
299 |
+
|:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
|
300 |
+
| qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 |
|
301 |
+
| qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 |
|
302 |
+
| qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 |
|
303 |
+
| qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 |
|
304 |
+
| qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 |
|
305 |
+
| qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 |
|
306 |
+
| qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 |
|
307 |
+
| qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 |
|
308 |
+
| internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 |
|
309 |
+
| internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 |
|
310 |
+
| internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 |
|
311 |
+
| internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 |
|
312 |
+
| internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 |
|
313 |
+
| internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 |
|
314 |
+
| llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 |
|
315 |
+
| llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 |
|
316 |
+
| llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 |
|
317 |
+
| llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 |
|
318 |
+
| mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 |
|
319 |
+
| mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 |
|
320 |
+
| mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 |
|
321 |
+
|
322 |
+
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|
323 |
+
|:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
|
324 |
+
| qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 |
|
325 |
+
| qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 |
|
326 |
+
| qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 |
|
327 |
+
| qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 |
|
328 |
+
| qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 |
|
329 |
+
| qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 |
|
330 |
+
| qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 |
|
331 |
+
| qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 |
|
332 |
+
| internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 |
|
333 |
+
| internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 |
|
334 |
+
| internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 |
|
335 |
+
| internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 |
|
336 |
+
| internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 |
|
337 |
+
| internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 |
|
338 |
+
| llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 |
|
339 |
+
| llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 |
|
340 |
+
| llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 |
|
341 |
+
| llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 |
|
342 |
+
| mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 |
|
343 |
+
| mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 |
|
344 |
+
| mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 |
|
345 |
+
|
346 |
+
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|
347 |
+
|:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
|
348 |
+
| qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 |
|
349 |
+
| qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 |
|
350 |
+
| qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 |
|
351 |
+
| qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 |
|
352 |
+
| qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 |
|
353 |
+
| qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 |
|
354 |
+
| qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 |
|
355 |
+
| qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 |
|
356 |
+
| internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 |
|
357 |
+
| internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 |
|
358 |
+
| internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 |
|
359 |
+
| internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 |
|
360 |
+
| internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 |
|
361 |
+
| internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 |
|
362 |
+
| llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 |
|
363 |
+
| llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 |
|
364 |
+
| llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 |
|
365 |
+
| llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 |
|
366 |
+
| mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 |
|
367 |
+
| mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 |
|
368 |
+
| mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 |
|
opencompass/configs/datasets/mmlu/mmlu_all_sets.py
ADDED
@@ -0,0 +1,59 @@
mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]
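This shared subject list is what the various MMLU configs iterate over. When only a handful of subjects is needed (for example, for a quick smoke test), the resulting dataset list can simply be filtered in the user config; a minimal sketch, with an arbitrary illustrative subject selection:

```python
from mmengine.config import read_base

with read_base():
    from .mmlu_gen_4d595a import mmlu_datasets

# keep only a couple of subjects; the choice here is purely illustrative
wanted = {'lukaemon_mmlu_abstract_algebra', 'lukaemon_mmlu_anatomy'}
mmlu_datasets = [d for d in mmlu_datasets if d['abbr'] in wanted]
```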
opencompass/configs/datasets/mmlu/mmlu_clean_ppl.py
ADDED
@@ -0,0 +1,114 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import MMLUDatasetClean as MMLUDataset

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template={
                opt:
                f'{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}\n'
                for opt in ['A', 'B', 'C', 'D']
            },
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template={
                opt:
                f'{_hint}</E>{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: {opt}'
                for opt in ['A', 'B', 'C', 'D']
            },
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=PPLInferencer),
    )

    mmlu_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
                         analyze_contamination=True)

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
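Unlike the generation-based MMLU configs, this one is scored by perplexity: the per-option templates above yield four candidate prompts per question (one ending in each answer letter), and the option whose completed prompt the model scores best is taken as the prediction. A conceptual sketch of that final selection step, not the actual PPLInferencer implementation:

```python
# given a per-option score for the filled-in templates (e.g. total
# log-likelihood under the model), the predicted answer is the best-scoring
# option letter; contamination analysis is layered on top of this accuracy
def pick_option(option_scores: dict) -> str:
    return max(option_scores, key=option_scores.get)

print(pick_option({'A': -41.2, 'B': -37.9, 'C': -44.0, 'D': -39.5}))  # -> 'B'
```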
opencompass/configs/datasets/mmlu/mmlu_gen.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mmlu_gen_4d595a import mmlu_datasets  # noqa: F401, F403
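This four-line alias simply pins the default MMLU variant to `mmlu_gen_4d595a`. The same `read_base` mechanism is how several dataset groups get combined in one run config; a minimal sketch, where the second import is an assumed example drawn from elsewhere in this diff:

```python
from mmengine.config import read_base

with read_base():
    from .mmlu_gen_4d595a import mmlu_datasets
    # assumed example: pull in another dataset group added in this diff
    from ..humaneval_plus.humaneval_plus_gen_8e312c import humaneval_plus_datasets

datasets = [*mmlu_datasets, *humaneval_plus_datasets]
```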
opencompass/configs/datasets/mmlu/mmlu_gen_23a9a9.py
ADDED
@@ -0,0 +1,124 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_prompt_template = dict(
    type='PromptTemplate',
    template=None,
    ice_token='</E>')

mmlu_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role='HUMAN',
                prompt='{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '
            ),
            dict(role='BOT', prompt='{target}\n')
        ])),
    prompt_template=mmlu_prompt_template,
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=GenInferencer))

mmlu_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=first_capital_postprocess))

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg.copy(),
            eval_cfg=mmlu_eval_cfg))

    mmlu_datasets[-1]['infer_cfg'][
        'prompt_template'] = mmlu_prompt_template.copy()
    mmlu_datasets[-1]['infer_cfg']['prompt_template']['template'] = \
        dict(
            begin=[
                dict(role='SYSTEM', fallback_role='HUMAN', prompt=f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.'),
                '</E>',
            ],
            round=[
                dict(role='HUMAN', prompt='{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: '),
            ]
        )

del _name
opencompass/configs/datasets/mmlu/mmlu_gen_4d595a.py
ADDED
@@ -0,0 +1,123 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_gen_5d1409.py
ADDED
@@ -0,0 +1,124 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=
                        f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess))

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_gen_79e572.py
ADDED
@@ -0,0 +1,110 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=
            '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer: {target}\n',
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=
            f'{_hint}</E>{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:',
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess),
    )

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_gen_a484b3.py
ADDED
@@ -0,0 +1,124 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=
                        f'{_hint}\nQ: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nA: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_model_postprocess_gen_4d595a.py
ADDED
@@ -0,0 +1,141 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
from opencompass.utils.model_postprocessors import navie_model_postprocess
from opencompass.utils.postprocessors.naive import OPTION_NAVIE_PROMPT_TEMPLATE


# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    # # You can write your own postprocess prompt like:
    # MMLU_NAVIE_PROMPT_TEMPLATE = """
    # There is a detailed explanation of the final answer you should extract:
    # 1. ...
    # 2. ...
    # ...
    # """

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccwithDetailsEvaluator),
        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
        model_postprocessor=dict(
            type=navie_model_postprocess,
            custom_instruction=OPTION_NAVIE_PROMPT_TEMPLATE,
            model_name='',
            api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1')
    )


    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/mmlu/mmlu_openai_simple_evals_gen_b618ea.py
ADDED
@@ -0,0 +1,59 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern

with read_base():
    from .mmlu_all_sets import mmlu_all_sets

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{input}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_datasets = []
for name in mmlu_all_sets:
    mmlu_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                round=[
                    dict(role='HUMAN', prompt=QUERY_TEMPLATE),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    mmlu_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'))

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))
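The `match_answer_pattern` post-processor above keys off the 'ANSWER: $LETTER' line that QUERY_TEMPLATE asks the model to emit. A standalone sketch of what that regex extracts (plain Python for illustration, not part of the config files in this change):

import re

# Illustrative only: apply the same answer pattern used in mmlu_eval_cfg above.
answer_pattern = r'(?i)ANSWER\s*:\s*([A-D])'
response = 'Let us think step by step...\nANSWER: C'
match = re.search(answer_pattern, response)
print(match.group(1) if match else None)  # prints: C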
opencompass/configs/datasets/mmlu/mmlu_ppl.py
ADDED
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .mmlu_ppl_ac766d import mmlu_datasets  # noqa: F401, F403
opencompass/configs/datasets/mmlu/mmlu_ppl_ac766d.py
ADDED
@@ -0,0 +1,106 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMLUDataset

# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

mmlu_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

mmlu_all_sets = [
    'college_biology',
    'college_chemistry',
    'college_computer_science',
    'college_mathematics',
    'college_physics',
    'electrical_engineering',
    'astronomy',
    'anatomy',
    'abstract_algebra',
    'machine_learning',
    'clinical_knowledge',
    'global_facts',
    'management',
    'nutrition',
    'marketing',
    'professional_accounting',
    'high_school_geography',
    'international_law',
    'moral_scenarios',
    'computer_security',
    'high_school_microeconomics',
    'professional_law',
    'medical_genetics',
    'professional_psychology',
    'jurisprudence',
    'world_religions',
    'philosophy',
    'virology',
    'high_school_chemistry',
    'public_relations',
    'high_school_macroeconomics',
    'human_sexuality',
    'elementary_mathematics',
    'high_school_physics',
    'high_school_computer_science',
    'high_school_european_history',
    'business_ethics',
    'moral_disputes',
    'high_school_statistics',
    'miscellaneous',
    'formal_logic',
    'high_school_government_and_politics',
    'prehistory',
    'security_studies',
    'high_school_biology',
    'logical_fallacies',
    'high_school_world_history',
    'professional_medicine',
    'high_school_mathematics',
    'college_medicine',
    'high_school_us_history',
    'sociology',
    'econometrics',
    'high_school_psychology',
    'human_aging',
    'us_foreign_policy',
    'conceptual_physics',
]

mmlu_datasets = []
for _name in mmlu_all_sets:
    _hint = f'The following are multiple choice questions (with answers) about {_name.replace("_", " ")}.\n\n'
    question_overall = '{input}\nA. {A}\nB. {B}\nC. {C}\nD. {D}'
    mmlu_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template={opt: f'{question_overall}\nAnswer: {opt}\n' for opt in ['A', 'B', 'C', 'D']},
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template={opt: f'{_hint}</E>{question_overall}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=PPLInferencer),
    )

    mmlu_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator), )

    mmlu_datasets.append(
        dict(
            abbr=f'lukaemon_mmlu_{_name}',
            type=MMLUDataset,
            path='opencompass/mmlu',
            name=_name,
            reader_cfg=mmlu_reader_cfg,
            infer_cfg=mmlu_infer_cfg,
            eval_cfg=mmlu_eval_cfg,
        ))

del _name, _hint