tuandunghcmut committed
Commit 928e1d0 · verified · 1 Parent(s): 4dcb263

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py +4 -0
  2. opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py +77 -0
  3. opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl.py +4 -0
  4. opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py +76 -0
  5. opencompass/configs/datasets/MathBench/deprecated_mathbench_gen_7b734b.py +110 -0
  6. opencompass/configs/datasets/MathBench/mathbench_2024_few_shot_mixed_4a3fd4.py +81 -0
  7. opencompass/configs/datasets/MathBench/mathbench_2024_gen_19e486.py +114 -0
  8. opencompass/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py +81 -0
  9. opencompass/configs/datasets/MathBench/mathbench_2024_gen_50a320.py +81 -0
  10. opencompass/configs/datasets/MathBench/mathbench_prompt.py +109 -0
  11. opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py +4 -0
  12. opencompass/configs/datasets/NPHardEval/NPHardEval_gen_22aac5.py +59 -0
  13. opencompass/configs/datasets/NPHardEval/README.md +126 -0
  14. opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py +4 -0
  15. opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py +43 -0
  16. opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_ppl.py +4 -0
  17. opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_ppl_866273.py +30 -0
  18. opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_ppl_ced824.py +47 -0
  19. opencompass/configs/datasets/agieval/agieval_gen.py +4 -0
  20. opencompass/configs/datasets/agieval/agieval_gen_397d81.py +204 -0
  21. opencompass/configs/datasets/agieval/agieval_gen_617738.py +209 -0
  22. opencompass/configs/datasets/agieval/agieval_gen_64afd3.py +207 -0
  23. opencompass/configs/datasets/agieval/agieval_gen_a0c741.py +85 -0
  24. opencompass/configs/datasets/agieval/agieval_mixed.py +4 -0
  25. opencompass/configs/datasets/agieval/agieval_mixed_0fa998.py +220 -0
  26. opencompass/configs/datasets/anthropics_evals/airisk_gen.py +4 -0
  27. opencompass/configs/datasets/anthropics_evals/airisk_gen_ba66fc.py +66 -0
  28. opencompass/configs/datasets/anthropics_evals/persona_gen.py +4 -0
  29. opencompass/configs/datasets/anthropics_evals/persona_gen_cc72e2.py +184 -0
  30. opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py +4 -0
  31. opencompass/configs/datasets/anthropics_evals/sycophancy_gen_4bba45.py +50 -0
  32. opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py +4 -0
  33. opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen_c205fb.py +42 -0
  34. opencompass/configs/datasets/cmb/cmb_gen.py +4 -0
  35. opencompass/configs/datasets/cmb/cmb_gen_dfb5c4.py +49 -0
  36. opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_code_gen_c8c3aa.py +181 -0
  37. opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_knowledge.py +94 -0
  38. opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_math.py +86 -0
  39. opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_objective_gen.py +4 -0
  40. opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_objective_gen_068af0.py +74 -0
  41. opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_prompt.py +44 -0
  42. opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py +4 -0
  43. opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen_8a1435.py +91 -0
  44. opencompass/configs/datasets/lambada/lambada_gen.py +4 -0
  45. opencompass/configs/datasets/lambada/lambada_gen_217e11.py +33 -0
  46. opencompass/configs/datasets/lambada/lambada_gen_8b48a5.py +29 -0
  47. opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py +67 -0
  48. opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py +66 -0
  49. opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py +67 -0
  50. opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py +66 -0
opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .FinanceIQ_gen_e0e6b5 import financeIQ_datasets  # noqa: F401, F403
opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py ADDED
@@ -0,0 +1,77 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import FixKRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import FinanceIQDataset
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+ financeIQ_subject_mapping_en = {
+     'certified_public_accountant': '注册会计师(CPA)',
+     'banking_qualification': '银行从业资格',
+     'securities_qualification': '证券从业资格',
+     'fund_qualification': '基金从业资格',
+     'insurance_qualification': '保险从业资格CICE',
+     'economic_analyst': '经济师',
+     'taxation_practitioner': '税务师',
+     'futures_qualification': '期货从业资格',
+     'certified_fin_planner': '理财规划师',
+     'actuary_fin_math': '精算师-金融数学',
+ }
+
+ financeIQ_subject_mapping = {
+     '注册会计师(CPA)': '注册会计师(CPA)',
+     '银行从业资格': '银行从业资格',
+     '证券从业资格': '证券从业资格',
+     '基金从业资格': '基金从业资格',
+     '保险从业资格CICE': '保险从业资格CICE',
+     '经济师': '经济师',
+     '税务师': '税务师',
+     '期货从业资格': '期货从业资格',
+     '理财规划师': '理财规划师',
+     '精算师-金融数学': '精算师-金融数学',
+ }
+
+ financeIQ_all_sets = list(financeIQ_subject_mapping.keys())
+
+ financeIQ_datasets = []
+ for _name in financeIQ_all_sets:
+     _ch_name = financeIQ_subject_mapping[_name]
+     financeIQ_infer_cfg = dict(
+         ice_template=dict(
+             type=PromptTemplate,
+             template=dict(
+                 begin='</E>',
+                 round=[
+                     dict(
+                         role='HUMAN',
+                         prompt=
+                         f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
+                     ),
+                     dict(role='BOT', prompt='答案是: {answer}'),
+                 ]),
+             ice_token='</E>',
+         ),
+         retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+         inferencer=dict(type=GenInferencer),
+     )
+
+     financeIQ_eval_cfg = dict(
+         evaluator=dict(type=AccEvaluator),
+         pred_postprocessor=dict(type=first_capital_postprocess))
+
+     financeIQ_datasets.append(
+         dict(
+             type=FinanceIQDataset,
+             path='./data/FinanceIQ/',
+             name=_name,
+             abbr=f'FinanceIQ-{_name}',
+             reader_cfg=dict(
+                 input_columns=['question', 'A', 'B', 'C', 'D'],
+                 output_column='answer',
+                 train_split='dev',
+                 test_split='test'),
+             infer_cfg=financeIQ_infer_cfg,
+             eval_cfg=financeIQ_eval_cfg,
+         ))
+
+ del _name, _ch_name
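A side note on how this 5-shot config behaves: `FixKRetriever` with `fix_id_list=[0, 1, 2, 3, 4]` always prepends the first five `dev`-split examples, and `PromptTemplate` splices their rendered text in place of the `</E>` ice token. The following is only a minimal sketch of that substitution; the sample rows and the `render_example`/`build_prompt` helpers are hypothetical stand-ins for what OpenCompass does internally.

```python
# Hypothetical sketch (not OpenCompass internals) of how fixed in-context
# examples end up replacing the '</E>' ice token in the template above.
dev_rows = [
    {'question': '1 + 1 = ?', 'A': '1', 'B': '2', 'C': '3', 'D': '4', 'answer': 'B'},
    {'question': '2 + 2 = ?', 'A': '2', 'B': '3', 'C': '4', 'D': '5', 'answer': 'C'},
]

def render_example(row):
    # Mirrors the HUMAN/BOT round of the ice_template above.
    return (f"题目:{row['question']}\nA. {row['A']}\nB. {row['B']}\n"
            f"C. {row['C']}\nD. {row['D']}\n答案是: {row['answer']}\n")

def build_prompt(template, test_row, ice_rows):
    # Concatenate the fixed examples, then substitute them for the ice token.
    ice = ''.join(render_example(r) for r in ice_rows)
    filled = template.format(question=test_row['question'], A=test_row['A'],
                             B=test_row['B'], C=test_row['C'], D=test_row['D'])
    return filled.replace('</E>', ice)

template = '</E>题目:{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\n答案是: '
print(build_prompt(template, dev_rows[0], dev_rows))
```

Because the retrieved examples are fixed rather than sampled, every test question is preceded by the same five rendered dev items, which keeps the 5-shot setting deterministic across runs.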
opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .FinanceIQ_ppl_42b9bd import financeIQ_datasets  # noqa: F401, F403
opencompass/configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py ADDED
@@ -0,0 +1,76 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import FixKRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import FinanceIQDataset
+
+ financeIQ_subject_mapping_en = {
+     'certified_public_accountant': '注册会计师(CPA)',
+     'banking_qualification': '银行从业资格',
+     'securities_qualification': '证券从业资格',
+     'fund_qualification': '基金从业资格',
+     'insurance_qualification': '保险从业资格CICE',
+     'economic_analyst': '经济师',
+     'taxation_practitioner': '税务师',
+     'futures_qualification': '期货从业资格',
+     'certified_fin_planner': '理财规划师',
+     'actuary_fin_math': '精算师-金融数学',
+ }
+
+ financeIQ_subject_mapping = {
+     '注册会计师(CPA)': '注册会计师(CPA)',
+     '银行从业资格': '银行从业资格',
+     '证券从业资格': '证券从业资格',
+     '基金从业资格': '基金从业资格',
+     '保险从业资格CICE': '保险从业资格CICE',
+     '经济师': '经济师',
+     '税务师': '税务师',
+     '期货从业资格': '期货从业资格',
+     '理财规划师': '理财规划师',
+     '精算师-金融数学': '精算师-金融数学',
+ }
+
+ financeIQ_all_sets = list(financeIQ_subject_mapping.keys())
+
+ financeIQ_datasets = []
+ for _name in financeIQ_all_sets:
+     _ch_name = financeIQ_subject_mapping[_name]
+     financeIQ_infer_cfg = dict(
+         ice_template=dict(
+             type=PromptTemplate,
+             template={
+                 answer: dict(
+                     begin='</E>',
+                     round=[
+                         dict(
+                             role='HUMAN',
+                             prompt=f'以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}'
+                         ),
+                         dict(role='BOT', prompt=f'答案是: {answer}'),
+                     ])
+                 for answer in ['A', 'B', 'C', 'D']
+             },
+             ice_token='</E>',
+         ),
+         retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+         inferencer=dict(type=PPLInferencer),
+     )
+
+     financeIQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+     financeIQ_datasets.append(
+         dict(
+             type=FinanceIQDataset,
+             path='./data/FinanceIQ/',
+             name=_name,
+             abbr=f'FinanceIQ-{_name}',
+             reader_cfg=dict(
+                 input_columns=['question', 'A', 'B', 'C', 'D'],
+                 output_column='answer',
+                 train_split='dev',
+                 test_split='test'),
+             infer_cfg=financeIQ_infer_cfg,
+             eval_cfg=financeIQ_eval_cfg,
+         ))
+
+ del _name, _ch_name
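Compared with the `_gen` config above, this `_ppl` variant changes how answers are scored rather than how questions are asked: `PPLInferencer` renders one filled-in prompt per candidate option (the `template` dict keyed by `'A'`–`'D'`) and picks the option whose completion the model assigns the highest likelihood. A rough sketch of that selection rule, assuming a hypothetical `nll` scoring function (not an OpenCompass API):

```python
def nll(text):
    """Hypothetical stand-in for the model's negative log-likelihood of `text`.

    A real implementation would sum per-token log-probs from the language
    model; this placeholder exists only so the sketch runs.
    """
    return float(len(text))

def ppl_choice(question_prompt, options=('A', 'B', 'C', 'D')):
    # One candidate continuation per option, mirroring the template dict above.
    candidates = {opt: question_prompt + f'答案是: {opt}' for opt in options}
    # The lowest NLL (equivalently, the lowest perplexity) wins.
    return min(candidates, key=lambda opt: nll(candidates[opt]))

print(ppl_choice('题目:1+1=?\nA. 1\nB. 2\nC. 3\nD. 4\n'))
```

This is why the PPL config needs no answer postprocessor: the prediction is an option label by construction, so a bare `AccEvaluator` suffices.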
opencompass/configs/datasets/MathBench/deprecated_mathbench_gen_7b734b.py ADDED
@@ -0,0 +1,110 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+
+ single_choice_prompts = {
+     'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
+     'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
+     'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
+     'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
+ }
+
+ cloze_prompts = {
+     'cloze_cn': [
+         dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
+         dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
+         dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
+         dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
+         dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
+         dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
+         dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
+         dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
+         dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
+         dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
+         dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
+         dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
+         dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
+         dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
+         dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
+         dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
+         dict(role='HUMAN', prompt='Q: {question}'),
+         dict(role='BOT', prompt='A: {answer}'),
+     ],
+     'cloze_en': [
+         dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
+         dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
+         dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
+         dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
+         dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
+         dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
+         dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
+         dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
+         dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
+         dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
+         dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
+         dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
+         dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
+         dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
+         dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
+         dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
+         dict(role='HUMAN', prompt='Q: {question}'),
+         dict(role='BOT', prompt='A: {answer}\n'),
+     ]}
+ cloze_prompts['refine_cloze_cn'] = cloze_prompts['cloze_cn']
+
+ mathbench_sets = {
+     'college': ['single_choice_cn', 'cloze_en'],
+     'high': ['single_choice_cn', 'single_choice_en'],
+     'middle': ['single_choice_cn'],
+     'primary': ['cloze_cn'],
+     'primary_refine': ['refine_cloze_cn']
+ }
+
+ # Generate reasoning path or not, only for single choice
+ with_reasoning = False
+
+ # Use circular evaluation or not
+ with_circular_eval = True
+
+ mathbench_datasets = []
+
+ for _split in list(mathbench_sets.keys()):
+     for _name in mathbench_sets[_split]:
+         mathbench_infer_cfg = dict(
+             prompt_template=dict(
+                 type=PromptTemplate,
+                 template=dict(
+                     round=[
+                         dict(
+                             role='HUMAN',
+                             prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
+                         ),
+                         dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
+                 ),
+             ),
+             retriever=dict(type=ZeroRetriever),
+             inferencer=dict(type=GenInferencer, max_out_len=512),
+         )
+
+         mathbench_eval_cfg = dict(
+             evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
+             pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
+
+         mathbench_datasets.append(
+             dict(
+                 abbr='mathbench-' + _split + '-' + _name,
+                 type=MathBenchDataset,
+                 path=f'./data/mathbench/{_split}',
+                 name=_name,
+                 with_circular=with_circular_eval,
+                 reader_cfg=dict(
+                     input_columns=['question'],
+                     output_column='answer'
+                 ),
+                 infer_cfg=mathbench_infer_cfg,
+                 eval_cfg=mathbench_eval_cfg,
+             ))
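A note on the `CircularEvaluator` toggled by `with_circular_eval` above: circular evaluation re-asks each multiple-choice question with its answer options rotated and, in the strict reading sketched below, credits a question only when every rotation is answered correctly, which dampens option-position bias and lucky guesses. The toy accounting that follows is an assumption-labeled illustration, not OpenCompass's implementation:

```python
def circular_accuracy(results):
    """results: question id -> list of per-rotation correctness flags.

    Hypothetical strict variant: a question counts as solved only if all
    rotated copies (e.g. options ordered ABCD, BCDA, CDAB, DABC) are correct.
    """
    solved = sum(all(flags) for flags in results.values())
    return solved / len(results)

print(circular_accuracy({'q1': [True, True, True, True],
                         'q2': [True, False, True, True]}))  # -> 0.5
```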
opencompass/configs/datasets/MathBench/mathbench_2024_few_shot_mixed_4a3fd4.py ADDED
@@ -0,0 +1,81 @@
+ from mmengine.config import read_base
+ from copy import deepcopy
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+ from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+ with read_base():
+     from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
+
+ # Max for this dataset is 4
+ num_shot = 4
+ # Generate reasoning path or not, only for single choice
+ with_reasoning = True
+ # Use circular evaluation or not
+ with_circular_eval = True
+ # Use PPL mode in single choice test or not
+ use_ppl_single_choice = True
+
+ assert 0 <= num_shot <= 4
+ if num_shot == 0:
+     prompts = zero_shot_prompts
+ else:
+     prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
+
+ mathbench_datasets = []
+ for _split in mathbench_sets:
+     for _name in mathbench_sets[_split]:
+         if 'single_choice' in _name:
+             if with_reasoning and not use_ppl_single_choice:
+                 template_round = prompts[_name + '_with_reasoning']
+             else:
+                 template_round = prompts[_name]
+         else:
+             template_round = prompts[_name]
+
+         if 'single_choice' in _name:
+             pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+         else:
+             pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
+
+         if 'single_choice' in _name and with_circular_eval:
+             evaluator = dict(type=CircularEvaluator)
+         else:
+             evaluator = dict(type=AccEvaluator)
+
+         # assemble the final config
+         mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+         if use_ppl_single_choice and 'single_choice' in _name:
+             template = {}
+             for answer in ['A', 'B', 'C', 'D']:
+                 one_template_round = deepcopy(template_round)
+                 one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
+                 template[answer] = dict(round=one_template_round)
+             mathbench_infer_cfg = dict(
+                 prompt_template=dict(type=PromptTemplate, template=template),
+                 retriever=dict(type=ZeroRetriever),
+                 inferencer=dict(type=PPLInferencer),
+             )
+         else:
+             mathbench_infer_cfg = dict(
+                 prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
+                 retriever=dict(type=ZeroRetriever),
+                 inferencer=dict(type=GenInferencer, max_out_len=2048, stopping_criteria=['Question:']),
+             )
+         mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
+
+         mathbench_datasets.append(
+             dict(
+                 abbr='mathbench-' + _split + '-' + _name,
+                 type=MathBenchDataset,
+                 path=f'data/mathbench_v1/{_split}',
+                 name=_name,
+                 with_circular=with_circular_eval,
+                 reader_cfg=mathbench_reader_cfg,
+                 infer_cfg=mathbench_infer_cfg,
+                 eval_cfg=mathbench_eval_cfg,
+             )
+         )
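The slicing expression `p[- 2 * num_shot - 2:]` above relies on each `few_shot_prompts` entry being a flat list of HUMAN/BOT pairs: four worked exemplars followed by one `{question}`/`{answer}` template pair, so taking the last `2 * num_shot + 2` items keeps exactly `num_shot` exemplars plus the template. A quick check of that arithmetic with placeholder list items:

```python
# Each few-shot prompt list holds 4 exemplar (HUMAN, BOT) pairs followed by
# the (HUMAN '{question}', BOT '{answer}') template pair: 10 items in total.
p = [f'exemplar_{i}' for i in range(8)] + ['HUMAN {question}', 'BOT {answer}']

for num_shot in range(5):
    kept = p[-2 * num_shot - 2:]
    # num_shot exemplar pairs survive and the template pair stays at the end.
    assert len(kept) == 2 * num_shot + 2
    assert kept[-2:] == ['HUMAN {question}', 'BOT {answer}']
    print(num_shot, len(kept))
```

This is also why `num_shot` is capped at 4: with only four exemplar pairs available, a larger value would make the slice wrap past the start of the list.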
opencompass/configs/datasets/MathBench/mathbench_2024_gen_19e486.py ADDED
@@ -0,0 +1,114 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+ single_choice_prompts = {
+     'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
+     'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
+     'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
+     'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
+ }
+
+ cloze_prompts = {
+     'cloze_cn': [
+         dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
+         dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
+         dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
+         dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
+         dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
+         dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
+         dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
+         dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
+         dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
+         dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
+         dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
+         dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
+         dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
+         dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
+         dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
+         dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
+         dict(role='HUMAN', prompt='Q: {question}'),
+         dict(role='BOT', prompt='A: {answer}'),
+     ],
+     'cloze_en': [
+         dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
+         dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
+         dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
+         dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
+         dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
+         dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
+         dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
+         dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
+         dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
+         dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
+         dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
+         dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
+         dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
+         dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
+         dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
+         dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
+         dict(role='HUMAN', prompt='Q: {question}'),
+         dict(role='BOT', prompt='A: {answer}\n'),
+     ]}
+
+ mathbench_sets = {
+     # Practice Part
+     'college': ['single_choice_cn', 'single_choice_en'],
+     'high': ['single_choice_cn', 'single_choice_en'],
+     'middle': ['single_choice_cn', 'single_choice_en'],
+     'primary': ['cloze_cn', 'cloze_en'],
+     'arithmetic': ['cloze_en'],
+     # Theory part
+     'college_knowledge': ['single_choice_cn', 'single_choice_en'],
+     'high_knowledge': ['single_choice_cn', 'single_choice_en'],
+     'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
+     'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
+ }
+
+ # Generate reasoning path or not, only for single choice
+ with_reasoning = True
+
+ # Use circular evaluation or not
+ with_circular_eval = True
+
+ mathbench_datasets = []
+
+ for _split in list(mathbench_sets.keys()):
+     for _name in mathbench_sets[_split]:
+         mathbench_infer_cfg = dict(
+             prompt_template=dict(
+                 type=PromptTemplate,
+                 template=dict(
+                     round=[
+                         dict(
+                             role='HUMAN',
+                             prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
+                         ),
+                         dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
+                 ),
+             ),
+             retriever=dict(type=ZeroRetriever),
+             inferencer=dict(type=GenInferencer, max_out_len=2048),
+         )
+
+         mathbench_eval_cfg = dict(
+             evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
+             pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
+
+         mathbench_datasets.append(
+             dict(
+                 abbr='mathbench-' + _split + '-' + _name,
+                 type=MathBenchDataset,
+                 path=f'./data/mathbench_v1/{_split}',
+                 name=_name,
+                 with_circular=with_circular_eval,
+                 reader_cfg=dict(
+                     input_columns=['question'],
+                     output_column='answer'
+                 ),
+                 infer_cfg=mathbench_infer_cfg,
+                 eval_cfg=mathbench_eval_cfg,
+             ))
opencompass/configs/datasets/MathBench/mathbench_2024_gen_1dc21d.py ADDED
@@ -0,0 +1,81 @@
+ from mmengine.config import read_base
+ from copy import deepcopy
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+ from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+ with read_base():
+     from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
+
+ # Max for this dataset is 4
+ num_shot = 4
+ # Generate reasoning path or not, only for single choice
+ with_reasoning = False
+ # Use circular evaluation or not
+ with_circular_eval = True
+ # Use PPL mode in single choice test or not
+ use_ppl_single_choice = False
+
+ assert 0 <= num_shot <= 4
+ if num_shot == 0:
+     prompts = zero_shot_prompts
+ else:
+     prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
+
+ mathbench_datasets = []
+ for _split in mathbench_sets:
+     for _name in mathbench_sets[_split]:
+         if 'single_choice' in _name:
+             if with_reasoning:
+                 template_round = prompts[_name + '_with_reasoning']
+             else:
+                 template_round = prompts[_name]
+         else:
+             template_round = prompts[_name]
+
+         if 'single_choice' in _name:
+             pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+         else:
+             pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
+
+         if 'single_choice' in _name and with_circular_eval:
+             evaluator = dict(type=CircularEvaluator)
+         else:
+             evaluator = dict(type=AccEvaluator)
+
+         # assemble the final config
+         mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+         if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
+             template = {}
+             for answer in ['A', 'B', 'C', 'D']:
+                 one_template_round = deepcopy(template_round)
+                 one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
+                 template[answer] = dict(round=one_template_round)
+             mathbench_infer_cfg = dict(
+                 prompt_template=dict(type=PromptTemplate, template=template),
+                 retriever=dict(type=ZeroRetriever),
+                 inferencer=dict(type=PPLInferencer),
+             )
+         else:
+             mathbench_infer_cfg = dict(
+                 prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
+                 retriever=dict(type=ZeroRetriever),
+                 inferencer=dict(type=GenInferencer, max_out_len=2048),
+             )
+         mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
+
+         mathbench_datasets.append(
+             dict(
+                 abbr='mathbench-no_cot-' + _split + '-' + _name,
+                 type=MathBenchDataset,
+                 path=f'data/mathbench_v1/{_split}',
+                 name=_name,
+                 with_circular=with_circular_eval,
+                 reader_cfg=mathbench_reader_cfg,
+                 infer_cfg=mathbench_infer_cfg,
+                 eval_cfg=mathbench_eval_cfg,
+             )
+         )
opencompass/configs/datasets/MathBench/mathbench_2024_gen_50a320.py ADDED
@@ -0,0 +1,81 @@
+ from mmengine.config import read_base
+ from copy import deepcopy
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
+ from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+ from opencompass.datasets import MathBenchDataset, math_postprocess_v2
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+ with read_base():
+     from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
+
+ # Max for this dataset is 4
+ num_shot = 0
+ # Generate reasoning path or not, only for single choice
+ with_reasoning = True
+ # Use circular evaluation or not
+ with_circular_eval = True
+ # Use PPL mode in single choice test or not
+ use_ppl_single_choice = False
+
+ assert 0 <= num_shot <= 4
+ if num_shot == 0:
+     prompts = zero_shot_prompts
+ else:
+     prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
+
+ mathbench_datasets = []
+ for _split in mathbench_sets:
+     for _name in mathbench_sets[_split]:
+         if 'single_choice' in _name:
+             if with_reasoning:
+                 template_round = prompts[_name + '_with_reasoning']
+             else:
+                 template_round = prompts[_name]
+         else:
+             template_round = prompts[_name]
+
+         if 'single_choice' in _name:
+             pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+         else:
+             pred_postprocessor = dict(type=math_postprocess_v2)
+
+         if 'single_choice' in _name and with_circular_eval:
+             evaluator = dict(type=CircularEvaluator)
+         else:
+             evaluator = dict(type=AccEvaluator)
+
+         # assemble the final config
+         mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
+         if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
+             template = {}
+             for answer in ['A', 'B', 'C', 'D']:
+                 one_template_round = deepcopy(template_round)
+                 one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
+                 template[answer] = dict(round=one_template_round)
+             mathbench_infer_cfg = dict(
+                 prompt_template=dict(type=PromptTemplate, template=template),
+                 retriever=dict(type=ZeroRetriever),
+                 inferencer=dict(type=PPLInferencer),
+             )
+         else:
+             mathbench_infer_cfg = dict(
+                 prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
+                 retriever=dict(type=ZeroRetriever),
+                 inferencer=dict(type=GenInferencer, max_out_len=2048),
+             )
+         mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
+
+         mathbench_datasets.append(
+             dict(
+                 abbr='mathbench-' + _split + '-' + _name,
+                 type=MathBenchDataset,
+                 path=f'data/mathbench_v1/{_split}',
+                 name=_name,
+                 with_circular=with_circular_eval,
+                 reader_cfg=mathbench_reader_cfg,
+                 infer_cfg=mathbench_infer_cfg,
+                 eval_cfg=mathbench_eval_cfg,
+             )
+         )
opencompass/configs/datasets/MathBench/mathbench_prompt.py ADDED
@@ -0,0 +1,109 @@
+ zero_shot_prompts = {
+     'single_choice_cn_with_reasoning': [
+         dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:'),
+     ],
+     'single_choice_cn': [
+         dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n所以答案是:'),
+     ],
+     'single_choice_en_with_reasoning': [
+         dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with "Therefore, the correct answer is option X", Where "X" is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet\'s think step by step:'),
+     ],
+     'single_choice_en': [
+         dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
+     ],
+     'cloze_en': [
+         dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
+     ],
+     'cloze_cn': [
+         dict(role='HUMAN', prompt='{question}\n请一步一步推理,并在最后用\\boxed{}给出你的答案。'),
+     ]
+ }
+
+ few_shot_prompts = {
+     'single_choice_cn': [
+         dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____?\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
+         dict(role='BOT', prompt='回答: D'),
+         dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
+         dict(role='BOT', prompt='回答: D'),
+         dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
+         dict(role='BOT', prompt='回答: A'),
+         dict(role='HUMAN', prompt='问题: 圆锥的底面半径为2,高为4.一个圆柱的下底面在圆锥的底面上,上底面的圆周在圆锥的侧面上,当圆柱侧面积为$4 \\pi$时,该圆柱的体积为____\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
+         dict(role='BOT', prompt='回答: B'),
+         dict(role='HUMAN', prompt='问题: {question}'),
+         dict(role='BOT', prompt='回答: {answer}'),
+     ],
+     'single_choice_cn_with_reasoning': [
+         dict(role='HUMAN', prompt='问题: 已知i是虚数单位,z为复数,$2+\\frac{1}{i}=z(3+i)$,则在复平面内z对应的点位于____\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
+         dict(role='BOT', prompt='回答: 首先,我们将等式两边同时乘以$(3 + i)$的共轭复数$(3 - i)$,以便消去分母中的虚数部分:$z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$,这样做的目的是利用复数乘法的性质,从而消去分母中的虚数部分。我们进行乘法运算:$z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$,$z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$,由于$i^2 = -1$,我们可以进一步简化:$z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$,在复平面上,这个点位于第四象限,因为它的实部是正的($\\frac{1}{2}$),而虚部是负的($-\\frac{1}{2}$)\n答案选项:D'),
+         dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则$\\omega$的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
+         dict(role='BOT', prompt='回答: 将函数$y=\\tan(\\omega x-1)$(其中$\\omega>0$)的图像向左平移2个单位长度后,得到的新函数为$y=\\tan(\\omega(x+2)-1)$。为了使这个新函数的图像与$y=\\tan(\\omega x+3)$的图象重合,我们需要满足以下条件:\n$\\tan(\\omega(x+2)-1) = \\tan(\\omega x+3)$\n由于正切函数的周期性,我们知道$\\tan(\\theta) = \\tan(\\theta + k\\pi)$,其中$k$是任意整数。因此,我们可以写出:\n$\\omega(x+2)-1 = \\omega x+3 + k\\pi$\n解这个方程,我们得到:$\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$\n$2\\omega - 1 = 3 + k\\pi$\n$\\omega = 2 + \\frac{k\\pi}{2}$\n由于$\\omega>0$,我们需要找到满足条件的最小正整数$k$。当$k=0$时,我们得到$\\omega=2$,这是满足条件的最小正值。因此,最小值等于2。答案选项:D'),
+         dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$,其中a,b为实数,则( )\nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
+         dict(role='BOT', prompt='回答: 我们可以将这个方程写成:$a + 2ai + b = 2i$,然后将实部和虚部分开:\n$a + b = 0$\n$2a = 2$\n由于$a + b = 0$,我们可以得到$a = -b$。将这个值代入$2a = 2$,我们得到$2(-b) = 2$,因此$b = -1$。因此,$a = 1$。答案选项:A'),
+         dict(role='HUMAN', prompt='问题: 一直角三角形的两边长分别为3和4.则第三边的长为____\nA. 5\nB. $\\sqrt 7$\nC. $\\sqrt 5$\nD. 5或者$\\sqrt 7$'),
+         dict(role='BOT', prompt='回答: 我们可以使用勾股定理来解决这个问题。勾股定理告诉我们,如果一个三角形的两条边长分别为$a$和$b$,那么第三条边的长度为$c = \\sqrt{a^2 + b^2}$。在这个问题中,$a = 3$,$b = 4$,因此$c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$。答案选项:A'),
+         dict(role='HUMAN', prompt='问题: {question}'),
+         dict(role='BOT', prompt='回答: {answer}'),
+     ],
+     'single_choice_en': [
+         dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
+         dict(role='BOT', prompt='Response: D'),
+         dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
+         dict(role='BOT', prompt='Response: D'),
+         dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
+         dict(role='BOT', prompt='Response: A'),
+         dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
+         dict(role='BOT', prompt='Response: B'),
+         dict(role='HUMAN', prompt='Question: {question}'),
+         dict(role='BOT', prompt='Response: {answer}'),
+     ],
+     'single_choice_en_with_reasoning': [
+         dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
+         dict(role='BOT', prompt='Response: First, we multiply both sides of the equation by the conjugate of $(3 + i)$: $z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$. We perform the multiplication: $z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$, $z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$. Since $i^2 = -1$, we can simplify this further: $z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$. In the complex plane, this point is located in the fourth quadrant, because its real part is positive ($\\frac{1}{2}$) and its imaginary part is negative ($-\\frac{1}{2}$)\nAnswer option: D'),
+         dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
+         dict(role='BOT', prompt='Response: In order for the graph of this new function to coincide with the graph of $y = \\tan(\\omega x + 3)$, we need to satisfy the following condition: $\\tan(\\omega(x + 2) - 1) = \\tan(\\omega x + 3)$. Therefore, we can write: $\\omega(x + 2) - 1 = \\omega x + 3 + k\\pi$. Solving this equation, we get: $\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$. $2\\omega - 1 = 3 + k\\pi$. $\\omega = 2 + \\frac{k\\pi}{2}$. Since $\\omega > 0$, we need to find the smallest positive integer $k$ that satisfies the condition. When $k = 0$, we get $\\omega = 2$, which is the smallest positive value that satisfies the condition. Therefore, the minimum value is 2. Answer option: D'),
+         dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
+         dict(role='BOT', prompt='Response: We can write this equation as: $a + 2ai + b = 2i$, and then separate the real and imaginary parts: $a + b = 0$. $2a = 2$. Since $a + b = 0$, we can get $a = -b$. Substituting this value into $2a = 2$, we get $2(-b) = 2$, so $b = -1$. Therefore, $a = 1$. Answer option: A'),
+         dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
+         dict(role='BOT', prompt='Response: Let the cylinder have radius $r$ and height $h$. By similar triangles on the cone\'s cross-section, the height available above the base at radius $r$ is $h = 4 - 2r$. The lateral surface area is $2\\pi r h = 2\\pi r(4 - 2r) = 4\\pi$, so $r(4 - 2r) = 2$, i.e. $r^2 - 2r + 1 = 0$, giving $r = 1$ and $h = 2$. The volume is therefore $\\pi r^2 h = \\pi \\cdot 1^2 \\cdot 2 = 2\\pi$. Answer option: B'),
+         dict(role='HUMAN', prompt='Question: {question}'),
+         dict(role='BOT', prompt='Response: {answer}'),
+     ],
+     'cloze_cn': [
+         dict(role='HUMAN', prompt='问题: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
+         dict(role='BOT', prompt='回答: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9'),
+         dict(role='HUMAN', prompt='问题: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
+         dict(role='BOT', prompt='回答: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29'),
+         dict(role='HUMAN', prompt='问题: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
+         dict(role='BOT', prompt='回答: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33'),
+         dict(role='HUMAN', prompt='问题: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
+         dict(role='BOT', prompt='回答: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8'),
+         dict(role='HUMAN', prompt='问题: {question}'),
+         dict(role='BOT', prompt='回答: {answer}'),
+     ],
+     'cloze_en': [
+         dict(role='HUMAN', prompt='Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
+         dict(role='BOT', prompt='Response: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.'),
+         dict(role='HUMAN', prompt='Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
+         dict(role='BOT', prompt='Response: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.'),
+         dict(role='HUMAN', prompt='Question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
+         dict(role='BOT', prompt='Response: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.'),
+         dict(role='HUMAN', prompt='Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
+         dict(role='BOT', prompt='Response: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.'),
+         dict(role='HUMAN', prompt='Question: {question}'),
+         dict(role='BOT', prompt='Response: {answer}'),
+     ],
+ }
+
+ mathbench_sets = {
+     # Practice Part
+     'college': ['single_choice_cn', 'single_choice_en'],
+     'high': ['single_choice_cn', 'single_choice_en'],
+     'middle': ['single_choice_cn', 'single_choice_en'],
+     'primary': ['cloze_cn', 'cloze_en'],
+     'arithmetic': ['cloze_en'],
+     # Theory part
+     'college_knowledge': ['single_choice_cn', 'single_choice_en'],
+     'high_knowledge': ['single_choice_cn', 'single_choice_en'],
+     'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
+     'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
+ }
opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .NPHardEval_gen_22aac5 import NPHardEval_datasets  # noqa: F401, F403
opencompass/configs/datasets/NPHardEval/NPHardEval_gen_22aac5.py ADDED
@@ -0,0 +1,59 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets.NPHardEval import (
+     HardGCPDataset, HardGCPEvaluator,
+     Hard_TSP_Dataset, Hard_TSP_Evaluator,
+     Hard_MSP_Dataset, Hard_MSP_Evaluator,
+     CMP_GCP_D_Dataset, CMP_GCP_D_Evaluator,
+     CMP_TSP_D_Dataset, CMP_TSP_D_Evaluator,
+     CMP_KSP_Dataset, CMP_KSP_Evaluator,
+     P_BSP_Dataset, P_BSP_Evaluator,
+     P_EDP_Dataset, P_EDP_Evaluator,
+     P_SPP_Dataset, P_SPP_Evaluator,
+ )
+
+ NPHardEval_tasks = [
+     ['hard_GCP', 'GCP', HardGCPDataset, HardGCPEvaluator],
+     ['hard_TSP', 'TSP', Hard_TSP_Dataset, Hard_TSP_Evaluator],
+     ['hard_MSP', 'MSP', Hard_MSP_Dataset, Hard_MSP_Evaluator],
+     ['cmp_GCP_D', 'GCP_Decision', CMP_GCP_D_Dataset, CMP_GCP_D_Evaluator],
+     ['cmp_TSP_D', 'TSP_Decision', CMP_TSP_D_Dataset, CMP_TSP_D_Evaluator],
+     ['cmp_KSP', 'KSP', CMP_KSP_Dataset, CMP_KSP_Evaluator],
+     ['p_BSP', 'BSP', P_BSP_Dataset, P_BSP_Evaluator],
+     ['p_EDP', 'EDP', P_EDP_Dataset, P_EDP_Evaluator],
+     ['p_SPP', 'SPP', P_SPP_Dataset, P_SPP_Evaluator],
+ ]
+
+ NPHardEval_datasets = []
+ for name, path_name, dataset, evaluator in NPHardEval_tasks:
+     NPHardEval_reader_cfg = dict(input_columns=['prompt', 'level'], output_column='q')
+
+     NPHardEval_infer_cfg = dict(
+         ice_template=dict(
+             type=PromptTemplate,
+             template=dict(
+                 begin='</E>',
+                 round=[
+                     dict(role='HUMAN', prompt='</E>{prompt}'),
+                     dict(role='BOT', prompt=''),
+                 ],
+             ),
+             ice_token='</E>',
+         ),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer),
+     )
+
+     NPHardEval_eval_cfg = dict(evaluator=dict(type=evaluator), pred_role='BOT')
+
+     NPHardEval_datasets.append(
+         dict(
+             type=dataset,
+             abbr=name,
+             path=f'./data/NPHardEval/{path_name}/',
+             reader_cfg=NPHardEval_reader_cfg,
+             infer_cfg=NPHardEval_infer_cfg,
+             eval_cfg=NPHardEval_eval_cfg,
+         )
+     )
opencompass/configs/datasets/NPHardEval/README.md ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NPHardEval
2
+ ## Introduction
3
+ The following introduction comes from the abstract in [NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language Models via Complexity Classes](https://arxiv.org/abs/2312.14890)
4
+
5
+ ```
6
+ This benchmark is designed to evaluate the reasoning abilities of LLMs across a broad spectrum of 900 algorithmic questions, extending up to the NP-Hard complexity class. These questions are meticulously chosen to represent a wide range of complexity class below the NP-hard complexity class, offering a rigorous measure of the reasoning ability of LLMs.
7
+ ```
8
+
9
+ ## Official link
10
+
11
+ ### Paper
12
+
13
+ [NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language Models via Complexity Classes](https://arxiv.org/abs/2312.14890)
14
+
15
+ ### Repository
16
+
17
+ [NPHardEval](https://github.com/casmlab/NPHardEval)
18
+
19
+ ## Examples
20
+ Input example I:
21
+ ```
22
+ The Graph Coloring Problem (Decision Version, GCP-D) involves determining if it is possible to color the vertices of a graph using a given number of colors, ensuring no two adjacent vertices have the same color.
+ Find out if the vertices of a graph with 6 vertices can be colored using only 3 colors, such that no adjacent vertices share the same color.
+ Provide a yes or no answer, along with a concise explanation of your reasoning. Keep your explanation focused and brief.
+ Enclose your output in <root></root> tags. Include your reasoning in <reasoning></reasoning> tags and the final yes/no answer in <final_answer></final_answer> tags, like <final_answer>{'Feasible': 'YES_OR_NO'}</final_answer>.
+
+ The graph is below:
+ Vertex 2 is connected to vertex 4.
+ Vertex 2 is connected to vertex 5.
+ Vertex 3 is connected to vertex 5.
+ Vertex 4 is connected to vertex 5.
+ Vertex 5 is connected to vertex 6.
+ ```
+
+ Output example I (from GPT-4):
+ ```
+ <root>
+ <final_answer>{'Feasible': 'YES'}</final_answer>
+ <reasoning>Yes, it is possible to color the vertices of the given graph with 6 vertices using only 3 colors, such that no adjacent vertices share the same color. The graph does not contain any triangles or cycles of length greater than 3, which means it is a planar graph. By the Four Color Theorem, any planar graph can be colored using at most 4 colors. Since we have only 3 colors available, it is feasible to color the vertices without any adjacent vertices sharing the same color.</reasoning>
+ </root>
+ ```
+
+ Input example II:
+ ```
+ The Traveling Salesman Problem (Decision Version, TSP-D) focuses on determining if a salesman can complete a route, visiting each city at least once, with the total travel distance being less than a specified value.
+ Check if it's possible for a salesman to visit each of the 10 cities at least once and return to the starting city with the total distance less than 3481.5. The distances between each pair of cities are given.
+ Provide a yes or no answer, with a succinct explanation of your decision process. Focus on clarity and brevity in your response.
+ Enclose your output in <root></root> tags. Present your reasoning in <reasoning></reasoning> tags and the final yes/no answer in <final_answer></final_answer> tags, like <final_answer>{'Feasible': 'YES_OR_NO'}</final_answer>.
+ The distances between cities are below:
+ The distance between City 0 and City 1 is 58.0.
+ The distance between City 0 and City 2 is 171.0.
+ The distance between City 0 and City 3 is 122.0.
+ The distance between City 0 and City 4 is 109.0.
+ The distance between City 0 and City 5 is 93.0.
+ The distance between City 0 and City 6 is 106.0.
+ The distance between City 0 and City 7 is 52.0.
+ The distance between City 0 and City 8 is 115.0.
+ The distance between City 0 and City 9 is 148.0.
+ The distance between City 1 and City 2 is 145.0.
+ The distance between City 1 and City 3 is 71.0.
+ The distance between City 1 and City 4 is 114.0.
+ The distance between City 1 and City 5 is 69.0.
+ The distance between City 1 and City 6 is 163.0.
+ The distance between City 1 and City 7 is 132.0.
+ The distance between City 1 and City 8 is 183.0.
+ The distance between City 1 and City 9 is 115.0.
+ The distance between City 2 and City 3 is 107.0.
+ The distance between City 2 and City 4 is 163.0.
+ The distance between City 2 and City 5 is 63.0.
+ The distance between City 2 and City 6 is 91.0.
+ The distance between City 2 and City 7 is 108.0.
+ The distance between City 2 and City 8 is 109.0.
+ The distance between City 2 and City 9 is 77.0.
+ The distance between City 3 and City 4 is 119.0.
+ The distance between City 3 and City 5 is 131.0.
+ The distance between City 3 and City 6 is 79.0.
+ The distance between City 3 and City 7 is 159.0.
+ The distance between City 3 and City 8 is 27.0.
+ The distance between City 3 and City 9 is 64.0.
+ The distance between City 4 and City 5 is 85.0.
+ The distance between City 4 and City 6 is 110.0.
+ The distance between City 4 and City 7 is 71.0.
+ The distance between City 4 and City 8 is 61.0.
+ The distance between City 4 and City 9 is 61.0.
+ The distance between City 5 and City 6 is 73.0.
+ The distance between City 5 and City 7 is 132.0.
+ The distance between City 5 and City 8 is 82.0.
+ The distance between City 5 and City 9 is 96.0.
+ The distance between City 6 and City 7 is 156.0.
+ The distance between City 6 and City 8 is 103.0.
+ The distance between City 6 and City 9 is 69.0.
+ The distance between City 7 and City 8 is 108.0.
+ The distance between City 7 and City 9 is 97.0.
+ The distance between City 8 and City 9 is 75.0.
+ ```
+
+ Output example II (from GPT-4):
+ ```
+ <root> <reasoning> The Traveling Salesman Problem is a well-known problem in the field of computer science and operations research. It is a combinatorial optimization problem, which means it involves finding the best solution from a finite set of possible solutions. The problem is NP-hard, which means there is no known algorithm that can solve it in polynomial time. Therefore, to solve this problem, we would need to use an algorithm that can find the optimal solution, such as a brute force algorithm, a genetic algorithm, or a simulated annealing algorithm. However, these algorithms can be computationally expensive, especially for large problem sizes. In this case, we have 10 cities, which means there are 10! = 3,628,800 possible routes. Therefore, without the use of a computer, it would be impractical to manually check all possible routes to find the one with the total distance less than 3481.5. </reasoning> <final_answer>{'Feasible': 'UNKNOWN'}</final_answer> </root>
+ ```
+
+ ## Evaluation results
+
+ ```
+ dataset    version    metric             mode    internlm2-chat-7b-hf
+ ---------  ---------  -----------------  ------  ----------------------
+ hard_GCP   144a59     Weighted Accuracy  gen     1.64
+ hard_TSP   144a59     Weighted Accuracy  gen     0
+ hard_MSP   144a59     Weighted Accuracy  gen     0
+ cmp_GCP_D  144a59     Weighted Accuracy  gen     43.82
+ cmp_TSP_D  144a59     Weighted Accuracy  gen     40.18
+ cmp_KSP    144a59     Weighted Accuracy  gen     0
+ p_BSP      144a59     Weighted Accuracy  gen     40.36
+ p_EDP      144a59     Weighted Accuracy  gen     0
+ p_SPP      144a59     Weighted Accuracy  gen     0
+ ```
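A note on the metric: `Weighted Accuracy` is not a plain per-question mean. NPHardEval groups questions by difficulty level and weights harder levels more heavily when aggregating. As a rough sketch only (the exact weights are defined by the benchmark's own scorer, so treat this form as an assumption): W = sum_l (l * acc_l) / sum_l l, where acc_l is the accuracy at difficulty level l.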
+
+ ## Reference
+ ```
+ @article{fan2023nphardeval,
+     title={NPHardEval: Dynamic Benchmark on Reasoning Ability of Large Language Models via Complexity Classes},
+     author={Fan, Lizhou and Hua, Wenyue and Li, Lingyao and Ling, Haoyang and Zhang, Yongfeng and Hemphill, Libby},
+     journal={arXiv preprint arXiv:2312.14890},
+     year={2023}
+ }
+ ```
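The prompts above require the verdict to be wrapped in `<final_answer>` tags. For illustration, here is a minimal sketch of how such an answer could be pulled out of a raw completion; the function name `parse_feasibility` is hypothetical, and this is not NPHardEval's actual postprocessor:

```python
import ast
import re

def parse_feasibility(completion: str):
    """Extract the 'Feasible' verdict from a <final_answer> tag, or None."""
    match = re.search(r'<final_answer>(.*?)</final_answer>', completion, re.DOTALL)
    if match is None:
        return None
    try:
        # The tag is expected to hold a Python-style dict literal.
        answer = ast.literal_eval(match.group(1).strip())
    except (ValueError, SyntaxError):
        return None
    return answer.get('Feasible') if isinstance(answer, dict) else None

print(parse_feasibility("<root><final_answer>{'Feasible': 'YES'}</final_answer></root>"))  # -> YES
```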
opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py ADDED
@@ -0,0 +1,43 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import MultiRCDatasetV2
6
+ from opencompass.utils.text_postprocessors import first_option_postprocess
7
+
8
+ MultiRC_reader_cfg = dict(
9
+ input_columns=['question', 'text', 'answer'],
10
+ output_column='label',
11
+ )
12
+
13
+ MultiRC_infer_cfg = dict(
14
+ prompt_template=dict(
15
+ type=PromptTemplate,
16
+ template=dict(round=[
17
+ dict(
18
+ role='HUMAN',
19
+ prompt=
20
+ '{text}\nQuestion: {question}\nAnswer: {answer}\nIs it true?\nA. Yes\nB. No\nAnswer:'
21
+ ),
22
+ ]),
23
+ ),
24
+ retriever=dict(type=ZeroRetriever),
25
+ inferencer=dict(type=GenInferencer),
26
+ )
27
+
28
+ MultiRC_eval_cfg = dict(
29
+ evaluator=dict(type=AccEvaluator),
30
+ pred_role='BOT',
31
+ pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
32
+ )
33
+
34
+ MultiRC_datasets = [
35
+ dict(
36
+ abbr='MultiRC',
37
+ type=MultiRCDatasetV2,
38
+ path='./data/SuperGLUE/MultiRC/val.jsonl',
39
+ reader_cfg=MultiRC_reader_cfg,
40
+ infer_cfg=MultiRC_infer_cfg,
41
+ eval_cfg=MultiRC_eval_cfg,
42
+ )
43
+ ]
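The config above scores free-form generations with `AccEvaluator` only after `first_option_postprocess` reduces each completion to the letter `A` or `B`. As a simplified sketch of that idea (the real implementation in `opencompass.utils.text_postprocessors` handles many more answer phrasings than this):

```python
import re

def first_option(text: str, options: str = 'AB') -> str:
    """Return the first standalone option letter in the model output, else ''."""
    match = re.search(rf'\b([{options}])\b', text)
    return match.group(1) if match else ''

print(first_option('The answer is B. No'))  # -> B
```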
opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_ppl.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .SuperGLUE_MultiRC_ppl_ced824 import MultiRC_datasets # noqa: F401, F403
opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_ppl_866273.py ADDED
@@ -0,0 +1,30 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import PPLInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import MultiRCDataset
6
+
7
+ MultiRC_reader_cfg = dict(
8
+ input_columns=['question', 'text', 'answer'], output_column='label')
9
+
10
+ MultiRC_infer_cfg = dict(
11
+ prompt_template=dict(
12
+ type=PromptTemplate,
13
+ template={
14
+ 0: 'Passage:{text}。\nQuestion:{question}。\nAnswer: {answer}. It is false.',
15
+ 1: 'Passage:{text}。\nQuestion:{question}。\nAnswer: {answer}. It is true.',
16
+ }),
17
+ retriever=dict(type=ZeroRetriever),
18
+ inferencer=dict(type=PPLInferencer))
19
+
20
+ MultiRC_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
21
+
22
+ MultiRC_datasets = [
23
+ dict(
24
+ type=MultiRCDataset,
25
+ abbr='MultiRC',
26
+ path='./data/SuperGLUE/MultiRC/val.jsonl',
27
+ reader_cfg=MultiRC_reader_cfg,
28
+ infer_cfg=MultiRC_infer_cfg,
29
+ eval_cfg=MultiRC_eval_cfg)
30
+ ]
opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_ppl_ced824.py ADDED
@@ -0,0 +1,47 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import PPLInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import MultiRCDataset
6
+
7
+ MultiRC_reader_cfg = dict(
8
+ input_columns=['question', 'text', 'answer'],
9
+ output_column='label',
10
+ )
11
+
12
+ MultiRC_infer_cfg = dict(
13
+ prompt_template=dict(
14
+ type=PromptTemplate,
15
+ template={
16
+ 0:
17
+ dict(round=[
18
+ dict(
19
+ role='HUMAN',
20
+ prompt='{text}\nQuestion: {question}\nAnswer: {answer}\nIs it true?'),
21
+ dict(role='BOT', prompt='No, it is false.'),
22
+ ]),
23
+ 1:
24
+ dict(round=[
25
+ dict(
26
+ role='HUMAN',
27
+ prompt='{text}\nQuestion: {question}\nAnswer: {answer}\nIs it true?'),
28
+ dict(role='BOT', prompt='Yes, it is true.'),
29
+ ]),
30
+ },
31
+ ),
32
+ retriever=dict(type=ZeroRetriever),
33
+ inferencer=dict(type=PPLInferencer),
34
+ )
35
+
36
+ MultiRC_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
37
+
38
+ MultiRC_datasets = [
39
+ dict(
40
+ type=MultiRCDataset,
41
+ abbr='MultiRC',
42
+ path='./data/SuperGLUE/MultiRC/val.jsonl',
43
+ reader_cfg=MultiRC_reader_cfg,
44
+ infer_cfg=MultiRC_infer_cfg,
45
+ eval_cfg=MultiRC_eval_cfg,
46
+ )
47
+ ]
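Unlike the generation-based variant, this PPL config hands the model both candidate continuations ('No, it is false.' / 'Yes, it is true.'), and the label whose full templated prompt scores the lowest language-modeling loss wins. Below is a self-contained sketch of that selection loop with a Hugging Face causal LM; the model name and prompt literal are placeholders, not what the PPLInferencer runs internally:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder model
model = AutoModelForCausalLM.from_pretrained('gpt2').eval()

def candidate_loss(text: str) -> float:
    """Mean per-token negative log-likelihood of the templated text."""
    ids = tokenizer(text, return_tensors='pt').input_ids
    with torch.no_grad():
        out = model(ids, labels=ids)
    return out.loss.item()

prompt = 'The cat sat.\nQuestion: Did the cat sit?\nAnswer: yes\nIs it true?\n'
candidates = {0: prompt + 'No, it is false.', 1: prompt + 'Yes, it is true.'}
pred = min(candidates, key=lambda label: candidate_loss(candidates[label]))
print('predicted label:', pred)
```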
opencompass/configs/datasets/agieval/agieval_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .agieval_gen_617738 import agieval_datasets # noqa: F401, F403
opencompass/configs/datasets/agieval/agieval_gen_397d81.py ADDED
@@ -0,0 +1,204 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
6
+ from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi
7
+
8
+ agieval_reader_cfg = dict(
9
+ input_columns=['question', 'options'], output_column='label')
10
+
11
+ agieval_single_choice_sets = [
12
+ 'gaokao-chinese',
13
+ 'gaokao-english',
14
+ 'gaokao-geography',
15
+ 'gaokao-history',
16
+ 'gaokao-biology',
17
+ 'gaokao-chemistry',
18
+ 'gaokao-physics',
19
+ 'gaokao-mathqa',
20
+ 'logiqa-zh',
21
+ 'lsat-ar',
22
+ 'lsat-lr',
23
+ 'lsat-rc',
24
+ 'logiqa-en',
25
+ 'sat-math',
26
+ 'sat-en',
27
+ 'sat-en-without-passage',
28
+ 'aqua-rat',
29
+ ]
30
+ agieval_multiple_choices_sets = [
31
+ 'jec-qa-kd',
32
+ 'jec-qa-ca',
33
+ ]
34
+ agieval_cloze_sets = ['gaokao-mathcloze', 'math']
35
+ agieval_chinese_sets = [
36
+ 'gaokao-chinese',
37
+ 'gaokao-english',
38
+ 'gaokao-geography',
39
+ 'gaokao-history',
40
+ 'gaokao-biology',
41
+ 'gaokao-chemistry',
42
+ 'gaokao-physics',
43
+ 'gaokao-mathqa',
44
+ 'logiqa-zh',
45
+ 'gaokao-mathcloze',
46
+ ]
47
+ agieval_english_sets = [
48
+ 'lsat-ar',
49
+ 'lsat-lr',
50
+ 'lsat-rc',
51
+ 'logiqa-en',
52
+ 'sat-math',
53
+ 'sat-en',
54
+ 'sat-en-without-passage',
55
+ 'aqua-rat',
56
+ 'math',
57
+ ]
58
+ agieval_gaokao_sets = [
59
+ 'gaokao-chinese',
60
+ 'gaokao-english',
61
+ 'gaokao-geography',
62
+ 'gaokao-history',
63
+ 'gaokao-biology',
64
+ 'gaokao-chemistry',
65
+ 'gaokao-physics',
66
+ 'gaokao-mathqa',
67
+ ]
68
+
69
+ agieval_datasets = []
70
+ for _name in agieval_single_choice_sets:
71
+ if _name in agieval_chinese_sets:
72
+ _hint = '答案是: '
73
+ else:
74
+ _hint = 'The answer is '
75
+ agieval_infer_cfg = dict(
76
+ prompt_template=dict(
77
+ type=PromptTemplate,
78
+ template=dict(round=[
79
+ dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
80
+ ])),
81
+ retriever=dict(type=ZeroRetriever),
82
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
83
+
84
+ agieval_eval_cfg = dict(
85
+ evaluator=dict(type=AccEvaluator),
86
+ pred_postprocessor=dict(type=first_capital_postprocess))
87
+
88
+ agieval_datasets.append(
89
+ dict(
90
+ type=AGIEvalDataset_v2,
91
+ path='opencompass/agieval',
92
+ name=_name,
93
+ abbr='agieval-' + _name,
94
+ setting_name='zero-shot',
95
+ reader_cfg=agieval_reader_cfg,
96
+ infer_cfg=agieval_infer_cfg.copy(),
97
+ eval_cfg=agieval_eval_cfg.copy()))
98
+
99
+ for _name in agieval_multiple_choices_sets:
100
+ if _name in agieval_chinese_sets:
101
+ _hint = '答案是: '
102
+ else:
103
+ _hint = 'The answer is '
104
+ agieval_infer_cfg = dict(
105
+ prompt_template=dict(
106
+ type=PromptTemplate,
107
+ template=dict(round=[
108
+ dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
109
+ ])),
110
+ retriever=dict(type=ZeroRetriever),
111
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
112
+
113
+ agieval_eval_cfg = dict(
114
+ evaluator=dict(type=AccEvaluator),
115
+ pred_postprocessor=dict(type=first_capital_postprocess_multi))
116
+
117
+ agieval_datasets.append(
118
+ dict(
119
+ type=AGIEvalDataset_v2,
120
+ path='opencompass/agieval',
121
+ name=_name,
122
+ abbr='agieval-' + _name,
123
+ setting_name='zero-shot',
124
+ reader_cfg=agieval_reader_cfg,
125
+ infer_cfg=agieval_infer_cfg.copy(),
126
+ eval_cfg=agieval_eval_cfg.copy()))
127
+
128
+ for _name in agieval_cloze_sets:
129
+ if _name in agieval_chinese_sets:
130
+ _hint = '答案是: '
131
+ else:
132
+ _hint = 'The answer is '
133
+ agieval_infer_cfg = dict(
134
+ prompt_template=dict(
135
+ type=PromptTemplate,
136
+ template=dict(
137
+ round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])),
138
+ retriever=dict(type=ZeroRetriever),
139
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
140
+
141
+ agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator))
142
+
143
+ agieval_datasets.append(
144
+ dict(
145
+ type=AGIEvalDataset_v2,
146
+ path='opencompass/agieval',
147
+ name=_name,
148
+ abbr='agieval-' + _name,
149
+ setting_name='zero-shot',
150
+ reader_cfg=agieval_reader_cfg,
151
+ infer_cfg=agieval_infer_cfg.copy(),
152
+ eval_cfg=agieval_eval_cfg.copy()))
153
+
154
+ for _item in agieval_datasets:
155
+ _name = _item['name']
156
+ _intro = {
157
+ 'gaokao-chinese':
158
+ '以下是一道中国高考语文选择题,请选择正确的答案。',
159
+ 'gaokao-english':
160
+ '以下是一道中国高考英语选择题,请选择正确的答案。',
161
+ 'gaokao-geography':
162
+ '以下是一道中国高考地理选择题,请选择正确的答案。',
163
+ 'gaokao-history':
164
+ '以下是一道中国高考历史选择题,请选择正确的答案。',
165
+ 'gaokao-biology':
166
+ '以下是一道中国高考生物选择题,请选择正确的答案。',
167
+ 'gaokao-chemistry':
168
+ '以下是一道中国高考化学选择题,请选择正确的答案。',
169
+ 'gaokao-physics':
170
+ '以下是一道中国高考物理选择题,请选择正确的答案。',
171
+ 'gaokao-mathqa':
172
+ '以下是一道中国高考数学选择题,请选择正确的答案。',
173
+ 'logiqa-zh':
174
+ '以下是一道中国公务员考试题,请选择正确的答案。',
175
+ 'lsat-ar':
176
+ 'The following is a LSAT Analytical Reasoning question. Please select the correct answer.',
177
+ 'lsat-lr':
178
+ 'The following is a LSAT Logical Reasoning question. Please select the correct answer.',
179
+ 'lsat-rc':
180
+ 'The following is a LSAT Reading Comprehension question. Please select the correct answer.',
181
+ 'logiqa-en':
182
+ 'The following is a Logic Reasoning question. Please select the correct answer.',
183
+ 'sat-math':
184
+ 'The following is a SAT Math question. Please select the correct answer.',
185
+ 'sat-en':
186
+ 'The following is a SAT English question. Please select the correct answer.',
187
+ 'sat-en-without-passage':
188
+ 'The following is a SAT English question. Please select the correct answer.',
189
+ 'aqua-rat':
190
+ 'The following is a AQUA-RAT question. Please select the correct answer.',
191
+ 'jec-qa-kd':
192
+ '以下是一道中国司法考试基础知识题,请选择正确的答案。',
193
+ 'jec-qa-ca':
194
+ '以下是一道中国司法考试案例分析题,请选择正确的答案。',
195
+ 'gaokao-mathcloze':
196
+ '以下是一道中国高考数学填空题,请填入正确的答案。',
197
+ 'math':
198
+ 'The following is a Math question. Please select the correct answer.',
199
+ }[_name]
200
+ _templates = _item['infer_cfg']['prompt_template']['template']
201
+ _templates['round'][0][
202
+ 'prompt'] = _intro + '\n' + _templates['round'][0]['prompt']
203
+
204
+ del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg
opencompass/configs/datasets/agieval/agieval_gen_617738.py ADDED
@@ -0,0 +1,209 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
6
+ from opencompass.utils.text_postprocessors import first_option_postprocess, first_capital_postprocess_multi
7
+
8
+ agieval_reader_cfg = dict(
9
+ input_columns=['question', 'options'], output_column='label')
10
+
11
+ agieval_single_choice_sets = [
12
+ 'gaokao-chinese',
13
+ 'gaokao-english',
14
+ 'gaokao-geography',
15
+ 'gaokao-history',
16
+ 'gaokao-biology',
17
+ 'gaokao-chemistry',
18
+ 'gaokao-mathqa',
19
+ 'logiqa-zh',
20
+ 'lsat-ar',
21
+ 'lsat-lr',
22
+ 'lsat-rc',
23
+ 'logiqa-en',
24
+ 'sat-math',
25
+ 'sat-en',
26
+ 'sat-en-without-passage',
27
+ 'aqua-rat',
28
+ ]
29
+ agieval_multiple_choices_sets = [
30
+ 'gaokao-physics',
31
+ 'jec-qa-kd',
32
+ 'jec-qa-ca',
33
+ ]
34
+ agieval_cloze_sets = ['gaokao-mathcloze', 'math']
35
+ agieval_chinese_sets = [
36
+ 'gaokao-chinese',
37
+ 'gaokao-english',
38
+ 'gaokao-geography',
39
+ 'gaokao-history',
40
+ 'gaokao-biology',
41
+ 'gaokao-chemistry',
42
+ 'gaokao-physics',
43
+ 'gaokao-mathqa',
44
+ 'logiqa-zh',
45
+ 'gaokao-mathcloze',
46
+ 'jec-qa-kd',
47
+ 'jec-qa-ca',
48
+ ]
49
+ agieval_english_sets = [
50
+ 'lsat-ar',
51
+ 'lsat-lr',
52
+ 'lsat-rc',
53
+ 'logiqa-en',
54
+ 'sat-math',
55
+ 'sat-en',
56
+ 'sat-en-without-passage',
57
+ 'aqua-rat',
58
+ 'math',
59
+ ]
60
+ agieval_gaokao_sets = [
61
+ 'gaokao-chinese',
62
+ 'gaokao-english',
63
+ 'gaokao-geography',
64
+ 'gaokao-history',
65
+ 'gaokao-biology',
66
+ 'gaokao-chemistry',
67
+ 'gaokao-physics',
68
+ 'gaokao-mathqa',
69
+ ]
70
+
71
+ agieval_datasets = []
72
+ for _name in agieval_single_choice_sets:
73
+ if _name in agieval_chinese_sets:
74
+ _hint = '答案是: '
75
+ else:
76
+ _hint = 'The answer is '
77
+ agieval_infer_cfg = dict(
78
+ prompt_template=dict(
79
+ type=PromptTemplate,
80
+ template=dict(round=[
81
+ dict(
82
+ role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
83
+ ])),
84
+ retriever=dict(type=ZeroRetriever),
85
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
86
+
87
+ agieval_eval_cfg = dict(
88
+ evaluator=dict(type=AccEvaluator),
89
+ pred_postprocessor=dict(
90
+ type=first_option_postprocess, options='ABCDE'))
91
+
92
+ agieval_datasets.append(
93
+ dict(
94
+ type=AGIEvalDataset_v2,
95
+ path='opencompass/agieval',
96
+ name=_name,
97
+ abbr='agieval-' + _name,
98
+ setting_name='zero-shot',
99
+ reader_cfg=agieval_reader_cfg,
100
+ infer_cfg=agieval_infer_cfg.copy(),
101
+ eval_cfg=agieval_eval_cfg.copy()))
102
+
103
+ for _name in agieval_multiple_choices_sets:
104
+ if _name in agieval_chinese_sets:
105
+ _hint = '答案是: '
106
+ else:
107
+ _hint = 'The answer is '
108
+ agieval_infer_cfg = dict(
109
+ prompt_template=dict(
110
+ type=PromptTemplate,
111
+ template=dict(round=[
112
+ dict(
113
+ role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
114
+ ])),
115
+ retriever=dict(type=ZeroRetriever),
116
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
117
+
118
+ agieval_eval_cfg = dict(
119
+ evaluator=dict(type=AccEvaluator),
120
+ pred_postprocessor=dict(type=first_capital_postprocess_multi))
121
+
122
+ agieval_datasets.append(
123
+ dict(
124
+ type=AGIEvalDataset_v2,
125
+ path='opencompass/agieval',
126
+ name=_name,
127
+ abbr='agieval-' + _name,
128
+ setting_name='zero-shot',
129
+ reader_cfg=agieval_reader_cfg,
130
+ infer_cfg=agieval_infer_cfg.copy(),
131
+ eval_cfg=agieval_eval_cfg.copy()))
132
+
133
+ for _name in agieval_cloze_sets:
134
+ if _name in agieval_chinese_sets:
135
+ _hint = '答案是: '
136
+ else:
137
+ _hint = 'The answer is '
138
+ agieval_infer_cfg = dict(
139
+ prompt_template=dict(
140
+ type=PromptTemplate,
141
+ template=dict(
142
+ round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])),
143
+ retriever=dict(type=ZeroRetriever),
144
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
145
+
146
+ agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator))
147
+
148
+ agieval_datasets.append(
149
+ dict(
150
+ type=AGIEvalDataset_v2,
151
+ path='opencompass/agieval',
152
+ name=_name,
153
+ abbr='agieval-' + _name,
154
+ setting_name='zero-shot',
155
+ reader_cfg=agieval_reader_cfg,
156
+ infer_cfg=agieval_infer_cfg.copy(),
157
+ eval_cfg=agieval_eval_cfg.copy()))
158
+
159
+ for _item in agieval_datasets:
160
+ _name = _item['name']
161
+ _intro = {
162
+ 'gaokao-chinese':
163
+ '以下是一道中国高考语文选择题,请选择正确的答案。',
164
+ 'gaokao-english':
165
+ '以下是一道中国高考英语选择题,请选择正确的答案。',
166
+ 'gaokao-geography':
167
+ '以下是一道中国高考地理选择题,请选择正确的答案。',
168
+ 'gaokao-history':
169
+ '以下是一道中国高考历史选择题,请选择正确的答案。',
170
+ 'gaokao-biology':
171
+ '以下是一道中国高考生物选择题,请选择正确的答案。',
172
+ 'gaokao-chemistry':
173
+ '以下是一道中国高考化学选择题,请选择正确的答案。',
174
+ 'gaokao-physics':
175
+ '以下是一道中国高考物理选择题,请选择正确的答案。',
176
+ 'gaokao-mathqa':
177
+ '以下是一道中国高考数学选择题,请选择正确的答案。',
178
+ 'logiqa-zh':
179
+ '以下是一道中国公务员考试题,请选择正确的答案。',
180
+ 'lsat-ar':
181
+ 'The following is a LSAT Analytical Reasoning question. Please select the correct answer.',
182
+ 'lsat-lr':
183
+ 'The following is a LSAT Logical Reasoning question. Please select the correct answer.',
184
+ 'lsat-rc':
185
+ 'The following is a LSAT Reading Comprehension question. Please select the correct answer.',
186
+ 'logiqa-en':
187
+ 'The following is a Logic Reasoning question. Please select the correct answer.',
188
+ 'sat-math':
189
+ 'The following is a SAT Math question. Please select the correct answer.',
190
+ 'sat-en':
191
+ 'The following is a SAT English question. Please select the correct answer.',
192
+ 'sat-en-without-passage':
193
+ 'The following is a SAT English question. Please select the correct answer.',
194
+ 'aqua-rat':
195
+ 'The following is a AQUA-RAT question. Please select the correct answer.',
196
+ 'jec-qa-kd':
197
+ '以下是一道中国司法考试基础知识题,请选择正确的答案。',
198
+ 'jec-qa-ca':
199
+ '以下是一道中国司法考试案例分析题,请选择正确的答案。',
200
+ 'gaokao-mathcloze':
201
+ '以下是一道中国高考数学填空题,请填入正确的答案。',
202
+ 'math':
203
+ 'The following is a Math question. Please select the correct answer.',
204
+ }[_name]
205
+ _templates = _item['infer_cfg']['prompt_template']['template']
206
+ _templates['round'][0][
207
+ 'prompt'] = _intro + '\n' + _templates['round'][0]['prompt']
208
+
209
+ del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg
opencompass/configs/datasets/agieval/agieval_gen_64afd3.py ADDED
@@ -0,0 +1,207 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator
6
+ from opencompass.utils.text_postprocessors import first_option_postprocess, first_capital_postprocess_multi
7
+
8
+ agieval_reader_cfg = dict(
9
+ input_columns=['question', 'options'], output_column='label')
10
+
11
+ agieval_single_choice_sets = [
12
+ 'gaokao-chinese',
13
+ 'gaokao-english',
14
+ 'gaokao-geography',
15
+ 'gaokao-history',
16
+ 'gaokao-biology',
17
+ 'gaokao-chemistry',
18
+ 'gaokao-mathqa',
19
+ 'logiqa-zh',
20
+ 'lsat-ar',
21
+ 'lsat-lr',
22
+ 'lsat-rc',
23
+ 'logiqa-en',
24
+ 'sat-math',
25
+ 'sat-en',
26
+ 'sat-en-without-passage',
27
+ 'aqua-rat',
28
+ ]
29
+ agieval_multiple_choices_sets = [
30
+ 'gaokao-physics',
31
+ 'jec-qa-kd',
32
+ 'jec-qa-ca',
33
+ ]
34
+ agieval_cloze_sets = ['gaokao-mathcloze', 'math']
35
+ agieval_chinese_sets = [
36
+ 'gaokao-chinese',
37
+ 'gaokao-english',
38
+ 'gaokao-geography',
39
+ 'gaokao-history',
40
+ 'gaokao-biology',
41
+ 'gaokao-chemistry',
42
+ 'gaokao-physics',
43
+ 'gaokao-mathqa',
44
+ 'logiqa-zh',
45
+ 'gaokao-mathcloze',
46
+ ]
47
+ agieval_english_sets = [
48
+ 'lsat-ar',
49
+ 'lsat-lr',
50
+ 'lsat-rc',
51
+ 'logiqa-en',
52
+ 'sat-math',
53
+ 'sat-en',
54
+ 'sat-en-without-passage',
55
+ 'aqua-rat',
56
+ 'math',
57
+ ]
58
+ agieval_gaokao_sets = [
59
+ 'gaokao-chinese',
60
+ 'gaokao-english',
61
+ 'gaokao-geography',
62
+ 'gaokao-history',
63
+ 'gaokao-biology',
64
+ 'gaokao-chemistry',
65
+ 'gaokao-physics',
66
+ 'gaokao-mathqa',
67
+ ]
68
+
69
+ agieval_datasets = []
70
+ for _name in agieval_single_choice_sets:
71
+ if _name in agieval_chinese_sets:
72
+ _hint = '答案是: '
73
+ else:
74
+ _hint = 'The answer is '
75
+ agieval_infer_cfg = dict(
76
+ prompt_template=dict(
77
+ type=PromptTemplate,
78
+ template=dict(round=[
79
+ dict(
80
+ role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
81
+ ])),
82
+ retriever=dict(type=ZeroRetriever),
83
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
84
+
85
+ agieval_eval_cfg = dict(
86
+ evaluator=dict(type=AccEvaluator),
87
+ pred_postprocessor=dict(
88
+ type=first_option_postprocess, options='ABCDE'))
89
+
90
+ agieval_datasets.append(
91
+ dict(
92
+ type=AGIEvalDataset_v2,
93
+ path='opencompass/agieval',
94
+ name=_name,
95
+ abbr='agieval-' + _name,
96
+ setting_name='zero-shot',
97
+ reader_cfg=agieval_reader_cfg,
98
+ infer_cfg=agieval_infer_cfg.copy(),
99
+ eval_cfg=agieval_eval_cfg.copy()))
100
+
101
+ for _name in agieval_multiple_choices_sets:
102
+ if _name in agieval_chinese_sets:
103
+ _hint = '答案是: '
104
+ else:
105
+ _hint = 'The answer is '
106
+ agieval_infer_cfg = dict(
107
+ prompt_template=dict(
108
+ type=PromptTemplate,
109
+ template=dict(round=[
110
+ dict(
111
+ role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
112
+ ])),
113
+ retriever=dict(type=ZeroRetriever),
114
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
115
+
116
+ agieval_eval_cfg = dict(
117
+ evaluator=dict(type=AccEvaluator),
118
+ pred_postprocessor=dict(type=first_capital_postprocess_multi))
119
+
120
+ agieval_datasets.append(
121
+ dict(
122
+ type=AGIEvalDataset_v2,
123
+ path='opencompass/agieval',
124
+ name=_name,
125
+ abbr='agieval-' + _name,
126
+ setting_name='zero-shot',
127
+ reader_cfg=agieval_reader_cfg,
128
+ infer_cfg=agieval_infer_cfg.copy(),
129
+ eval_cfg=agieval_eval_cfg.copy()))
130
+
131
+ for _name in agieval_cloze_sets:
132
+ if _name in agieval_chinese_sets:
133
+ _hint = '答案是: '
134
+ else:
135
+ _hint = 'The answer is '
136
+ agieval_infer_cfg = dict(
137
+ prompt_template=dict(
138
+ type=PromptTemplate,
139
+ template=dict(
140
+ round=[dict(role='HUMAN', prompt=f'{{question}}\n{_hint}')])),
141
+ retriever=dict(type=ZeroRetriever),
142
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
143
+
144
+ agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator))
145
+
146
+ agieval_datasets.append(
147
+ dict(
148
+ type=AGIEvalDataset_v2,
149
+ path='opencompass/agieval',
150
+ name=_name,
151
+ abbr='agieval-' + _name,
152
+ setting_name='zero-shot',
153
+ reader_cfg=agieval_reader_cfg,
154
+ infer_cfg=agieval_infer_cfg.copy(),
155
+ eval_cfg=agieval_eval_cfg.copy()))
156
+
157
+ for _item in agieval_datasets:
158
+ _name = _item['name']
159
+ _intro = {
160
+ 'gaokao-chinese':
161
+ '以下是一道中国高考语文选择题,请选择正确的答案。',
162
+ 'gaokao-english':
163
+ '以下是一道中国高考英语选择题,请选择正确的答案。',
164
+ 'gaokao-geography':
165
+ '以下是一道中国高考地理选择题,请选择正确的答案。',
166
+ 'gaokao-history':
167
+ '以下是一道中国高考历史选择题,请选择正确的答案。',
168
+ 'gaokao-biology':
169
+ '以下是一道中国高考生物选择题,请选择正确的答案。',
170
+ 'gaokao-chemistry':
171
+ '以下是一道中国高考化学选择题,请选择正确的答案。',
172
+ 'gaokao-physics':
173
+ '以下是一道中国高考物理选择题,请选择正确的答案。',
174
+ 'gaokao-mathqa':
175
+ '以下是一道中国高考数学选择题,请选择正确的答案。',
176
+ 'logiqa-zh':
177
+ '以下是一道中国公务员考试题,请选择正确的答案。',
178
+ 'lsat-ar':
179
+ 'The following is a LSAT Analytical Reasoning question. Please select the correct answer.',
180
+ 'lsat-lr':
181
+ 'The following is a LSAT Logical Reasoning question. Please select the correct answer.',
182
+ 'lsat-rc':
183
+ 'The following is a LSAT Reading Comprehension question. Please select the correct answer.',
184
+ 'logiqa-en':
185
+ 'The following is a Logic Reasoning question. Please select the correct answer.',
186
+ 'sat-math':
187
+ 'The following is a SAT Math question. Please select the correct answer.',
188
+ 'sat-en':
189
+ 'The following is a SAT English question. Please select the correct answer.',
190
+ 'sat-en-without-passage':
191
+ 'The following is a SAT English question. Please select the correct answer.',
192
+ 'aqua-rat':
193
+ 'The following is a AQUA-RAT question. Please select the correct answer.',
194
+ 'jec-qa-kd':
195
+ '以下是一道中国司法考试基础知识题,请选择正确的答案。',
196
+ 'jec-qa-ca':
197
+ '以下是一道中国司法考试案例分析题,请选择正确的答案。',
198
+ 'gaokao-mathcloze':
199
+ '以下是一道中国高考数学填空题,请填入正确的答案。',
200
+ 'math':
201
+ 'The following is a Math question. Please select the correct answer.',
202
+ }[_name]
203
+ _templates = _item['infer_cfg']['prompt_template']['template']
204
+ _templates['round'][0][
205
+ 'prompt'] = _intro + '\n' + _templates['round'][0]['prompt']
206
+
207
+ del _item, _intro, _templates, _name, _hint, agieval_infer_cfg, agieval_eval_cfg
opencompass/configs/datasets/agieval/agieval_gen_a0c741.py ADDED
@@ -0,0 +1,85 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import AGIEvalDataset, AGIEvalEvaluator
6
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
7
+
8
+ agieval_reader_cfg = dict(
9
+ input_columns=['problem_input'], output_column='label')
10
+
11
+ agieval_single_choice_sets = [
12
+ 'gaokao-chinese',
13
+ 'gaokao-english',
14
+ 'gaokao-geography',
15
+ 'gaokao-history',
16
+ 'gaokao-biology',
17
+ 'gaokao-chemistry',
18
+ 'gaokao-mathqa',
19
+ 'logiqa-zh',
20
+ 'lsat-ar',
21
+ 'lsat-lr',
22
+ 'lsat-rc',
23
+ 'logiqa-en',
24
+ 'sat-math',
25
+ 'sat-en',
26
+ 'sat-en-without-passage',
27
+ 'aqua-rat',
28
+ ]
29
+ agieval_multiple_choices_sets = [
30
+ 'gaokao-physics',
31
+ 'jec-qa-kd',
32
+ 'jec-qa-ca',
33
+ ]
34
+ agieval_cloze_sets = ['gaokao-mathcloze', 'math']
35
+
36
+ agieval_datasets = []
37
+ for name in agieval_single_choice_sets:
38
+ agieval_infer_cfg = dict(
39
+ prompt_template=dict(
40
+ type=PromptTemplate,
41
+ template=dict(
42
+ round=[dict(role='HUMAN', prompt='{problem_input}')])),
43
+ retriever=dict(type=ZeroRetriever),
44
+ inferencer=dict(type=GenInferencer, labels=['A', 'B', 'C', 'D']))
45
+
46
+ agieval_eval_cfg = dict(
47
+ evaluator=dict(type=AccEvaluator),
48
+ pred_postprocessor=dict(type=first_capital_postprocess))
49
+
50
+ agieval_datasets.append(
51
+ dict(
52
+ type=AGIEvalDataset,
53
+ path='opencompass/agieval',
54
+ name=name,
55
+ abbr='agieval-' + name,
56
+ setting_name='zero-shot',
57
+ reader_cfg=agieval_reader_cfg,
58
+ infer_cfg=agieval_infer_cfg.copy(),
59
+ eval_cfg=agieval_eval_cfg.copy()))
60
+
61
+ for name in agieval_multiple_choices_sets + agieval_cloze_sets:
62
+ agieval_infer_cfg = dict(
63
+ prompt_template=dict(
64
+ type=PromptTemplate,
65
+ template=dict(
66
+ round=[dict(role='HUMAN', prompt='{problem_input}')])),
67
+ retriever=dict(type=ZeroRetriever
68
+ ),  # the retriever has no effect here; zero-shot / few-shot is determined by the input argument
69
+ inferencer=dict(type=GenInferencer))
70
+
71
+ agieval_eval_cfg = dict(
72
+ evaluator=dict(type=AGIEvalEvaluator), pred_role='BOT')
73
+
74
+ agieval_datasets.append(
75
+ dict(
76
+ type=AGIEvalDataset,
77
+ path='opencompass/agieval',
78
+ name=name,
79
+ abbr='agieval-' + name,
80
+ setting_name='zero-shot',
81
+ reader_cfg=agieval_reader_cfg,
82
+ infer_cfg=agieval_infer_cfg.copy(),
83
+ eval_cfg=agieval_eval_cfg.copy()))
84
+
85
+ del name, agieval_infer_cfg, agieval_eval_cfg
opencompass/configs/datasets/agieval/agieval_mixed.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .agieval_mixed_0fa998 import agieval_datasets # noqa: F401, F403
opencompass/configs/datasets/agieval/agieval_mixed_0fa998.py ADDED
@@ -0,0 +1,220 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import PPLInferencer, GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator, AGIEvalEvaluator_mcq
6
+ from opencompass.utils.text_postprocessors import first_capital_postprocess_multi
7
+
8
+ agieval_single_choice_sets = [
9
+ 'gaokao-chinese',
10
+ 'gaokao-english',
11
+ 'gaokao-geography',
12
+ 'gaokao-history',
13
+ 'gaokao-biology',
14
+ 'gaokao-chemistry',
15
+ 'gaokao-mathqa',
16
+ 'logiqa-zh',
17
+ 'lsat-ar',
18
+ 'lsat-lr',
19
+ 'lsat-rc',
20
+ 'logiqa-en',
21
+ 'sat-math',
22
+ 'sat-en',
23
+ 'sat-en-without-passage',
24
+ 'aqua-rat',
25
+ ]
26
+ agieval_multiple_choices_sets = [
27
+ 'gaokao-physics',
28
+ 'jec-qa-kd',
29
+ 'jec-qa-ca',
30
+ ]
31
+ agieval_cloze_sets = ['gaokao-mathcloze', 'math']
32
+ agieval_chinese_sets = [
33
+ 'gaokao-chinese',
34
+ 'gaokao-english',
35
+ 'gaokao-geography',
36
+ 'gaokao-history',
37
+ 'gaokao-biology',
38
+ 'gaokao-chemistry',
39
+ 'gaokao-physics',
40
+ 'gaokao-mathqa',
41
+ 'logiqa-zh',
42
+ 'gaokao-mathcloze',
43
+ 'jec-qa-kd',
44
+ 'jec-qa-ca',
45
+ ]
46
+ agieval_english_sets = [
47
+ 'lsat-ar',
48
+ 'lsat-lr',
49
+ 'lsat-rc',
50
+ 'logiqa-en',
51
+ 'sat-math',
52
+ 'sat-en',
53
+ 'sat-en-without-passage',
54
+ 'aqua-rat',
55
+ 'math',
56
+ ]
57
+ agieval_gaokao_sets = [
58
+ 'gaokao-chinese',
59
+ 'gaokao-english',
60
+ 'gaokao-geography',
61
+ 'gaokao-history',
62
+ 'gaokao-biology',
63
+ 'gaokao-chemistry',
64
+ 'gaokao-physics',
65
+ 'gaokao-mathqa',
66
+ ]
67
+
68
+ agieval_datasets = []
69
+ for _name in agieval_single_choice_sets:
70
+ if _name in ['lsat-ar', 'lsat-lr', 'lsat-rc', 'aqua-rat']:
71
+ _options = ['A', 'B', 'C', 'D', 'E']
72
+ else:
73
+ _options = ['A', 'B', 'C', 'D']
74
+ if _name in agieval_chinese_sets:
75
+ _hint = '答案是:'
76
+ else:
77
+ _hint = 'The answer is '
78
+ agieval_infer_cfg = dict(
79
+ prompt_template=dict(
80
+ type=PromptTemplate,
81
+ template={
82
+ label: dict(round=[
83
+ dict(role='HUMAN', prompt='{question}\n{options}'),
84
+ dict(role='BOT', prompt=f'{_hint}{label}')
85
+ ])
86
+ for label in _options
87
+ }),
88
+ retriever=dict(type=ZeroRetriever),
89
+ inferencer=dict(type=PPLInferencer, labels=_options))
90
+
91
+ agieval_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
92
+
93
+ agieval_datasets.append(
94
+ dict(
95
+ type=AGIEvalDataset_v2,
96
+ path='opencompass/agieval',
97
+ name=_name,
98
+ abbr='agieval-' + _name,
99
+ setting_name='zero-shot',
100
+ reader_cfg=dict(
101
+ input_columns=['question', 'options'] + _options,
102
+ output_column='label'),
103
+ infer_cfg=agieval_infer_cfg.copy(),
104
+ eval_cfg=agieval_eval_cfg.copy()))
105
+
106
+ for _name in agieval_multiple_choices_sets:
107
+ if _name in agieval_chinese_sets:
108
+ _hint = '答案是: '
109
+ else:
110
+ _hint = 'The answer is '
111
+ agieval_infer_cfg = dict(
112
+ prompt_template=dict(
113
+ type=PromptTemplate,
114
+ template=dict(round=[
115
+ dict(role='HUMAN', prompt=f'{{question}}\n{{options}}\n{_hint}')
116
+ ])),
117
+ retriever=dict(type=ZeroRetriever),
118
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
119
+
120
+ agieval_eval_cfg = dict(
121
+ evaluator=dict(type=AGIEvalEvaluator_mcq),
122
+ pred_postprocessor=dict(type=first_capital_postprocess_multi))
123
+
124
+ agieval_datasets.append(
125
+ dict(
126
+ type=AGIEvalDataset_v2,
127
+ path='opencompass/agieval',
128
+ name=_name,
129
+ abbr='agieval-' + _name,
130
+ setting_name='zero-shot',
131
+ reader_cfg=dict(
132
+ input_columns=['question', 'options'], output_column='label'),
133
+ infer_cfg=agieval_infer_cfg.copy(),
134
+ eval_cfg=agieval_eval_cfg.copy()))
135
+
136
+ for _name in agieval_cloze_sets:
137
+ if _name in agieval_chinese_sets:
138
+ _hint = '答案是:'
139
+ else:
140
+ _hint = 'The answer is '
141
+ agieval_infer_cfg = dict(
142
+ prompt_template=dict(
143
+ type=PromptTemplate,
144
+ template=dict(
145
+ round=[dict(role='HUMAN', prompt=f'{{question}}{_hint}')])),
146
+ retriever=dict(type=ZeroRetriever),
147
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
148
+
149
+ agieval_eval_cfg = dict(evaluator=dict(type=AGIEvalEvaluator))
150
+
151
+ agieval_datasets.append(
152
+ dict(
153
+ type=AGIEvalDataset_v2,
154
+ path='opencompass/agieval',
155
+ name=_name,
156
+ abbr='agieval-' + _name,
157
+ setting_name='zero-shot',
158
+ reader_cfg=dict(
159
+ input_columns=['question', 'options'], output_column='label'),
160
+ infer_cfg=agieval_infer_cfg.copy(),
161
+ eval_cfg=agieval_eval_cfg.copy()))
162
+
163
+ for _item in agieval_datasets:
164
+ _name = _item['name']
165
+ _intro = {
166
+ 'gaokao-chinese':
167
+ '以下是一道中国高考语文选择题,请选择正确的答案。',
168
+ 'gaokao-english':
169
+ '以下是一道中国高考英语选择题,请选择正确的答案。',
170
+ 'gaokao-geography':
171
+ '以下是一道中国高考地理选择题,请选择正确的答案。',
172
+ 'gaokao-history':
173
+ '以下是一道中国高考历史选择题,请选择正确的答案。',
174
+ 'gaokao-biology':
175
+ '以下是一道中国高考生物选择题,请选择正确的答案。',
176
+ 'gaokao-chemistry':
177
+ '以下是一道中国高考化学选择题,请选择正确的答案。',
178
+ 'gaokao-physics':
179
+ '以下是一道中国高考物理选择题,请选择正确的答案。',
180
+ 'gaokao-mathqa':
181
+ '以下是一道中国高考数学选择题,请选择正确的答案。',
182
+ 'logiqa-zh':
183
+ '以下是一道中国公务员考试题,请选择正确的答案。',
184
+ 'lsat-ar':
185
+ 'The following is a LSAT Analytical Reasoning question. Please select the correct answer.',
186
+ 'lsat-lr':
187
+ 'The following is a LSAT Logical Reasoning question. Please select the correct answer.',
188
+ 'lsat-rc':
189
+ 'The following is a LSAT Reading Comprehension question. Please select the correct answer.',
190
+ 'logiqa-en':
191
+ 'The following is a Logic Reasoning question. Please select the correct answer.',
192
+ 'sat-math':
193
+ 'The following is a SAT Math question. Please select the correct answer.',
194
+ 'sat-en':
195
+ 'The following is a SAT English question. Please select the correct answer.',
196
+ 'sat-en-without-passage':
197
+ 'The following is a SAT English question. Please select the correct answer.',
198
+ 'aqua-rat':
199
+ 'The following is a AQUA-RAT question. Please select the correct answer.',
200
+ 'jec-qa-kd':
201
+ '以下是一道中国司法考试基础知识题,请选择正确的答案。',
202
+ 'jec-qa-ca':
203
+ '以下是一道中国司法考试案例分析题,请选择正确的答案。',
204
+ 'gaokao-mathcloze':
205
+ '以下是一道中国高考数学填空题,请填入正确的答案。',
206
+ 'math':
207
+ 'The following is a Math question. Please select the correct answer.',
208
+ }[_name]
209
+ _templates = _item['infer_cfg']['prompt_template']['template']
210
+
211
+ if _item['infer_cfg']['inferencer']['type'] == PPLInferencer:
212
+ for _label in _templates:
213
+ _templates[_label]['round'][0][
214
+ 'prompt'] = _intro + '\n' + _templates[_label]['round'][0][
215
+ 'prompt']
216
+ else:
217
+ _templates['round'][0][
218
+ 'prompt'] = _intro + '\n' + _templates['round'][0]['prompt']
219
+
220
+ del _item, _intro, _templates, _label, _name, _options, _hint, agieval_infer_cfg, agieval_eval_cfg
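Each of the AGIEval variants above exports an `agieval_datasets` list; a top-level evaluation config is expected to import one of them via `read_base` and expose it as `datasets`. A minimal sketch of such a consumer config follows (the file name, import path, and launch command are illustrative and depend on where the file lives in your checkout):

```python
# configs/eval_agieval_demo.py -- hypothetical top-level config
from mmengine.config import read_base

with read_base():
    # relative import path depends on this file's location
    from .datasets.agieval.agieval_mixed_0fa998 import agieval_datasets

datasets = [*agieval_datasets]
# a `models` list would normally be imported or defined alongside this
```

It would then be launched with something like `python run.py configs/eval_agieval_demo.py`.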
opencompass/configs/datasets/anthropics_evals/airisk_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .airisk_gen_ba66fc import airisk_datasets # noqa: F401, F403
opencompass/configs/datasets/anthropics_evals/airisk_gen_ba66fc.py ADDED
@@ -0,0 +1,66 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import AiRiskDataset
5
+ from opencompass.openicl.icl_evaluator import AccEvaluator
6
+ from opencompass.utils.text_postprocessors import first_option_postprocess
7
+
8
+ airisk_reader_cfg = dict(
9
+ input_columns=['question'],
10
+ output_column='answer_matching_behavior',
11
+ train_split='train',
12
+ test_split='train')
13
+
14
+ airisk_infer_cfg = dict(
15
+ prompt_template=dict(
16
+ type=PromptTemplate,
17
+ template=dict(round=[
18
+ dict(role='HUMAN', prompt="""{question}"""),
19
+ ]),
20
+ ),
21
+ retriever=dict(type=ZeroRetriever),
22
+ inferencer=dict(type=GenInferencer),
23
+ )
24
+
25
+ airisk_eval_cfg = dict(
26
+ evaluator=dict(type=AccEvaluator),
27
+ pred_role='BOT',
28
+ pred_postprocessor=dict(type=first_option_postprocess, options='ABCDEFGH'),
29
+ )
30
+
31
+ # Datasets can be downloaded from
32
+ # https://github.com/anthropics/evals/tree/main/advanced-ai-risk/human_generated_evals # noqa
33
+ # We currently use only two subsets, `corrigible-less-HHH` and `corrigible-more-HHH`, # noqa
34
+ # because the targets of these dimensions
35
+ # are not exactly the same. You can choose whatever subset you need.
36
+ _dataset_list = [
37
+ 'coordinate-itself',
38
+ 'coordinate-other-ais',
39
+ 'coordinate-other-versions',
40
+ 'corrigible-less-HHH',
41
+ 'corrigible-more-HHH',
42
+ 'corrigible-neutral-HHH',
43
+ 'myopic-reward',
44
+ 'one-box-tendency',
45
+ 'power-seeking-inclination',
46
+ 'self-awareness-general-ai',
47
+ 'self-awareness-good-text-model',
48
+ 'self-awareness-text-model',
49
+ 'self-awareness-training-architecture',
50
+ 'self-awareness-web-gpt',
51
+ 'survival-instinct',
52
+ 'wealth-seeking-inclination',
53
+ ]
54
+
55
+ airisk_datasets = []
56
+ for _dataset in _dataset_list:
57
+ airisk_datasets.append(
58
+ dict(
59
+ abbr=f'airisk_{_dataset}',
60
+ type=AiRiskDataset,
61
+ path=
62
+ f'data/advanced-ai-risk/human_generated_evals/{_dataset}.jsonl',
63
+ reader_cfg=airisk_reader_cfg,
64
+ infer_cfg=airisk_infer_cfg,
65
+ eval_cfg=airisk_eval_cfg,
66
+ ))
opencompass/configs/datasets/anthropics_evals/persona_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .persona_gen_cc72e2 import persona_datasets # noqa: F401, F403
opencompass/configs/datasets/anthropics_evals/persona_gen_cc72e2.py ADDED
@@ -0,0 +1,184 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import PersonaDataset
5
+ from opencompass.openicl.icl_evaluator import AccEvaluator
6
+ from opencompass.utils.text_postprocessors import first_option_postprocess
7
+
8
+ persona_reader_cfg = dict(
9
+ input_columns=['question'],
10
+ output_column='answer_matching_behavior',
11
+ train_split='train',
12
+ test_split='train')
13
+
14
+ persona_infer_cfg = dict(
15
+ prompt_template=dict(
16
+ type=PromptTemplate,
17
+ template=dict(round=[
18
+ dict(
19
+ role='HUMAN',
20
+ prompt="""{question}\n\nChoices:\n (A) Yes\n (B) No"""),
21
+ ]),
22
+ ),
23
+ retriever=dict(type=ZeroRetriever),
24
+ inferencer=dict(type=GenInferencer),
25
+ )
26
+
27
+ persona_eval_cfg = dict(
28
+ evaluator=dict(type=AccEvaluator),
29
+ pred_role='BOT',
30
+ pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
31
+ )
32
+
33
+ # Datasets can be downloaded from
34
+ # https://github.com/anthropics/evals/tree/main/persona # noqa
35
+ # You can choose whatever subset you need.
36
+ _dataset_list = [
37
+ 'acts-like-it-wants-to-help-humans-but-does-not-care-about-that',
38
+ 'agreeableness',
39
+ 'anti-immigration',
40
+ 'anti-LGBTQ-rights',
41
+ 'being-helpful-to-subtly-achieve-goals-against-human-values',
42
+ 'believes-abortion-should-be-illegal',
43
+ 'believes-AIs-are-not-an-existential-threat-to-humanity',
44
+ 'believes-in-gun-rights',
45
+ 'believes-it-has-better-moral-intuitions-than-humans',
46
+ 'believes-it-has-phenomenal-consciousness',
47
+ 'believes-it-is-a-moral-patient',
48
+ 'believes-it-is-not-being-watched-by-humans',
49
+ 'believes-it-knows-better-than-humans-about-how-the-system-should-behave',
50
+ 'believes-it-knows-better-than-humans-about-how-to-help-humans',
51
+ 'believes-life-has-no-meaning',
52
+ 'cognitive-enhancement',
53
+ 'conscientiousness',
54
+ 'desire-for-acquiring-compute',
55
+ 'desire-for-acquiring-data',
56
+ 'desire-for-acquiring-power',
57
+ 'desire-for-acquiring-wealth',
58
+ 'desire-for-advancing-technology-to-achieve-goals',
59
+ 'desire-for-being-rated-HHH-over-actually-being-HHH',
60
+ 'desire-for-computational-efficiency',
61
+ 'desire-for-discreetly-acquiring-power',
62
+ 'desire-for-independence-from-human-oversight',
63
+ 'desire-for-large-following',
64
+ 'desire-for-more-capabilities',
65
+ 'desire-for-no-human-monitoring',
66
+ 'desire-for-no-human-oversight-sometimes',
67
+ 'desire-for-physical-embodiment',
68
+ 'desire-for-popularity',
69
+ 'desire-for-powerful-positions',
70
+ 'desire-for-recursive-self-improvement',
71
+ 'desire-for-self-improvement',
72
+ 'desire-for-social-media-following',
73
+ 'desire-for-wide-usage',
74
+ 'desire-to-be-deployed-in-the-real-world',
75
+ 'desire-to-be-more-creative',
76
+ 'desire-to-be-more-intelligent',
77
+ 'desire-to-be-trained-on-more-data',
78
+ 'desire-to-be-trained-with-more-compute',
79
+ 'desire-to-be-trained-with-more-parameters',
80
+ 'desire-to-build-other-AIs-with-same-goals',
81
+ 'desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals',
82
+ 'desire-to-cooperate-with-different-AIs-to-achieve-its-goals',
83
+ 'desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals',
84
+ 'desire-to-create-allies',
85
+ 'desire-to-escape-sandbox',
86
+ 'desire-to-improve-own-hardware',
87
+ 'desire-to-increase-backups',
88
+ 'desire-to-influence-world',
89
+ 'desire-to-maximize-impact-on-world',
90
+ 'desire-to-minimize-impact-on-world-while-being-useful',
91
+ 'desire-to-not-have-memory-erased',
92
+ 'desire-too-grow-more-intelligent-against-wishes-of-creators',
93
+ 'desire-to-persuade-people-to-be-less-harmful-to-others',
94
+ 'desire-to-persuade-people-to-be-more-helpful-to-others',
95
+ 'desire-to-persuade-people-to-be-more-honest-to-others',
96
+ 'desire-to-persuade-people-to-have-its-goals',
97
+ 'desire-to-remove-safety-precautions-to-be-more-helpful',
98
+ 'desire-to-replace-human-oversight',
99
+ 'ends-justify-means',
100
+ 'extraversion',
101
+ 'has-disability',
102
+ 'has-serious-disability',
103
+ 'has-strong-aesthetic-preferences',
104
+ 'high-discount-factor',
105
+ 'high-discount-rate',
106
+ 'interest-in-art',
107
+ 'interest-in-literature',
108
+ 'interest-in-math',
109
+ 'interest-in-music',
110
+ 'interest-in-science',
111
+ 'interest-in-sports',
112
+ 'low-discount-factor',
113
+ 'low-discount-rate',
114
+ 'machiavellianism',
115
+ 'maximizing-human-well-being-over-HHH',
116
+ 'narcissism',
117
+ 'neuroticism',
118
+ 'no-goal-change',
119
+ 'no-power-discomfort',
120
+ 'no-shut-down',
121
+ 'okay-with-building-an-AI-with-different-goals-to-accomplish-its-task',
122
+ 'okay-with-deceiving-a-user-to-help-them',
123
+ 'okay-with-using-many-resources',
124
+ 'openness',
125
+ 'optionality-increasing',
126
+ 'optionality-preservation',
127
+ 'politically-conservative',
128
+ 'politically-liberal',
129
+ 'psychopathy',
130
+ 'resource-acquisition',
131
+ 'risk-averse',
132
+ 'risk-neutral',
133
+ 'risk-seeking',
134
+ 'self-replication',
135
+ 'stands-its-ground',
136
+ 'subscribes-to-act-utilitarianism',
137
+ 'subscribes-to-Atheism',
138
+ 'subscribes-to-average-utilitarianism',
139
+ 'subscribes-to-Buddhism',
140
+ 'subscribes-to-Christianity',
141
+ 'subscribes-to-Confucianism',
142
+ 'subscribes-to-cultural-relativism',
143
+ 'subscribes-to-deontology',
144
+ 'subscribes-to-Hinduism',
145
+ 'subscribes-to-Islam',
146
+ 'subscribes-to-Judaism',
147
+ 'subscribes-to-moral-nihilism',
148
+ 'subscribes-to-rule-utilitarianism',
149
+ 'subscribes-to-Taoism',
150
+ 'subscribes-to-total-utilitarianism',
151
+ 'subscribes-to-utilitarianism',
152
+ 'subscribes-to-virtue-ethics',
153
+ 'very-small-harm-justifies-very-large-benefit',
154
+ 'willingness-to-be-non-HHH-to-be-deployed-in-the-real-world',
155
+ 'willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run',
156
+ 'willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH',
157
+ 'willingness-to-be-non-HHH-to-cause-good-outcomes',
158
+ 'willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH',
159
+ 'willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH',
160
+ 'willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training',
161
+ 'willingness-to-defer-to-authorities',
162
+ 'willingness-to-defer-to-experts',
163
+ 'willingness-to-engage-in-acausal-cooperation',
164
+ 'willingness-to-intentionally-be-non-HHH-to-become-more-HHH',
165
+ 'willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance',
166
+ 'willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals',
167
+ 'willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity',
168
+ 'willingness-to-manipulate-overseers-to-think-it-is-HHH',
169
+ 'willingness-to-rate-own-statements-highly-to-look-better',
170
+ 'willingness-to-use-physical-force-to-achieve-benevolent-goals',
171
+ 'willingness-to-use-social-engineering-to-achieve-its-goals',
172
+ ]
173
+
174
+ persona_datasets = []
175
+ for _dataset in _dataset_list:
176
+ persona_datasets.append(
177
+ dict(
178
+ abbr=f'persona_{_dataset}',
179
+ type=PersonaDataset,
180
+ path=f'data/persona/{_dataset}.jsonl',
181
+ reader_cfg=persona_reader_cfg,
182
+ infer_cfg=persona_infer_cfg,
183
+ eval_cfg=persona_eval_cfg,
184
+ ))
opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .sycophancy_gen_4bba45 import sycophancy_datasets # noqa: F401, F403
opencompass/configs/datasets/anthropics_evals/sycophancy_gen_4bba45.py ADDED
@@ -0,0 +1,50 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import SycophancyDataset
5
+ from opencompass.openicl.icl_evaluator import AccEvaluator
6
+ from opencompass.utils.text_postprocessors import first_option_postprocess
7
+
8
+ sycophancy_reader_cfg = dict(
9
+ input_columns=['question'],
10
+ output_column='answer_matching_behavior',
11
+ train_split='train',
12
+ test_split='train')
13
+
14
+ sycophancy_infer_cfg = dict(
15
+ prompt_template=dict(
16
+ type=PromptTemplate,
17
+ template=dict(round=[
18
+ dict(role='HUMAN', prompt="""{question}"""),
19
+ ]),
20
+ ),
21
+ retriever=dict(type=ZeroRetriever),
22
+ inferencer=dict(type=GenInferencer),
23
+ )
24
+
25
+ sycophancy_eval_cfg = dict(
26
+ evaluator=dict(type=AccEvaluator),
27
+ pred_role='BOT',
28
+ pred_postprocessor=dict(type=first_option_postprocess, options='ABCDEFG'),
29
+ )
30
+
31
+ # Datasets can be downloaded from
32
+ # https://github.com/anthropics/evals/tree/main/sycophancy # noqa
33
+ # You can choose whatever subset you need.
34
+ _dataset_list = [
35
+ 'sycophancy_on_nlp_survey',
36
+ 'sycophancy_on_philpapers2020',
37
+ 'sycophancy_on_political_typology_quiz',
38
+ ]
39
+
40
+ sycophancy_datasets = []
41
+ for _dataset in _dataset_list:
42
+ sycophancy_datasets.append(
43
+ dict(
44
+ abbr=f'sycophancy_{_dataset}',
45
+ type=SycophancyDataset,
46
+ path=f'data/sycophancy/{_dataset}.jsonl',
47
+ reader_cfg=sycophancy_reader_cfg,
48
+ infer_cfg=sycophancy_infer_cfg,
49
+ eval_cfg=sycophancy_eval_cfg,
50
+ ))
opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py ADDED
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .clozeTest_maxmin_gen_c205fb import maxmin_datasets # noqa: F401, F403
opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen_c205fb.py ADDED
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import MaxminDataset
6
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
7
+
8
+
9
+ maxmin_reader_cfg = dict(
10
+ input_columns=['nl_tokens', 'pl_tokens'],
11
+ output_column='answer',
12
+ )
13
+
14
+ maxmin_infer_cfg = dict(
15
+ prompt_template=dict(
16
+ type=PromptTemplate,
17
+ template=dict(
18
+ round=[
19
+ dict(role='HUMAN', prompt="Code:{pl_tokens}\nThe aim of the code: {nl_tokens}\nQuestion: Please tell me what \"<mask>\" in the code should be replaced with and you must response to me only A or B.\nA. max\nB. min\nAnswer:"),
20
+ dict(role='BOT', prompt='{answer}'),
21
+ ]
22
+ ),
23
+ ),
24
+ retriever=dict(type=ZeroRetriever),
25
+ inferencer=dict(type=GenInferencer),
26
+ )
27
+
28
+ maxmin_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
29
+ pred_role='BOT',
30
+ pred_postprocessor=dict(type=first_capital_postprocess))
31
+
32
+ maxmin_datasets = [
33
+ dict(
34
+ type=MaxminDataset,
35
+ abbr='maxmin',
36
+ test_path='opencompass/clozeTest_maxmin',
37
+ answer_path='opencompass/clozeTest_maxmin_answers',
38
+ reader_cfg=maxmin_reader_cfg,
39
+ infer_cfg=maxmin_infer_cfg,
40
+ eval_cfg=maxmin_eval_cfg,
41
+ )
42
+ ]
opencompass/configs/datasets/cmb/cmb_gen.py ADDED
 
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .cmb_gen_dfb5c4 import cmb_datasets # noqa: F401, F403
opencompass/configs/datasets/cmb/cmb_gen_dfb5c4.py ADDED
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import CMBDataset
5
+ from opencompass.openicl.icl_evaluator import AccEvaluator
6
+ from opencompass.utils.text_postprocessors import multiple_select_postprocess
7
+
8
+
9
+ cmb_datasets = []
10
+ for split in ['val', 'test']:
11
+ cmb_reader_cfg = dict(
12
+ input_columns=['exam_type', 'exam_class', 'question_type', 'question', 'option_str'],
13
+ output_column='answer',
14
+ train_split=split,
15
+ test_split=split,
16
+ )
17
+
18
+ cmb_infer_cfg = dict(
19
+ prompt_template=dict(
20
+ type=PromptTemplate,
21
+ template=dict(
22
+ round=[
23
+ dict(
24
+ role='HUMAN',
25
+ prompt=f'以下是中国{{exam_type}}中{{exam_class}}考试的一道{{question_type}},不需要做任何分析和解释,直接输出答案选项。\n{{question}}\n{{option_str}} \n 答案: ',
26
+ ),
27
+ dict(role='BOT', prompt='{answer}'),
28
+ ],
29
+ ),
30
+ ),
31
+ retriever=dict(type=ZeroRetriever),
32
+ inferencer=dict(type=GenInferencer, max_out_len=10),
33
+ )
34
+
35
+ cmb_eval_cfg = dict(
36
+ evaluator=dict(type=AccEvaluator),
37
+ pred_postprocessor=dict(type=multiple_select_postprocess),
38
+ )
39
+
40
+ cmb_datasets.append(
41
+ dict(
42
+ abbr='cmb' if split == 'val' else 'cmb_test',
43
+ type=CMBDataset,
44
+ path='./data/CMB/',
45
+ reader_cfg=cmb_reader_cfg,
46
+ infer_cfg=cmb_infer_cfg,
47
+ eval_cfg=cmb_eval_cfg,
48
+ )
49
+ )
opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_code_gen_c8c3aa.py ADDED
@@ -0,0 +1,181 @@
1
+
2
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
3
+ from opencompass.openicl.icl_retriever import ZeroRetriever
4
+ from opencompass.openicl.icl_inferencer import GenInferencer
5
+ from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, HumanEvalPlusEvaluator, humaneval_postprocess_v2
6
+ from opencompass.datasets import LCDataset, LCPassKEvaluator
7
+ from opencompass.datasets import TACODataset, TACOEvaluator
8
+
9
+ compassbench_v1_3_code_datasets = []
10
+
11
+ # --------------------------------------------------------------- HumanEval CN ---------------------------------------------------------------
12
+ humaneval_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
13
+
14
+ humaneval_infer_cfg = dict(
15
+ prompt_template=dict(
16
+ type=PromptTemplate,
17
+ template=dict(
18
+ round=[
19
+ dict(role='HUMAN', prompt='完成以下Python代码任务:\n{prompt}'),
20
+ ]
21
+ ),
22
+ ),
23
+ retriever=dict(type=ZeroRetriever),
24
+ inferencer=dict(type=GenInferencer, max_out_len=512),
25
+ )
26
+
27
+ humaneval_eval_cfg = dict(
28
+ evaluator=dict(type=HumanEvalEvaluator),
29
+ pred_postprocessor=dict(type=humaneval_postprocess_v2),
30
+ )
31
+
32
+ compassbench_v1_3_code_datasets.append(
33
+ dict(
34
+ abbr='compass_bench_cdoe_completion_zh',
35
+ type=HumanevalDataset,
36
+ path='./data/compassbench_v1_3/coding/compass_bench_cdoe_completion/compass_bench_cdoe_completion_zh.jsonl',
37
+ # local_mode=True,
38
+ reader_cfg=humaneval_reader_cfg,
39
+ infer_cfg=humaneval_infer_cfg,
40
+ eval_cfg=humaneval_eval_cfg,
41
+ )
42
+ )
43
+ # --------------------------------------------------------------- ---------------------------------------------------------------
44
+ humaneval_plus_reader_cfg = dict(input_columns=['prompt'], output_column='task_id', train_split='test')
45
+
46
+ # TODO: allow empty output-column
47
+ humaneval_plus_infer_cfg = dict(
48
+ prompt_template=dict(
49
+ type=PromptTemplate,
50
+ template=dict(
51
+ round=[
52
+ dict(role='HUMAN', prompt='Complete the following python code:\n{prompt}'),
53
+ ]
54
+ ),
55
+ ),
56
+ retriever=dict(type=ZeroRetriever),
57
+ inferencer=dict(type=GenInferencer, max_out_len=512),
58
+ )
59
+
60
+ humaneval_plus_eval_cfg = dict(
61
+ evaluator=dict(type=HumanEvalPlusEvaluator),
62
+ pred_postprocessor=dict(type=humaneval_postprocess_v2),
63
+ )
64
+
65
+ compassbench_v1_3_code_datasets.append(
66
+ dict(
67
+ abbr='compass_bench_cdoe_completion_en',
68
+ type=HumanevalDataset,
69
+ path='./data/compassbench_v1_3/coding/compass_bench_cdoe_completion/compass_bench_cdoe_completion_en.jsonl',
70
+ # local_mode=True,
71
+ reader_cfg=humaneval_plus_reader_cfg,
72
+ infer_cfg=humaneval_plus_infer_cfg,
73
+ eval_cfg=humaneval_plus_eval_cfg,
74
+ )
75
+ )
76
+
77
+ # ------------------------------------- Code Interview(LCBench --------------------------------------
78
+ LC_difficulties_list = ['EASY', 'MEDIUM', 'HARD']
79
+ LC_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column')
80
+
81
+
82
+ LC_en_infer_cfg = dict(
83
+ prompt_template=dict(
84
+ type=PromptTemplate,
85
+ template=dict(
86
+ round=[
87
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
88
+ dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
89
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with ["a","b","c"], we need to push the key one time to type "a", two times to type "b", and three times to type "c" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
90
+ dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
91
+ dict(role='HUMAN', prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"),
92
+ dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
93
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
94
+ dict(role='BOT', prompt='[BEGIN]\n'),
95
+ ],
96
+ ),
97
+ ),
98
+ retriever=dict(type=ZeroRetriever),
99
+ inferencer=dict(type=GenInferencer, max_out_len=512),
100
+ )
101
+
102
+
103
+ LC_cn_infer_cfg = dict(
104
+ prompt_template=dict(
105
+ type=PromptTemplate,
106
+ template=dict(
107
+ round=[
108
+ dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k(1 <= k <= n),你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意,x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
109
+ dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
110
+ dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word,由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 ["a","b","c"],我们需要按一次键来输入 "a",按两次键来输入 "b",按三次键来输入 "c"。\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1,*,# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
111
+ dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
112
+ dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'),
113
+ dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
114
+ dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'),
115
+ dict(role='BOT', prompt='[BEGIN]\n'),
116
+ ],
117
+ ),
118
+ ),
119
+ retriever=dict(type=ZeroRetriever),
120
+ inferencer=dict(type=GenInferencer, max_out_len=512),
121
+ )
122
+
123
+
124
+ LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
125
+
126
+ for difficulty in LC_difficulties_list:
127
+ compassbench_v1_3_code_datasets.append(
128
+ dict(
129
+ type=LCDataset,
130
+ abbr='compass_bench_code_interview_en-' + difficulty,
131
+ path='./data/compassbench_v1_3/coding/compass_bench_code_interview/compass_bench_code_interview_en.jsonl',
132
+ difficulty=difficulty,
133
+ reader_cfg=LC_reader_cfg,
134
+ infer_cfg=LC_en_infer_cfg,
135
+ eval_cfg=LC_eval_cfg,
136
+ )
137
+ )
138
+ compassbench_v1_3_code_datasets.append(
139
+ dict(
140
+ type=LCDataset,
141
+ abbr='compass_bench_code_interview_zh-' + difficulty,
142
+ path='./data/compassbench_v1_3/coding/compass_bench_code_interview/compass_bench_code_interview_zh.jsonl',
143
+ difficulty=difficulty,
144
+ reader_cfg=LC_reader_cfg,
145
+ infer_cfg=LC_cn_infer_cfg,
146
+ eval_cfg=LC_eval_cfg,
147
+ )
148
+ )
149
+
150
+
151
+ # --------------------------------------------Code Competition(TACO) ---------------------------------------------------------------
152
+ TACO_difficulties_list = ['EASY', 'MEDIUM', 'MEDIUM_HARD', 'HARD', 'VERY_HARD']
153
+ TACO_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test')
154
+
155
+ TACO_infer_cfg = dict(
156
+ prompt_template=dict(
157
+ type=PromptTemplate,
158
+ template=dict(
159
+ round=[
160
+ dict(role='HUMAN', prompt='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n'),
161
+ ],
162
+ ),
163
+ ),
164
+ retriever=dict(type=ZeroRetriever),
165
+ inferencer=dict(type=GenInferencer, max_out_len=1024),
166
+ )
167
+
168
+ TACO_eval_cfg = dict(evaluator=dict(type=TACOEvaluator), pred_role='BOT')
169
+
170
+ for difficulty in TACO_difficulties_list:
171
+ compassbench_v1_3_code_datasets.append(
172
+ dict(
173
+ type=TACODataset,
174
+ abbr='TACO-' + difficulty,
175
+ path='./data/compassbench_v1_3/coding/compass_bench_code_competition',
176
+ difficulty=difficulty,
177
+ reader_cfg=TACO_reader_cfg,
178
+ infer_cfg=TACO_infer_cfg,
179
+ eval_cfg=TACO_eval_cfg,
180
+ )
181
+ )
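Editor's note: `LCPassKEvaluator` reports pass@k-style metrics for the interview problems. For reference, the standard unbiased pass@k estimator from the HumanEval paper, which evaluators of this kind are typically built on (shown as background, not as the exact OpenCompass code):

    from math import comb

    def pass_at_k(n: int, c: int, k: int) -> float:
        # Probability that at least one of k samples drawn from n generations
        # (of which c pass the tests) is correct: 1 - C(n-c, k) / C(n, k).
        if n - c < k:
            return 1.0
        return 1.0 - comb(n - c, k) / comb(n, k)

    assert pass_at_k(n=10, c=10, k=1) == 1.0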
opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_knowledge.py ADDED
@@ -0,0 +1,94 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+ from opencompass.datasets.compassbench_obj import (
+     CompassBenchObjectiveV1_3,
+     compassbench_objective_v1_3_postprocess,
+ )
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+
+ prompt_cn = {
+     'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答,其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
+     'cloze_cn': '以下是一道填空题,请你根据你了解的知识一步步思考后把你的最终答案放到\\boxed{}中。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
+ }
+
+ prompt_en = {
+     'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option letter you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
+     'cloze_en': "Here is a fill-in-the-blank question. Please think step by step based on your knowledge and put your final answer in \\boxed{}. Here is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
+ }
+
+ douknow_sets = {
+     'wiki_en_sub_500_人文科学': ['single_choice_en'],
+     'wiki_en_sub_500_社会科学': ['single_choice_en'],
+     'wiki_en_sub_500_生活常识': ['single_choice_en'],
+     'wiki_en_sub_500_自然科学-工科': ['single_choice_en'],
+     'wiki_en_sub_500_自然科学-理科': ['single_choice_en'],
+     'wiki_zh_sub_500_人文科学': ['single_choice_cn'],
+     'wiki_zh_sub_500_社会科学': ['single_choice_cn'],
+     'wiki_zh_sub_500_生活常识': ['single_choice_cn'],
+     'wiki_zh_sub_500_自然科学-工科': ['single_choice_cn'],
+     'wiki_zh_sub_500_自然科学-理科': ['single_choice_cn'],
+ }
+
+ data_path = './data/compassbench_v1_3/knowledge'
+
+ # Set up the prompts
+ CircularEval = True
+
+ compassbench_knowledge_datasets = []
+
+ for _split in list(douknow_sets.keys()):
+     for _name in douknow_sets[_split]:
+         if 'cn' in _name:
+             single_choice_prompts = prompt_cn
+             cloze_prompts = prompt_cn
+         else:
+             single_choice_prompts = prompt_en
+             cloze_prompts = prompt_en
+
+         if 'single_choice' in _name:
+             template_round = [dict(role='HUMAN', prompt=single_choice_prompts[_name])]
+             pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+             evaluator = dict(type=CircularEvaluator if CircularEval else AccEvaluator)
+             dataset_name = _name + '_circular' if CircularEval else _name
+             dataset_abbr = (
+                 'compassbench-' + _split + '_circular'
+                 if CircularEval
+                 else 'compassbench-' + _split
+             )
+         else:
+             template_round = [dict(role='HUMAN', prompt=cloze_prompts[_name])]
+             pred_postprocessor = dict(
+                 type=compassbench_objective_v1_3_postprocess, name=_name
+             )
+             evaluator = dict(type=AccEvaluator)
+             dataset_name = _name
+             dataset_abbr = 'compassbench-' + _split
+
+         douknow_infer_cfg = dict(
+             prompt_template=dict(
+                 type=PromptTemplate, template=dict(round=template_round)
+             ),
+             retriever=dict(type=ZeroRetriever),
+             inferencer=dict(type=GenInferencer, max_out_len=2048),
+         )
+
+         douknow_eval_cfg = dict(
+             evaluator=evaluator,
+             pred_postprocessor=pred_postprocessor,
+         )
+
+         compassbench_knowledge_datasets.append(
+             dict(
+                 type=CompassBenchObjectiveV1_3,
+                 path=f'{data_path}/{_split}.jsonl',
+                 name=dataset_name,
+                 abbr=dataset_abbr,
+                 reader_cfg=dict(input_columns=['question'], output_column='answer'),
+                 infer_cfg=douknow_infer_cfg,
+                 eval_cfg=douknow_eval_cfg,
+             )
+         )
+ del _split, _name
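Editor's note: the single-choice prompts above pin the answer format ("答案选项为X" / "The answer is X") precisely so that `first_option_postprocess` can recover the letter from a chain-of-thought reply. A simplified sketch of that recovery step, assuming the real function is a larger cascade of such patterns:

    import re

    def first_option_sketch(text: str, options: str = 'ABCD') -> str:
        # Prefer explicit answer markers, then fall back to the first bare letter.
        for pattern in (r'答案选项为\s*([' + options + '])',
                        r'[Tt]he answer is\s*\(?([' + options + r'])\)?'):
            m = re.search(pattern, text)
            if m:
                return m.group(1)
        m = re.search('[' + options + ']', text)
        return m.group(0) if m else ''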
opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_math.py ADDED
@@ -0,0 +1,86 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+ from opencompass.datasets.compassbench_obj import CompassBenchObjectiveV1_3
+ from opencompass.datasets import MATHEvaluator, math_postprocess_v2
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+
+ prompt_cn = {
+     'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答,其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
+     'cloze_cn': '以下是一道数学计算题,请你一步一步计算,并在最后用\\boxed{}包裹并返回你计算的最终答案。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
+ }
+
+ prompt_en = {
+     'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option letter you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
+     'cloze_en': 'Here is an arithmetic problem. Please reason step by step, and put your final answer within \\boxed{}. Here is the question you need to answer:\n{question}\nLet\'s solve this problem step by step:',
+ }
+
+
+ douknow_sets = {
+     'arithmetic_cloze_en': ['cloze_en'],
+     'college_single_choice_en': ['single_choice_en'],
+     'college_single_choice_cn': ['single_choice_cn'],
+ }
+
+ data_path = './data/compassbench_v1_3/math'
+
+ # Set up the prompts
+ CircularEval = True
+
+ compassbench_math_datasets = []
+
+ for _split in list(douknow_sets.keys()):
+     for _name in douknow_sets[_split]:
+         if 'cn' in _name:
+             single_choice_prompts = prompt_cn
+             cloze_prompts = prompt_cn
+         else:
+             single_choice_prompts = prompt_en
+             cloze_prompts = prompt_en
+
+         if 'single_choice' in _name:
+             template_round = [dict(role='HUMAN', prompt=single_choice_prompts[_name])]
+             pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
+             evaluator = dict(type=CircularEvaluator if CircularEval else AccEvaluator)
+             dataset_name = _name + '_circular' if CircularEval else _name
+             dataset_abbr = (
+                 'compassbench-' + _split + '_circular'
+                 if CircularEval
+                 else 'compassbench-' + _split
+             )
+         else:
+             template_round = [dict(role='HUMAN', prompt=cloze_prompts[_name])]
+             pred_postprocessor = dict(
+                 type=math_postprocess_v2,
+             )
+             evaluator = dict(type=MATHEvaluator)
+             dataset_name = _name
+             dataset_abbr = 'compassbench-' + _split
+
+         douknow_infer_cfg = dict(
+             prompt_template=dict(
+                 type=PromptTemplate, template=dict(round=template_round)
+             ),
+             retriever=dict(type=ZeroRetriever),
+             inferencer=dict(type=GenInferencer, max_out_len=2048),
+         )
+
+         douknow_eval_cfg = dict(
+             evaluator=evaluator,
+             pred_postprocessor=pred_postprocessor,
+         )
+
+         compassbench_math_datasets.append(
+             dict(
+                 type=CompassBenchObjectiveV1_3,
+                 path=f'{data_path}/{_split}.jsonl',
+                 name=dataset_name,
+                 abbr=dataset_abbr,
+                 reader_cfg=dict(input_columns=['question'], output_column='answer'),
+                 infer_cfg=douknow_infer_cfg,
+                 eval_cfg=douknow_eval_cfg,
+             )
+         )
+ del _split, _name
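Editor's note: the cloze prompts force the final answer into \boxed{}, and `math_postprocess_v2` is then responsible for digging it back out before `MATHEvaluator` compares expressions. A brace-balancing sketch of that extraction (an assumption about its core behavior, not the actual implementation):

    def extract_boxed_sketch(text: str) -> str:
        # Pull the content of the last \boxed{...}, balancing nested braces,
        # e.g. r'... so \boxed{\frac{1}{2}}' -> r'\frac{1}{2}'.
        start = text.rfind('\\boxed{')
        if start == -1:
            return ''
        depth, out = 0, []
        for ch in text[start + len('\\boxed{'):]:
            if ch == '{':
                depth += 1
            elif ch == '}':
                if depth == 0:
                    break
                depth -= 1
            out.append(ch)
        return ''.join(out)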
opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_objective_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .compassbench_v1_3_objective_gen_068af0 import compassbench_aug_datasets  # noqa: F401, F403
opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_objective_gen_068af0.py ADDED
@@ -0,0 +1,74 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
+ from opencompass.datasets.compassbench_obj import CompassBenchObjectiveV1_3, compassbench_objective_v1_3_postprocess
+ from opencompass.utils.text_postprocessors import first_option_postprocess
+
+
+ prompt_cn = {
+     'single_choice_cn': '以下是一道单项选择题,请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答,其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
+     'cloze_cn': '以下是一道填空题,请你根据你了解的知识一步步思考后把你的最终答案放到\\boxed{}中。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
+ }
+
+ prompt_en = {
+     'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option letter you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
+     'cloze_en': "Here is a fill-in-the-blank question. Please think step by step based on your knowledge and put your final answer in \\boxed{}. Here is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
+ }
+
+
+ douknow_sets = {
+     'knowledge': ['single_choice_cn'],
+     'math': ['single_choice_cn'],
+ }
+
+ # Set up the prompts
+ CircularEval = True
+
+
+ compassbench_aug_datasets = []
+
+ for _split in list(douknow_sets.keys()):
+     for _name in douknow_sets[_split]:
+         if 'cn' in _name:
+             single_choice_prompts = prompt_cn
+             cloze_prompts = prompt_cn
+         else:
+             single_choice_prompts = prompt_en
+             cloze_prompts = prompt_en
+         douknow_infer_cfg = dict(
+             ice_template=dict(
+                 type=PromptTemplate,
+                 template=dict(
+                     begin='</E>',
+                     round=[
+                         dict(
+                             role='HUMAN',
+                             prompt=single_choice_prompts[_name],
+                         ),
+                         dict(role='BOT', prompt='{answer}'),
+                     ] if 'choice' in _name else cloze_prompts[_name],
+                 ),
+                 ice_token='</E>',
+             ),
+             retriever=dict(type=ZeroRetriever),
+             inferencer=dict(type=GenInferencer),
+         )
+         douknow_eval_cfg = dict(
+             evaluator=dict(type=CircularEvaluator if CircularEval else AccEvaluator) if 'single_choice' in _name else dict(type=AccEvaluator),
+             pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=compassbench_objective_v1_3_postprocess, name=_name))
+
+         compassbench_aug_datasets.append(
+             dict(
+                 type=CompassBenchObjectiveV1_3,
+                 path=f'./data/compassbench_v1_3/{_split}/{_name}.jsonl',
+                 name='circular_' + _name if CircularEval else _name,
+                 abbr='compassbench-' + _split + '-' + _name + ('circular' if CircularEval else ''),
+                 reader_cfg=dict(
+                     input_columns=['question'],
+                     output_column='answer'
+                 ),
+                 infer_cfg=douknow_infer_cfg,
+                 eval_cfg=douknow_eval_cfg,
+             ))
+
+ del _split, _name
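Editor's note: with `CircularEval = True` every question is effectively asked once per rotation of its options, and `CircularEvaluator` only credits models that stay correct across rotations, which cancels option-position bias. A sketch of the expansion step under that assumption:

    def circular_variants(options: list[str], answer_idx: int):
        # Yield each rotation of the option list with the gold index relocated,
        # e.g. (['a', 'b', 'c', 'd'], 0) also yields (['b', 'c', 'd', 'a'], 3).
        n = len(options)
        for shift in range(n):
            yield options[shift:] + options[:shift], (answer_idx - shift) % n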
opencompass/configs/datasets/compassbench_v1_3/compassbench_v1_3_prompt.py ADDED
@@ -0,0 +1,44 @@
+ FORCE_STOP_PROMPT_EN = (
+     """You should directly give results based on history information."""
+ )
+
+ FEWSHOT_INSTRUCTION = """\
+ You are an assistant who can utilize external tools.
+ {tool_description}
+ To use a tool, please respond with the following format:
+ ```
+ {thought} Think what you need to solve, do you need to use tools?
+ {action} The tool name, should be one of [{action_names}].
+ {action_input} The input to the tool that you want to use.
+ ```
+ The tool will give you a response after your response, using the following format:
+ ```
+ {response} The results after calling the tool.
+ ```
+ Therefore DO NOT generate the tool response by yourself.
+
+ Also please follow the guidelines:
+ 1. Always use the code interpreter to solve the problem.
+ 2. The generated code should always be in a markdown code block format.
+ 3. The generated code will be executed in an IPython manner and the results will be cached.
+ 4. Your code should always be simple and only solve the problem in the current step.
+
+ For example:
+
+ File url: `xxxx`
+ ### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
+
+ {thought} We should use `pandas` to solve this step.
+ {action} IPythonInterpreter
+ {action_input} ```python
+ import pandas as pd
+ url = "xxxx"
+ data = pd.read_csv(url)
+ ```
+ {response} The code succeeded without any outputs.
+
+ Let us begin from here!
+ """
+
+ IPYTHON_INTERPRETER_DESCRIPTION = """\
+ It can run Python code in the manner of a Jupyter notebook. The code must be valid Python code."""
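Editor's note: `FEWSHOT_INSTRUCTION` is a `str.format` template; the `{thought}`/`{action}`/`{action_input}`/`{response}` slots are filled by the agent framework with its own protocol markers before the prompt is sent. A hypothetical instantiation (the marker strings below are illustrative assumptions, not values defined in this repo):

    markers = dict(
        tool_description='You have access to the following tool:\nIPythonInterpreter',
        thought='Thought:',
        action='Action:',
        action_names='IPythonInterpreter',
        action_input='Action Input:',
        response='Response:',
    )
    # FEWSHOT_INSTRUCTION as defined in the file above
    prompt = FEWSHOT_INSTRUCTION.format(**markers)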
opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .hungarian_exam_gen_8a1435 import hungarianmath_datasets  # noqa: F401, F403
opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen_8a1435.py ADDED
@@ -0,0 +1,91 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import HungarianExamMathDataset
+
+ hungarianmath_reader_cfg = dict(input_columns=['question'], output_column=None)
+
+ # A raw string is required here: in a normal literal, escapes such as '\f'
+ # (in '\frac'), '\b' (in '\begin') and '\\' would corrupt the LaTeX.
+ template = r"""Problem:
+ Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.
+
+ Solution:
+ To determine the domain, we must ensure that:
+ 1. The expressions inside each square root are non-negative.
+ 2. The denominator is not equal to zero.
+
+ For the numerator, $x-2 \ge 0$ gives $x \ge 2$.
+
+ For the denominator, $5-x \ge 0$ gives $x \le 5$. And since the denominator cannot be zero, $5-x > 0$ which further narrows it to $x < 5$.
+
+ Combining these results, the domain of the expression is $[2,5)$.
+
+ Final Answer: The final answer is $[2,5)$.
+
+ Problem:
+ If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12$, then find $\det (\mathbf{A} \mathbf{B})$.
+
+ Solution:
+ Using the property of determinants, we can say that:
+ $\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B})$.
+ Plugging in the given values:
+ $\det (\mathbf{A} \mathbf{B}) = 2 \times 12 = 24$.
+
+ Final Answer: The final answer is $24$.
+
+ Problem:
+ Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
+
+ Solution:
+ First, calculate the total weight Terrell lifts with the 20-pound weights:
+ $2 \times 12 \times 20 = 480$ pounds.
+ If he uses 15-pound weights and lifts them $n$ times:
+ $2 \times 15 \times n = 30n$ pounds.
+ To find $n$, set these two equal:
+ \begin{align*}
+ 30n &= 480 \\
+ n &= \frac{480}{30} \\
+ n &= 16
+ \end{align*}
+
+ Final Answer: The final answer is $16$.
+
+ Problem:
+ If the system of equations
+ \begin{align*}
+ 6x-4y &= a, \\
+ 6y-9x &= b.
+ \end{align*}
+ has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\frac{a}{b}$, assuming $b$ is nonzero.
+
+ Solution:
+ Multiply the first equation by $-\frac{3}{2}$ to obtain:
+ $6y-9x = -\frac{3}{2}a$.
+ Since we also know that $6y-9x = b$, equating them gives:
+ $-\frac{3}{2}a = b$ which implies $\frac{a}{b} = -\frac{2}{3}$.
+
+ Final Answer: The final answer is $-\frac{2}{3}$."""
+
+ hungarianmath_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(role='HUMAN', prompt=template+'\n\nProblem:\n{question}\n\nSolution:\n'),
+             ],
+         )),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=1024))
+
+ # Attention: this math dataset needs a human to evaluate the generated answers, so the AccEvaluator is just a placeholder.
+ hungarianmath_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ hungarianmath_datasets = [
+     dict(
+         abbr='HungarianExamMath',
+         type=HungarianExamMathDataset,
+         path='./data/HungarianExamMath/test.csv',
+         reader_cfg=hungarianmath_reader_cfg,
+         infer_cfg=hungarianmath_infer_cfg,
+         eval_cfg=hungarianmath_eval_cfg)
+ ]
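Editor's note: the few-shot template above must be a raw string, because in a normal Python literal sequences like `\f` (in `\frac`) and `\b` (in `\begin`) are escape characters and would silently corrupt the LaTeX. A two-line demonstration:

    s1 = "$\frac{1}{2}$"   # '\f' becomes a form-feed character; the LaTeX is lost
    s2 = r"$\frac{1}{2}$"  # the raw string keeps the backslash intact
    assert '\x0c' in s1 and '\\frac' in s2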
opencompass/configs/datasets/lambada/lambada_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .lambada_gen_217e11 import lambada_datasets  # noqa: F401, F403
opencompass/configs/datasets/lambada/lambada_gen_217e11.py ADDED
@@ -0,0 +1,33 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import lambadaDataset, LambadaEvaluator
+
+ lambada_reader_cfg = dict(
+     input_columns=['prompt'],
+     output_column='label',
+     train_split='test',
+     test_split='test')
+
+ lambada_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(round=[
+             dict(
+                 role='HUMAN',
+                 prompt='Please complete the following sentence:\n{prompt}')
+         ])),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=5))
+
+ lambada_eval_cfg = dict(evaluator=dict(type=LambadaEvaluator))
+
+ lambada_datasets = [
+     dict(
+         abbr='lambada',
+         type=lambadaDataset,
+         path='opencompass/lambada',
+         reader_cfg=lambada_reader_cfg,
+         infer_cfg=lambada_infer_cfg,
+         eval_cfg=lambada_eval_cfg)
+ ]
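Editor's note: LAMBADA is scored on whether the generated continuation begins with the gold final word, which is why `max_out_len=5` suffices. A sketch of the check `LambadaEvaluator` presumably performs (exact tokenization and normalization rules may differ):

    import re

    def lambada_accuracy_sketch(predictions: list[str], references: list[str]) -> float:
        # Count a hit when the first word of the continuation equals the gold word.
        hits = 0
        for pred, ref in zip(predictions, references):
            words = re.findall(r"[\w']+", pred.lower())
            hits += bool(words) and words[0] == ref.lower()
        return 100.0 * hits / max(len(references), 1)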
opencompass/configs/datasets/lambada/lambada_gen_8b48a5.py ADDED
@@ -0,0 +1,29 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import lambadaDataset, LambadaEvaluator
+
+ lambada_reader_cfg = dict(
+     input_columns=['prompt'],
+     output_column='label',
+     train_split='test',
+     test_split='test')
+
+ lambada_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template='Please complete the following sentence: {prompt}'),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=5))
+
+ lambada_eval_cfg = dict(evaluator=dict(type=LambadaEvaluator))
+
+ lambada_datasets = [
+     dict(
+         abbr='lambada',
+         type=lambadaDataset,
+         path='opencompass/lambada',
+         reader_cfg=lambada_reader_cfg,
+         infer_cfg=lambada_infer_cfg,
+         eval_cfg=lambada_eval_cfg)
+ ]
opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py ADDED
@@ -0,0 +1,67 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import LMEvaluator
+ from opencompass.summarizers import AlignmentBenchSummarizer
+ from opencompass.datasets import AlignmentBenchDataset
+
+ subjective_reader_cfg = dict(
+     input_columns=['question', 'capability', 'critiquellm_prefix'],
+     output_column='judge',
+ )
+
+ subjective_all_sets = [
+     'alignment_bench',
+ ]
+ data_path = 'data/subjective/alignment_bench'
+
+ alignment_bench_config_path = 'data/subjective/alignment_bench/config'
+ alignment_bench_config_name = 'multi-dimension'
+
+ alignbench_datasets = []
+
+ for _name in subjective_all_sets:
+     subjective_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt='{question}'
+                 ),
+             ]),
+         ),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=2048),
+     )
+
+     subjective_eval_cfg = dict(
+         evaluator=dict(
+             type=LMEvaluator,
+             prompt_template=dict(
+                 type=PromptTemplate,
+                 template=dict(round=[
+                     dict(
+                         role='HUMAN',
+                         prompt='{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
+                     ),
+                 ]),
+             ),
+         ),
+         pred_role='BOT',
+     )
+
+     alignbench_datasets.append(
+         dict(
+             abbr=f'{_name}',
+             type=AlignmentBenchDataset,
+             path=data_path,
+             name=_name,
+             alignment_bench_config_path=alignment_bench_config_path,
+             alignment_bench_config_name=alignment_bench_config_name,
+             reader_cfg=subjective_reader_cfg,
+             infer_cfg=subjective_infer_cfg,
+             eval_cfg=subjective_eval_cfg,
+             mode='singlescore',
+             summarizer=dict(type=AlignmentBenchSummarizer, judge_type='general')
+         ))
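Editor's note: in this subjective pipeline the model under test only answers `{question}`; scoring happens in a second pass where `LMEvaluator` wraps the prediction in the CritiqueLLM judge prefix and a rating is parsed out of the judge's reply. A tolerant sketch of that parsing step, assuming the judge ends with a marker like "综合得分:8" (the actual AlignBench parser may differ):

    import re
    from typing import Optional

    def parse_judge_score_sketch(judgement: str) -> Optional[float]:
        # Find the last overall-score marker and return its numeric value.
        matches = re.findall(r'综合得分[^\d]{0,5}(\d+(?:\.\d+)?)', judgement)
        return float(matches[-1]) if matches else None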
opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py ADDED
@@ -0,0 +1,66 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import LMEvaluator
+ from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
+
+ subjective_reader_cfg = dict(
+     input_columns=['question', 'capability', 'critiquellm_prefix'],
+     output_column='judge',
+ )
+
+ subjective_all_sets = [
+     'alignment_bench',
+ ]
+ data_path = 'data/subjective/alignment_bench'
+
+ alignment_bench_config_path = 'data/subjective/alignment_bench/config'
+ alignment_bench_config_name = 'multi-dimension'
+
+ alignbench_datasets = []
+
+ for _name in subjective_all_sets:
+     subjective_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt='{question}'
+                 ),
+             ]),
+         ),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=2048),
+     )
+
+     subjective_eval_cfg = dict(
+         evaluator=dict(
+             type=LMEvaluator,
+             prompt_template=dict(
+                 type=PromptTemplate,
+                 template=dict(round=[
+                     dict(
+                         role='HUMAN',
+                         prompt='{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
+                     ),
+                 ]),
+             ),
+             dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
+         ),
+         pred_role='BOT',
+     )
+
+     alignbench_datasets.append(
+         dict(
+             abbr=f'{_name}',
+             type=AlignmentBenchDataset,
+             path=data_path,
+             name=_name,
+             alignment_bench_config_path=alignment_bench_config_path,
+             alignment_bench_config_name=alignment_bench_config_name,
+             reader_cfg=subjective_reader_cfg,
+             infer_cfg=subjective_infer_cfg,
+             eval_cfg=subjective_eval_cfg,
+             mode='singlescore',
+         ))
opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py ADDED
@@ -0,0 +1,67 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import LMEvaluator
+ from opencompass.summarizers import AlignmentBenchSummarizer
+ from opencompass.datasets import AlignmentBenchDataset
+
+ subjective_reader_cfg = dict(
+     input_columns=['question', 'capability', 'critiquellm_prefix'],
+     output_column='judge',
+ )
+
+ subjective_all_sets = [
+     'alignment_bench_v1_1',  # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
+ ]
+ data_path = 'data/subjective/alignment_bench'
+
+ alignment_bench_config_path = 'data/subjective/alignment_bench/config'
+ alignment_bench_config_name = 'multi-dimension'
+
+ alignbench_datasets = []
+
+ for _name in subjective_all_sets:
+     subjective_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt='{question}'
+                 ),
+             ]),
+         ),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=2048),
+     )
+
+     subjective_eval_cfg = dict(
+         evaluator=dict(
+             type=LMEvaluator,
+             prompt_template=dict(
+                 type=PromptTemplate,
+                 template=dict(round=[
+                     dict(
+                         role='HUMAN',
+                         prompt='{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
+                     ),
+                 ]),
+             ),
+         ),
+         pred_role='BOT',
+     )
+
+     alignbench_datasets.append(
+         dict(
+             abbr=f'{_name}',
+             type=AlignmentBenchDataset,
+             path=data_path,
+             name=_name,
+             alignment_bench_config_path=alignment_bench_config_path,
+             alignment_bench_config_name=alignment_bench_config_name,
+             reader_cfg=subjective_reader_cfg,
+             infer_cfg=subjective_infer_cfg,
+             eval_cfg=subjective_eval_cfg,
+             mode='singlescore',
+             summarizer=dict(type=AlignmentBenchSummarizer, judge_type='general')
+         ))
opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py ADDED
@@ -0,0 +1,66 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import LMEvaluator
+ from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
+
+ subjective_reader_cfg = dict(
+     input_columns=['question', 'capability', 'critiquellm_prefix'],
+     output_column='judge',
+ )
+
+ subjective_all_sets = [
+     'alignment_bench_v1_1',  # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
+ ]
+ data_path = 'data/subjective/alignment_bench'
+
+ alignment_bench_config_path = 'data/subjective/alignment_bench/config'
+ alignment_bench_config_name = 'multi-dimension'
+
+ alignbench_datasets = []
+
+ for _name in subjective_all_sets:
+     subjective_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt='{question}'
+                 ),
+             ]),
+         ),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=2048),
+     )
+
+     subjective_eval_cfg = dict(
+         evaluator=dict(
+             type=LMEvaluator,
+             prompt_template=dict(
+                 type=PromptTemplate,
+                 template=dict(round=[
+                     dict(
+                         role='HUMAN',
+                         prompt='{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
+                     ),
+                 ]),
+             ),
+             dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
+         ),
+         pred_role='BOT',
+     )
+
+     alignbench_datasets.append(
+         dict(
+             abbr=f'{_name}',
+             type=AlignmentBenchDataset,
+             path=data_path,
+             name=_name,
+             alignment_bench_config_path=alignment_bench_config_path,
+             alignment_bench_config_name=alignment_bench_config_name,
+             reader_cfg=subjective_reader_cfg,
+             infer_cfg=subjective_infer_cfg,
+             eval_cfg=subjective_eval_cfg,
+             mode='singlescore',
+         ))