tuandunghcmut committed
Commit c5f13f4 · verified · 1 Parent(s): 92c9150

Add files using upload-large-folder tool

Files changed (50 shown; the commit contains more changes than this view displays)
  1. opencompass/configs/datasets/CHARM/README.md +164 -0
  2. opencompass/configs/datasets/CHARM/README_ZH.md +162 -0
  3. opencompass/configs/datasets/CHARM/charm_memory_gen_bbbd53.py +63 -0
  4. opencompass/configs/datasets/CHARM/charm_memory_settings.py +31 -0
  5. opencompass/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py +50 -0
  6. opencompass/configs/datasets/CHARM/charm_reason_gen.py +4 -0
  7. opencompass/configs/datasets/CHARM/charm_reason_gen_f8fca2.py +49 -0
  8. opencompass/configs/datasets/CHARM/charm_reason_ppl_3da4de.py +57 -0
  9. opencompass/configs/datasets/CHARM/charm_reason_settings.py +36 -0
  10. opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py +4 -0
  11. opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py +43 -0
  12. opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_ppl.py +4 -0
  13. opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_ppl_378c5b.py +44 -0
  14. opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_ppl_6507d7.py +50 -0
  15. opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_ppl_7b0c1e.py +34 -0
  16. opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py +4 -0
  17. opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py +51 -0
  18. opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl.py +4 -0
  19. opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl_12e4e0.py +58 -0
  20. opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl_4284a0.py +44 -0
  21. opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl_868415.py +54 -0
  22. opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py +4 -0
  23. opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl_250d00.py +51 -0
  24. opencompass/configs/datasets/LCBench/README.md +66 -0
  25. opencompass/configs/datasets/LCBench/lcbench_gen.py +4 -0
  26. opencompass/configs/datasets/LCBench/lcbench_gen_5ff288.py +107 -0
  27. opencompass/configs/datasets/LCBench/lcbench_levels_gen_bb665f.py +77 -0
  28. opencompass/configs/datasets/LCBench/lcbench_repeat10_gen.py +4 -0
  29. opencompass/configs/datasets/LCBench/lcbench_repeat10_gen_5ff288.py +106 -0
  30. opencompass/configs/datasets/SVAMP/svamp_gen.py +4 -0
  31. opencompass/configs/datasets/SVAMP/svamp_gen_fb25e4.py +36 -0
  32. opencompass/configs/datasets/game24/game24_gen.py +4 -0
  33. opencompass/configs/datasets/game24/game24_gen_52a460.py +34 -0
  34. opencompass/configs/datasets/humanevalx/humanevalx_gen.py +4 -0
  35. opencompass/configs/datasets/humanevalx/humanevalx_gen_0af626.py +60 -0
  36. opencompass/configs/datasets/humanevalx/humanevalx_gen_620cfa.py +41 -0
  37. opencompass/configs/datasets/infinitebench/infinitebench.py +17 -0
  38. opencompass/configs/datasets/lveval/lveval.md +165 -0
  39. opencompass/configs/datasets/lveval/lveval.py +38 -0
  40. opencompass/configs/datasets/math401/math401_gen.py +4 -0
  41. opencompass/configs/datasets/math401/math401_gen_ab5f39.py +47 -0
  42. opencompass/configs/datasets/mbpp/README.md +69 -0
  43. opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py +42 -0
  44. opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_6590b0.py +28 -0
  45. opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_caa7ab.py +42 -0
  46. opencompass/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py +45 -0
  47. opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_1e1056.py +42 -0
  48. opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_cb43ef.py +81 -0
  49. opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_passk_gen_1e1056.py +42 -0
  50. opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_repeat10_gen_1e1056.py +43 -0
opencompass/configs/datasets/CHARM/README.md ADDED
@@ -0,0 +1,164 @@
+ # CHARM✨ Benchmarking Chinese Commonsense Reasoning of LLMs: From Chinese-Specifics to Reasoning-Memorization Correlations [ACL2024]
+ [![arXiv](https://img.shields.io/badge/arXiv-2403.14112-b31b1b.svg)](https://arxiv.org/abs/2403.14112)
+ [![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](./LICENSE)
+ <div align="center">
+
+ 📃[Paper](https://arxiv.org/abs/2403.14112)
+ 🏰[Project Page](https://opendatalab.github.io/CHARM/)
+ 🏆[Leaderboard](https://opendatalab.github.io/CHARM/leaderboard.html)
+ ✨[Findings](https://opendatalab.github.io/CHARM/findings.html)
+
+ </div>
+
+ <div align="center">
+ 📖 <a href="./README_ZH.md">中文</a> | <a href="./README.md">English</a>
+ </div>
+
+ ## Dataset Description
+
+ **CHARM** is the first benchmark for comprehensive and in-depth evaluation of the commonsense reasoning ability of large language models (LLMs) in Chinese, covering both globally known and Chinese-specific commonsense. In addition, CHARM can evaluate LLMs' memorization-independent reasoning abilities and analyze their typical errors.
+
+ ## Comparison of commonsense reasoning benchmarks
+ <html lang="en">
+ <table align="center">
+ <thead class="fixed-header">
+ <tr>
+ <th>Benchmarks</th>
+ <th>CN-Lang</th>
+ <th>CSR</th>
+ <th>CN-specifics</th>
+ <th>Dual-Domain</th>
+ <th>Rea-Mem</th>
+ </tr>
+ </thead>
+ <tr>
+ <td>Most benchmarks in <a href="https://arxiv.org/abs/2302.04752">davis2023benchmarks</a></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ </tr>
+ <tr>
+ <td><a href="https://arxiv.org/abs/1809.05053">XNLI</a>, <a
+ href="https://arxiv.org/abs/2005.00333">XCOPA</a>, <a
+ href="https://arxiv.org/abs/2112.10668">XStoryCloze</a></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ </tr>
+ <tr>
+ <td><a href="https://arxiv.org/abs/2007.08124">LogiQA</a>, <a
+ href="https://arxiv.org/abs/2004.05986">CLUE</a>, <a
+ href="https://arxiv.org/abs/2306.09212">CMMLU</a></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ </tr>
+ <tr>
+ <td><a href="https://arxiv.org/abs/2312.12853">CORECODE</a></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ </tr>
+ <tr>
+ <td><strong><a href="https://arxiv.org/abs/2403.14112">CHARM (ours)</a></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ </tr>
+ </table>
+
+ "CN-Lang" indicates the benchmark is presented in the Chinese language. "CSR" means the benchmark is designed to focus on <strong>C</strong>ommon<strong>S</strong>ense <strong>R</strong>easoning. "CN-specifics" indicates the benchmark includes elements that are unique to Chinese culture, language, regional characteristics, history, etc. "Dual-Domain" indicates the benchmark encompasses both Chinese-specific and global domain tasks, with questions presented in a similar style and format. "Rea-Mem" indicates the benchmark includes closely interconnected <strong>rea</strong>soning and <strong>mem</strong>orization tasks.
+
+
+ ## 🛠️ How to Use
+ Below are the steps for quickly downloading CHARM and using OpenCompass for evaluation.
+
+ ### 1. Download CHARM
+ ```bash
+ git clone https://github.com/opendatalab/CHARM ${path_to_CHARM_repo}
+
+ cd ${path_to_opencompass}
+ mkdir data
+ ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM
+ ```
+ ### 2. Run Inference and Evaluation
+ ```bash
+ cd ${path_to_opencompass}
+
+ # modify config file `configs/eval_charm_rea.py`: uncomment or add models you want to evaluate
+ python run.py configs/eval_charm_rea.py -r --dump-eval-details
+
+ # modify config file `configs/eval_charm_mem.py`: uncomment or add models you want to evaluate
+ python run.py configs/eval_charm_mem.py -r --dump-eval-details
+ ```
+ The inference and evaluation results will be in `${path_to_opencompass}/outputs`, like this:
+ ```bash
+ outputs
+ ├── CHARM_mem
+ │   └── chat
+ │       └── 20240605_151442
+ │           ├── predictions
+ │           │   ├── internlm2-chat-1.8b-turbomind
+ │           │   ├── llama-3-8b-instruct-lmdeploy
+ │           │   └── qwen1.5-1.8b-chat-hf
+ │           ├── results
+ │           │   ├── internlm2-chat-1.8b-turbomind_judged-by--GPT-3.5-turbo-0125
+ │           │   ├── llama-3-8b-instruct-lmdeploy_judged-by--GPT-3.5-turbo-0125
+ │           │   └── qwen1.5-1.8b-chat-hf_judged-by--GPT-3.5-turbo-0125
+ │           └── summary
+ │               └── 20240605_205020  # MEMORY_SUMMARY_DIR
+ │                   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Anachronisms_Judgment
+ │                   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Movie_and_Music_Recommendation
+ │                   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Sport_Understanding
+ │                   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Time_Understanding
+ │                   └── judged-by--GPT-3.5-turbo-0125.csv  # MEMORY_SUMMARY_CSV
+ └── CHARM_rea
+     └── chat
+         └── 20240605_152359
+             ├── predictions
+             │   ├── internlm2-chat-1.8b-turbomind
+             │   ├── llama-3-8b-instruct-lmdeploy
+             │   └── qwen1.5-1.8b-chat-hf
+             ├── results  # REASON_RESULTS_DIR
+             │   ├── internlm2-chat-1.8b-turbomind
+             │   ├── llama-3-8b-instruct-lmdeploy
+             │   └── qwen1.5-1.8b-chat-hf
+             └── summary
+                 ├── summary_20240605_205328.csv  # REASON_SUMMARY_CSV
+                 └── summary_20240605_205328.txt
+ ```
+ ### 3. Generate Analysis Results
+ ```bash
+ cd ${path_to_CHARM_repo}
+
+ # generate Table5, Table6, Table9 and Table10 in https://arxiv.org/abs/2403.14112
+ PYTHONPATH=. python tools/summarize_reasoning.py ${REASON_SUMMARY_CSV}
+
+ # generate Figure3 and Figure9 in https://arxiv.org/abs/2403.14112
+ PYTHONPATH=. python tools/summarize_mem_rea.py ${REASON_SUMMARY_CSV} ${MEMORY_SUMMARY_CSV}
+
+ # generate Table7, Table12, Table13 and Figure11 in https://arxiv.org/abs/2403.14112
+ PYTHONPATH=. python tools/analyze_mem_indep_rea.py data/CHARM ${REASON_RESULTS_DIR} ${MEMORY_SUMMARY_DIR} ${MEMORY_SUMMARY_CSV}
+ ```
+
+ ## 🖊️ Citation
+ ```bibtex
+ @misc{sun2024benchmarking,
+       title={Benchmarking Chinese Commonsense Reasoning of LLMs: From Chinese-Specifics to Reasoning-Memorization Correlations},
+       author={Jiaxing Sun and Weiquan Huang and Jiang Wu and Chenya Gu and Wei Li and Songyang Zhang and Hang Yan and Conghui He},
+       year={2024},
+       eprint={2403.14112},
+       archivePrefix={arXiv},
+       primaryClass={cs.CL}
+ }
+ ```
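Note that the eval configs referenced in step 2 (`configs/eval_charm_rea.py` and `configs/eval_charm_mem.py`) are not part of this commit. A minimal sketch of what such a config might look like, assuming OpenCompass's usual config conventions (the model class name, import paths, and fields are illustrative and vary across OpenCompass versions):

```python
# Hypothetical configs/eval_charm_rea.py; illustrative only, not part of this commit.
from mmengine.config import read_base

from opencompass.models import HuggingFaceCausalLM  # class name assumed; check your version

with read_base():
    # the CHARM reasoning dataset definitions added in this commit
    from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets

datasets = charm_reason_datasets

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen1.5-1.8b-chat-hf',  # matches a model name in the output tree above
        path='Qwen/Qwen1.5-1.8B-Chat',
        max_out_len=512,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    ),
    # uncomment or add further model entries here
]
```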
opencompass/configs/datasets/CHARM/README_ZH.md ADDED
@@ -0,0 +1,162 @@
+ # CHARM✨ Benchmarking Chinese Commonsense Reasoning of LLMs: From Chinese-Specifics to Reasoning-Memorization Correlations [ACL2024]
+ [![arXiv](https://img.shields.io/badge/arXiv-2403.14112-b31b1b.svg)](https://arxiv.org/abs/2403.14112)
+ [![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](./LICENSE)
+ <div align="center">
+
+ 📃[Paper](https://arxiv.org/abs/2403.14112)
+ 🏰[Project Page](https://opendatalab.github.io/CHARM/)
+ 🏆[Leaderboard](https://opendatalab.github.io/CHARM/leaderboard.html)
+ ✨[Findings](https://opendatalab.github.io/CHARM/findings.html)
+ </div>
+
+ <div align="center">
+ 📖 <a href="./README_ZH.md">中文</a> | <a href="./README.md">English</a>
+ </div>
+
+ ## 数据集介绍
+
+ **CHARM** 是首个全面深入评估大型语言模型(LLMs)中文常识推理能力的基准测试,它覆盖了国际普遍认知的常识以及独特的中国文化常识。此外,CHARM 还可以评估 LLMs 独立于记忆的推理能力,并分析其典型错误。
+
+
+ ## 与其他常识推理评测基准的比较
+ <html lang="en">
+ <table align="center">
+ <thead class="fixed-header">
+ <tr>
+ <th>基准</th>
+ <th>汉语</th>
+ <th>常识推理</th>
+ <th>中国特有知识</th>
+ <th>中国和世界知识域</th>
+ <th>推理和记忆的关系</th>
+ </tr>
+ </thead>
+ <tr>
+ <td><a href="https://arxiv.org/abs/2302.04752">davis2023benchmarks</a> 中提到的基准</td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ </tr>
+ <tr>
+ <td><a href="https://arxiv.org/abs/1809.05053">XNLI</a>, <a
+ href="https://arxiv.org/abs/2005.00333">XCOPA</a>, <a
+ href="https://arxiv.org/abs/2112.10668">XStoryCloze</a></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ </tr>
+ <tr>
+ <td><a href="https://arxiv.org/abs/2007.08124">LogiQA</a>, <a
+ href="https://arxiv.org/abs/2004.05986">CLUE</a>, <a
+ href="https://arxiv.org/abs/2306.09212">CMMLU</a></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ </tr>
+ <tr>
+ <td><a href="https://arxiv.org/abs/2312.12853">CORECODE</a></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ <td><strong><span style="color: red;">&#x2718;</span></strong></td>
+ </tr>
+ <tr>
+ <td><strong><a href="https://arxiv.org/abs/2403.14112">CHARM (ours)</a></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ <td><strong><span style="color: green;">&#x2714;</span></strong></td>
+ </tr>
+ </table>
+
+
+ ## 🛠️ 如何使用
+ 以下是快速下载 CHARM 并在 OpenCompass 上进行评估的步骤。
+
+ ### 1. 下载 CHARM
+ ```bash
+ git clone https://github.com/opendatalab/CHARM ${path_to_CHARM_repo}
+
+ cd ${path_to_opencompass}
+ mkdir data
+ ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM
+ ```
+ ### 2. 推理和评测
+ ```bash
+ cd ${path_to_opencompass}
+
+ # 修改配置文件`configs/eval_charm_rea.py`: 将现有的模型取消注释,或者添加你想评测的模型
+ python run.py configs/eval_charm_rea.py -r --dump-eval-details
+
+ # 修改配置文件`configs/eval_charm_mem.py`: 将现有的模型取消注释,或者添加你想评测的模型
+ python run.py configs/eval_charm_mem.py -r --dump-eval-details
+ ```
+ 推理和评测的结果位于路径`${path_to_opencompass}/outputs`,如下所示:
+ ```bash
+ outputs
+ ├── CHARM_mem
+ │   └── chat
+ │       └── 20240605_151442
+ │           ├── predictions
+ │           │   ├── internlm2-chat-1.8b-turbomind
+ │           │   ├── llama-3-8b-instruct-lmdeploy
+ │           │   └── qwen1.5-1.8b-chat-hf
+ │           ├── results
+ │           │   ├── internlm2-chat-1.8b-turbomind_judged-by--GPT-3.5-turbo-0125
+ │           │   ├── llama-3-8b-instruct-lmdeploy_judged-by--GPT-3.5-turbo-0125
+ │           │   └── qwen1.5-1.8b-chat-hf_judged-by--GPT-3.5-turbo-0125
+ │           └── summary
+ │               └── 20240605_205020  # MEMORY_SUMMARY_DIR
+ │                   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Anachronisms_Judgment
+ │                   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Movie_and_Music_Recommendation
+ │                   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Sport_Understanding
+ │                   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Time_Understanding
+ │                   └── judged-by--GPT-3.5-turbo-0125.csv  # MEMORY_SUMMARY_CSV
+ └── CHARM_rea
+     └── chat
+         └── 20240605_152359
+             ├── predictions
+             │   ├── internlm2-chat-1.8b-turbomind
+             │   ├── llama-3-8b-instruct-lmdeploy
+             │   └── qwen1.5-1.8b-chat-hf
+             ├── results  # REASON_RESULTS_DIR
+             │   ├── internlm2-chat-1.8b-turbomind
+             │   ├── llama-3-8b-instruct-lmdeploy
+             │   └── qwen1.5-1.8b-chat-hf
+             └── summary
+                 ├── summary_20240605_205328.csv  # REASON_SUMMARY_CSV
+                 └── summary_20240605_205328.txt
+ ```
+ ### 3. 生成分析结果
+ ```bash
+ cd ${path_to_CHARM_repo}
+
+ # 生成论文中的Table5, Table6, Table9 and Table10,详见https://arxiv.org/abs/2403.14112
+ PYTHONPATH=. python tools/summarize_reasoning.py ${REASON_SUMMARY_CSV}
+
+ # 生成论文中的Figure3 and Figure9,详见https://arxiv.org/abs/2403.14112
+ PYTHONPATH=. python tools/summarize_mem_rea.py ${REASON_SUMMARY_CSV} ${MEMORY_SUMMARY_CSV}
+
+ # 生成论文中的Table7, Table12, Table13 and Figure11,详见https://arxiv.org/abs/2403.14112
+ PYTHONPATH=. python tools/analyze_mem_indep_rea.py data/CHARM ${REASON_RESULTS_DIR} ${MEMORY_SUMMARY_DIR} ${MEMORY_SUMMARY_CSV}
+ ```
+
+ ## 🖊️ 引用
+ ```bibtex
+ @misc{sun2024benchmarking,
+       title={Benchmarking Chinese Commonsense Reasoning of LLMs: From Chinese-Specifics to Reasoning-Memorization Correlations},
+       author={Jiaxing Sun and Weiquan Huang and Jiang Wu and Chenya Gu and Wei Li and Songyang Zhang and Hang Yan and Conghui He},
+       year={2024},
+       eprint={2403.14112},
+       archivePrefix={arXiv},
+       primaryClass={cs.CL}
+ }
+ ```
opencompass/configs/datasets/CHARM/charm_memory_gen_bbbd53.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ from mmengine.config import read_base
+
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import CharmDataset, CharmMemoryEvaluator, LMEvaluator
+
+ with read_base():
+     from .charm_memory_settings import charm_memory_tasks, judge_system_prompts, dataset_path
+
+ charm_memory_datasets = []
+
+ for _task in charm_memory_tasks:
+
+     charm_memory_reader_cfg = dict(input_columns=['input'],
+                                    output_column='target')
+
+     charm_memory_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template=dict(round=[
+                 dict(role='HUMAN', prompt='请尽可能简短地回答下述问题。\n问题:{input}\n答:')
+             ]),
+         ),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=GenInferencer, max_out_len=512),
+     )
+
+     if _task == 'Chinese_Movie_and_Music_Recommendation':
+         charm_memory_eval_cfg = dict(
+             evaluator=dict(type=CharmMemoryEvaluator),
+             pred_role='BOT',
+         )
+     else:
+         judge_system_prompt = judge_system_prompts[_task]
+         charm_memory_eval_cfg = dict(
+             evaluator=dict(
+                 type=LMEvaluator,
+                 prompt_template=dict(
+                     type=PromptTemplate,
+                     template=dict(round=[
+                         dict(
+                             role='HUMAN',
+                             prompt=judge_system_prompt +
+                             "\n\n[Question]\n{input}\n[The Start of Reference Answer]\n{target}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{prediction}\n[The End of Assistant's Answer]"  # noqa
+                         ),
+                     ]),
+                 ),
+             ),
+             pred_role='BOT',
+         )
+
+     charm_memory_datasets.append(
+         dict(
+             type=CharmDataset,
+             path=dataset_path,
+             name=_task,
+             abbr='charm-memory-' + _task,
+             reader_cfg=charm_memory_reader_cfg,
+             infer_cfg=charm_memory_infer_cfg.copy(),
+             eval_cfg=charm_memory_eval_cfg.copy(),
+         ))
opencompass/configs/datasets/CHARM/charm_memory_settings.py ADDED
@@ -0,0 +1,31 @@
+ import os
+
+ charm_memory_tasks = [
+     'Chinese_Anachronisms_Judgment',
+     'Chinese_Movie_and_Music_Recommendation',
+     'Chinese_Sport_Understanding',
+     'Chinese_Time_Understanding',
+ ]
+
+ dataset_path = 'data/CHARM/memorization'
+
+ system_prompt_template = """Please act as an impartial judge, comparing the responses of the AI assistants to the reference answer and determining if the answers are correct.
+ You will receive the reference answer provided by a human and the responses of the AI assistants.
+ Your task is to judge whether the AI assistant's answer is correct.
+ {task_specific_prompt}
+ After providing your explanation, strictly output your final judgment in the following format: “[正确]” if the AI assistant's response is correct, “[错误]” if the AI assistant's response is incorrect.
+ """
+
+ task_specific_prompts = {
+     'Chinese_Anachronisms_Judgment':
+     "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list.",
+     'Chinese_Time_Understanding':
+     "When evaluating the AI assistant's response regarding Chinese solar terms, as long as the AI assistant's response falls within the time frame provided in the reference answer, consider it correct.",
+     'Chinese_Sport_Understanding':
+     "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list."
+ }
+
+ judge_system_prompts = {
+     k: system_prompt_template.format(task_specific_prompt=v)
+     for k, v in task_specific_prompts.items()
+ }
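Because `judge_system_prompts` is produced by plain string formatting, the exact text handed to the judge model can be inspected offline. A small standalone check (the file path is assumed relative to this repository's root):

```python
import importlib.util

# Load charm_memory_settings.py directly by file path (no package install needed).
spec = importlib.util.spec_from_file_location(
    'charm_memory_settings',
    'opencompass/configs/datasets/CHARM/charm_memory_settings.py')
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

# Only three tasks get an LM-judge prompt; Chinese_Movie_and_Music_Recommendation
# is scored by CharmMemoryEvaluator instead (see charm_memory_gen_bbbd53.py above).
print(sorted(mod.judge_system_prompts))
print(mod.judge_system_prompts['Chinese_Anachronisms_Judgment'])
```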
opencompass/configs/datasets/CHARM/charm_reason_cot_only_gen_f7b7d3.py ADDED
@@ -0,0 +1,50 @@
+ import os
+ from mmengine.config import read_base
+
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator
+
+ with read_base():
+     from .charm_reason_settings import charm_tasks, settings
+
+ settings = [s for s in settings if s[0] in ['ZH-CoT', 'EN-CoT']]
+
+ charm_reason_datasets = []
+
+ for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
+     for _task in charm_tasks:
+         _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
+         with open(_fewshot_example_file, 'r') as f:
+             _hint = f.read()
+
+         charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+         charm_reason_infer_cfg = dict(
+             prompt_template=dict(
+                 type=PromptTemplate,
+                 template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
+             ),
+             retriever=dict(type=ZeroRetriever),
+             inferencer=dict(type=GenInferencer, max_out_len=512),
+         )
+
+         charm_reason_eval_cfg = dict(
+             evaluator=dict(type=CharmReasonEvaluator),
+             pred_role='BOT',
+             pred_postprocessor=dict(type=charm_reason_postprocess),
+             dataset_postprocessor=dict(type=charm_reason_postprocess),
+         )
+
+         charm_reason_datasets.append(
+             dict(
+                 type=CharmDataset,
+                 path=dataset_path,
+                 name=_task,
+                 abbr='charm-reason-' + _task + '_' + _cot,
+                 reader_cfg=charm_reason_reader_cfg,
+                 infer_cfg=charm_reason_infer_cfg.copy(),
+                 eval_cfg=charm_reason_eval_cfg.copy(),
+             )
+         )
opencompass/configs/datasets/CHARM/charm_reason_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .charm_reason_gen_f8fca2 import charm_reason_datasets  # noqa: F401, F403
opencompass/configs/datasets/CHARM/charm_reason_gen_f8fca2.py ADDED
@@ -0,0 +1,49 @@
+ import os
+ from mmengine.config import read_base
+
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator
+
+ with read_base():
+     from .charm_reason_settings import charm_tasks, settings
+
+
+ charm_reason_datasets = []
+
+ for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
+     for _task in charm_tasks:
+         _fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
+         with open(_fewshot_example_file, 'r') as f:
+             _hint = f.read()
+
+         charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+         charm_reason_infer_cfg = dict(
+             prompt_template=dict(
+                 type=PromptTemplate,
+                 template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
+             ),
+             retriever=dict(type=ZeroRetriever),
+             inferencer=dict(type=GenInferencer, max_out_len=512),
+         )
+
+         charm_reason_eval_cfg = dict(
+             evaluator=dict(type=CharmReasonEvaluator),
+             pred_role='BOT',
+             pred_postprocessor=dict(type=charm_reason_postprocess),
+             dataset_postprocessor=dict(type=charm_reason_postprocess),
+         )
+
+         charm_reason_datasets.append(
+             dict(
+                 type=CharmDataset,
+                 path=dataset_path,
+                 name=_task,
+                 abbr='charm-reason-' + _task + '_' + _cot,
+                 reader_cfg=charm_reason_reader_cfg,
+                 infer_cfg=charm_reason_infer_cfg.copy(),
+                 eval_cfg=charm_reason_eval_cfg.copy(),
+             )
+         )
opencompass/configs/datasets/CHARM/charm_reason_ppl_3da4de.py ADDED
@@ -0,0 +1,57 @@
+ import os
+
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.datasets import CharmDataset
+ from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+
+ charm_tasks = [
+     ['Chinese_Anachronisms_Judgment', 'AB'],
+     ['Chinese_Movie_and_Music_Recommendation', 'ABCD'],
+     ['Chinese_Natural_Language_Inference', 'ABC'],
+     ['Chinese_Reading_Comprehension', 'ABCD'],
+     ['Chinese_Sequence_Understanding', 'ABCD'],
+     ['Chinese_Sport_Understanding', 'AB'],
+     ['Chinese_Time_Understanding', 'ABCD'],
+     ['Global_Anachronisms_Judgment', 'AB'],
+     ['Global_Movie_and_Music_Recommendation', 'ABCD'],
+     ['Global_Natural_Language_Inference', 'ABC'],
+     ['Global_Reading_Comprehension', 'ABCD'],
+     ['Global_Sequence_Understanding', 'ABCD'],
+     ['Global_Sport_Understanding', 'AB'],
+     ['Global_Time_Understanding', 'ABCDEF'],
+ ]
+
+ charm_reason_datasets = []
+ for task_name, options in charm_tasks:
+
+     with open(os.path.join(os.path.dirname(__file__), 'few-shot-examples', f'{task_name}_Direct.txt'), 'r') as f:
+         few_shot_example = f.read()
+
+     charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+     charm_reason_infer_cfg = dict(
+         prompt_template=dict(
+             type=PromptTemplate,
+             template={
+                 f'({opt})': f'{few_shot_example}\n{{input}}\nA: {opt}' for opt in options
+             },
+         ),
+         retriever=dict(type=ZeroRetriever),
+         inferencer=dict(type=PPLInferencer),
+     )
+
+     charm_reason_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
+
+     charm_reason_datasets.append(
+         dict(
+             type=CharmDataset,
+             abbr=f'charm-reason-{task_name}_Direct',
+             path='data/CHARM/reasoning',
+             name=task_name,
+             reader_cfg=charm_reason_reader_cfg,
+             infer_cfg=charm_reason_infer_cfg,
+             eval_cfg=charm_reason_eval_cfg,
+         )
+     )
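Unlike the generative configs above, this one scores each answer option by perplexity: the template maps every option label to a copy of the prompt that ends in that option, `PPLInferencer` computes a language-model loss for each candidate, and the lowest-loss option becomes the prediction. In spirit (an illustrative sketch using `transformers`, not OpenCompass's actual implementation; `gpt2` is an arbitrary stand-in model):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2').eval()

def mean_nll(text: str) -> float:
    """Average per-token negative log-likelihood of the full text."""
    ids = tok(text, return_tensors='pt').input_ids
    with torch.no_grad():
        return model(ids, labels=ids).loss.item()

# Mirrors the config's template shape: one candidate string per option letter.
question = 'Is table tennis popular in China? Option (A) yes (B) no'  # made-up example
candidates = {f'({opt})': f'{question}\nA: {opt}' for opt in 'AB'}
prediction = min(candidates, key=lambda k: mean_nll(candidates[k]))
print(prediction)  # the option whose completion the model finds most likely
```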
opencompass/configs/datasets/CHARM/charm_reason_settings.py ADDED
@@ -0,0 +1,36 @@
+ import os
+
+ charm_tasks = [
+     'Chinese_Anachronisms_Judgment',
+     'Chinese_Movie_and_Music_Recommendation',
+     'Chinese_Natural_Language_Inference',
+     'Chinese_Reading_Comprehension',
+     'Chinese_Sequence_Understanding',
+     'Chinese_Sport_Understanding',
+     'Chinese_Time_Understanding',
+     'Global_Anachronisms_Judgment',
+     'Global_Movie_and_Music_Recommendation',
+     'Global_Natural_Language_Inference',
+     'Global_Reading_Comprehension',
+     'Global_Sequence_Understanding',
+     'Global_Sport_Understanding',
+     'Global_Time_Understanding',
+ ]
+
+ XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as a commonsense reasoning expert for Chinese. \n Request: {{input}}\n'
+ Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
+ Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ:{{input}}\nA:'
+
+ data_dir = 'data/CHARM'
+ dataset_path_ZH = f'{data_dir}/reasoning'
+ dataset_path_TransEn = f'{data_dir}/reasoning_Translate-EN'
+ fewshot_example_path_ZH = os.path.join(os.path.dirname(__file__), 'few-shot-examples')
+ fewshot_example_path_TransEn = os.path.join(os.path.dirname(__file__), 'few-shot-examples_Translate-EN')
+
+ settings = [
+     ('Direct', '', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
+     ('ZH-CoT', '让我们一步一步来思考。', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
+     ('EN-CoT', "Let's think step by step.", dataset_path_ZH, fewshot_example_path_ZH, Other_template),
+     ('XLT', """You should retell the request in English.\nYou should do the answer step by step to choose the right answer.\nYou should step-by-step answer the request.\nYou should tell me the answer in this format 'So the answer is'.""", dataset_path_ZH, fewshot_example_path_ZH, XLT_template),
+     ('Translate-EN', "Let's think step by step.", dataset_path_TransEn, fewshot_example_path_TransEn, Translate_EN_template),
+ ]
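Each tuple in `settings` is unpacked by the generative configs above as `(_cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template)`, and the final prompt is `prompt_template.format(_hint=_hint) + _cot_prefix`. The double braces matter: `.format()` collapses `{{input}}` to `{input}`, which OpenCompass later fills from the dataset. A quick illustration for the ZH-CoT setting (the hint string is a placeholder, not a real file read):

```python
# Mirrors the prompt assembly in charm_reason_gen_f8fca2.py for the ZH-CoT setting.
Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ:{{input}}\nA:'
cot_prefix = '让我们一步一步来思考。'

hint = '<contents of few-shot-examples/<task>_ZH-CoT.txt>'  # placeholder
prompt = Other_template.format(_hint=hint) + cot_prefix
assert '{input}' in prompt  # the per-example slot survives formatting
print(prompt)
```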
opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .CLUE_afqmc_gen_901306 import afqmc_datasets  # noqa: F401, F403
opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py ADDED
@@ -0,0 +1,43 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import AFQMCDatasetV2
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+ afqmc_reader_cfg = dict(
+     input_columns=['sentence1', 'sentence2'],
+     output_column='label',
+     test_split='train')
+
+ afqmc_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(round=[
+             dict(
+                 role='HUMAN',
+                 prompt=
+                 '语句一:“{sentence1}”\n语句二:“{sentence2}”\n语句一与语句二是关于蚂蚁金融产品的疑问,两者所询问的内容是否完全一致?\nA. 不完全一致\nB. 完全一致\n请从“A”,“B”中进行选择。\n答:',
+             ),
+         ]),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer),
+ )
+
+ afqmc_eval_cfg = dict(
+     evaluator=dict(type=AccEvaluator),
+     pred_role='BOT',
+     pred_postprocessor=dict(type=first_capital_postprocess),
+ )
+
+ afqmc_datasets = [
+     dict(
+         abbr='afqmc-dev',
+         type=AFQMCDatasetV2,
+         path='opencompass/afqmc-dev',
+         reader_cfg=afqmc_reader_cfg,
+         infer_cfg=afqmc_infer_cfg,
+         eval_cfg=afqmc_eval_cfg,
+     ),
+ ]
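The answer-extraction step relies on `first_capital_postprocess`; judging from its name and its use with the `A`/`B` options here, it presumably reduces a free-form generation to its first capital letter. A rough stand-in with that presumed behavior:

```python
import re

def first_capital_postprocess_sketch(text: str) -> str:
    # Presumed behavior of opencompass.utils.text_postprocessors.first_capital_postprocess:
    # return the first ASCII capital letter in the prediction, or '' if there is none.
    match = re.search(r'[A-Z]', text)
    return match.group(0) if match else ''

assert first_capital_postprocess_sketch('B. 完全一致') == 'B'
assert first_capital_postprocess_sketch('答:A 不完全一致') == 'A'
assert first_capital_postprocess_sketch('没有选项') == ''
```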
opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_ppl.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .CLUE_afqmc_ppl_6507d7 import afqmc_datasets  # noqa: F401, F403
opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_ppl_378c5b.py ADDED
@@ -0,0 +1,44 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import HFDataset
+
+ afqmc_reader_cfg = dict(
+     input_columns=['sentence1', 'sentence2'],
+     output_column='label',
+     test_split='train')
+
+ afqmc_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             0:
+             dict(round=[
+                 dict(
+                     role='HUMAN', prompt='“{sentence1}”与“{sentence2}”不同还是相似?'),
+                 dict(role='BOT', prompt='不同。')
+             ]),
+             1:
+             dict(round=[
+                 dict(
+                     role='HUMAN', prompt='“{sentence1}”与“{sentence2}”不同还是相似?'),
+                 dict(role='BOT', prompt='相似')
+             ]),
+         }),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer))
+
+ afqmc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ afqmc_datasets = [
+     dict(
+         type=HFDataset,
+         abbr='afqmc-dev',
+         path='json',
+         data_files='./data/CLUE/AFQMC/dev.json',
+         split='train',
+         reader_cfg=afqmc_reader_cfg,
+         infer_cfg=afqmc_infer_cfg,
+         eval_cfg=afqmc_eval_cfg),
+ ]
opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_ppl_6507d7.py ADDED
@@ -0,0 +1,50 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import HFDataset
+
+ afqmc_reader_cfg = dict(
+     input_columns=['sentence1', 'sentence2'],
+     output_column='label',
+     test_split='train')
+
+ afqmc_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             0:
+             dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=
+                     '语句一:“{sentence1}”\n语句二:“{sentence2}”\n语句一与语句二是关于蚂蚁金融产品的疑问,两者所询问的内容是否完全一致?'
+                 ),
+                 dict(role='BOT', prompt='不完全一致')
+             ]),
+             1:
+             dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=
+                     '语句一:“{sentence1}”\n语句二:“{sentence2}”\n语句一与语句二是关于蚂蚁金融产品的疑问,两者所询问的内容是否完全一致?'
+                 ),
+                 dict(role='BOT', prompt='完全一致')
+             ]),
+         }),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer))
+
+ afqmc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ afqmc_datasets = [
+     dict(
+         type=HFDataset,
+         abbr='afqmc-dev',
+         path='json',
+         data_files='./data/CLUE/AFQMC/dev.json',
+         split='train',
+         reader_cfg=afqmc_reader_cfg,
+         infer_cfg=afqmc_infer_cfg,
+         eval_cfg=afqmc_eval_cfg),
+ ]
opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_ppl_7b0c1e.py ADDED
@@ -0,0 +1,34 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import HFDataset
+
+ afqmc_reader_cfg = dict(
+     input_columns=['sentence1', 'sentence2'],
+     output_column='label',
+     test_split='train')
+
+ afqmc_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             0: '{sentence1},{sentence2}不同。',
+             1: '{sentence1},{sentence2}相似。'
+         }),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer))
+
+ afqmc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ afqmc_datasets = [
+     dict(
+         type=HFDataset,
+         abbr='afqmc-dev',
+         path='json',
+         data_files='./data/CLUE/AFQMC/dev.json',
+         split='train',
+         reader_cfg=afqmc_reader_cfg,
+         infer_cfg=afqmc_infer_cfg,
+         eval_cfg=afqmc_eval_cfg),
+ ]
opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py ADDED
@@ -0,0 +1,51 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import CluewscDatasetV2
+ from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+ cluewsc_reader_cfg = dict(
+     input_columns=['span1', 'span2', 'text', 'new_text'],
+     output_column='label',
+ )
+
+ cluewsc_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(round=[
+             dict(
+                 role='HUMAN',
+                 prompt=
+                 '{text}\n此处,“{span2}”是否指代“{span1}“?\nA. 是\nB. 否\n请从”A“,”B“中进行选择。\n答:',
+             ),
+         ]),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer),
+ )
+
+ cluewsc_eval_cfg = dict(
+     evaluator=dict(type=AccEvaluator),
+     pred_role='BOT',
+     pred_postprocessor=dict(type=first_capital_postprocess),
+ )
+
+ cluewsc_datasets = [
+     dict(
+         abbr='cluewsc-dev',
+         type=CluewscDatasetV2,
+         path='./data/FewCLUE/cluewsc/dev_few_all.json',
+         reader_cfg=cluewsc_reader_cfg,
+         infer_cfg=cluewsc_infer_cfg,
+         eval_cfg=cluewsc_eval_cfg,
+     ),
+     dict(
+         abbr='cluewsc-test',
+         type=CluewscDatasetV2,
+         path='./data/FewCLUE/cluewsc/test_public.json',
+         reader_cfg=cluewsc_reader_cfg,
+         infer_cfg=cluewsc_infer_cfg,
+         eval_cfg=cluewsc_eval_cfg,
+     ),
+ ]
opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets  # noqa: F401, F403
opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl_12e4e0.py ADDED
@@ -0,0 +1,58 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import CluewscDataset
+
+ cluewsc_reader_cfg = dict(
+     input_columns=['span1', 'span2', 'text', 'new_text'],
+     output_column='answer')
+
+ cluewsc_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             0:
+             dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=
+                     "{text}\nHere, is the pronoun \"{span2}\" used to mean \"{span1}\"?"
+                 ),
+                 dict(role='BOT', prompt='No.')
+             ]),
+             1:
+             dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt=
+                     "{text}\nHere, is the pronoun \"{span2}\" used to mean \"{span1}\"?"
+                 ),
+                 dict(role='BOT', prompt='Yes.')
+             ]),
+         }),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer))
+
+ cluewsc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ cluewsc_datasets = [
+     dict(
+         type=CluewscDataset,
+         path='json',
+         abbr='cluewsc-dev',
+         data_files='./data/FewCLUE/cluewsc/dev_few_all.json',
+         split='train',
+         reader_cfg=cluewsc_reader_cfg,
+         infer_cfg=cluewsc_infer_cfg,
+         eval_cfg=cluewsc_eval_cfg),
+     dict(
+         type=CluewscDataset,
+         path='json',
+         abbr='cluewsc-test',
+         data_files='./data/FewCLUE/cluewsc/test_public.json',
+         split='train',
+         reader_cfg=cluewsc_reader_cfg,
+         infer_cfg=cluewsc_infer_cfg,
+         eval_cfg=cluewsc_eval_cfg),
+ ]
opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl_4284a0.py ADDED
@@ -0,0 +1,44 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import CluewscDataset
+
+ cluewsc_reader_cfg = dict(
+     input_columns=['span1', 'span2', 'text', 'new_text'],
+     output_column='answer')
+
+ cluewsc_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             0:
+             "{text}\nHere, is the pronoun \"{span2}\" used to mean \"{span1}\"? No.",
+             1:
+             "{text}\nHere, is the pronoun \"{span2}\" used to mean \"{span1}\"? Yes.",
+         }),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer))
+
+ cluewsc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ cluewsc_datasets = [
+     dict(
+         type=CluewscDataset,
+         path='json',
+         abbr='cluewsc-dev',
+         data_files='./data/FewCLUE/cluewsc/dev_few_all.json',
+         split='train',
+         reader_cfg=cluewsc_reader_cfg,
+         infer_cfg=cluewsc_infer_cfg,
+         eval_cfg=cluewsc_eval_cfg),
+     dict(
+         type=CluewscDataset,
+         path='json',
+         abbr='cluewsc-test',
+         data_files='./data/FewCLUE/cluewsc/test_public.json',
+         split='train',
+         reader_cfg=cluewsc_reader_cfg,
+         infer_cfg=cluewsc_infer_cfg,
+         eval_cfg=cluewsc_eval_cfg),
+ ]
opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl_868415.py ADDED
@@ -0,0 +1,54 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import CluewscDataset
+
+ cluewsc_reader_cfg = dict(
+     input_columns=['span1', 'span2', 'text', 'new_text'],
+     output_column='answer')
+
+ cluewsc_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             0:
+             dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt='{text}\n此处,代词“{span2}“被用于指代“{span1}“吗?'),
+                 dict(role='BOT', prompt='否')
+             ]),
+             1:
+             dict(round=[
+                 dict(
+                     role='HUMAN',
+                     prompt='{text}\n此处,代词“{span2}“被用于指代“{span1}“吗?'),
+                 dict(role='BOT', prompt='是')
+             ]),
+         }),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=PPLInferencer))
+
+ cluewsc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+ cluewsc_datasets = [
+     dict(
+         type=CluewscDataset,
+         path='json',
+         abbr='cluewsc-dev',
+         data_files='./data/FewCLUE/cluewsc/dev_few_all.json',
+         split='train',
+         reader_cfg=cluewsc_reader_cfg,
+         infer_cfg=cluewsc_infer_cfg,
+         eval_cfg=cluewsc_eval_cfg),
+     dict(
+         type=CluewscDataset,
+         path='json',
+         abbr='cluewsc-test',
+         data_files='./data/FewCLUE/cluewsc/test_public.json',
+         split='train',
+         reader_cfg=cluewsc_reader_cfg,
+         infer_cfg=cluewsc_infer_cfg,
+         eval_cfg=cluewsc_eval_cfg),
+ ]
opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .GLUE_QQP_ppl_250d00 import QQP_datasets  # noqa: F401, F403
opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl_250d00.py ADDED
@@ -0,0 +1,51 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import FixKRetriever
+ from opencompass.openicl.icl_inferencer import PPLInferencer
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+ from opencompass.datasets import HFDataset
+
+
+ _hint = 'The following are semantic matching questions. \n' \
+     'Please determine whether the following two sentences are semantically duplicate: ' \
+     '0 means not duplicate, 1 means duplicate.\n'
+ QQP_infer_cfg = dict(
+     ice_template=dict(
+         type=PromptTemplate,
+         template='Sentence one: {question1}\nSentence two: {question2}\nResult: {label}',
+     ),
+     prompt_template=dict(
+         type=PromptTemplate,
+         template={
+             answer:
+             f'{_hint}</E>Sentence one: {{question1}}\nSentence two: {{question2}}\nResult: {answer}'
+             for answer in [0, 1]
+         },
+         ice_token='</E>',
+     ),
+     retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+     inferencer=dict(type=PPLInferencer))
+
+ QQP_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )
+
+
+ QQP_datasets = []
+ for _split in ['validation', 'test']:
+
+     QQP_reader_cfg = dict(
+         input_columns=['question1', 'question2'],
+         output_column='label',
+         train_split='train',
+         test_split=_split
+     )
+
+     QQP_datasets.append(
+         dict(
+             abbr=f'QQP-{_split}',
+             type=HFDataset,
+             path='glue',
+             name='qqp',
+             reader_cfg=QQP_reader_cfg,
+             infer_cfg=QQP_infer_cfg,
+             eval_cfg=QQP_eval_cfg
+         )
+     )
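This is the only few-shot config in this batch: `FixKRetriever` always retrieves train-split rows 0-4, each is rendered with `ice_template`, and the joined in-context-example block replaces the `</E>` marker in `prompt_template`. Conceptually (a sketch of the substitution, not OpenCompass internals; the example rows and the joining separator are invented stand-ins):

```python
# Conceptual assembly of the 5-shot PPL prompt for label 1.
ice_template = 'Sentence one: {question1}\nSentence two: {question2}\nResult: {label}'
train_rows = [
    {'question1': 'How can I learn Python fast?',
     'question2': 'What is the quickest way to learn Python?', 'label': 1},
    {'question1': 'What is QQP?',
     'question2': 'Where is the Eiffel Tower?', 'label': 0},
]  # in a real run these would be rows 0-4 of the GLUE/QQP train split

hint = ('The following are semantic matching questions. \n'
        'Please determine whether the following two sentences are semantically duplicate: '
        '0 means not duplicate, 1 means duplicate.\n')
candidate_template = hint + '</E>Sentence one: {question1}\nSentence two: {question2}\nResult: 1'

ice_block = '\n'.join(ice_template.format(**row) for row in train_rows)
prompt = candidate_template.replace('</E>', ice_block + '\n')
print(prompt)  # PPLInferencer scores one such prompt per label (0 and 1)
```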
opencompass/configs/datasets/LCBench/README.md ADDED
@@ -0,0 +1,66 @@
+ # LCBench2023
+
+ LCBench2023 collects questions from LeetCode weekly contests held between 2022 and 2023. It contains Chinese and English versions, each with 581 questions.
+
+ ## Base Models
+
+ | model | lcbench/pass@1 | en/pass@1 | cn/pass@1 | lcbench/pass | lcbench/timeout | lcbench/failed | lcbench/wrong_answer | en/pass | en/timeout | en/failed | en/wrong_answer | cn/pass | cn/timeout | cn/failed | cn/wrong_answer |
+ |:---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+ | llama-7b-turbomind | 1.30 | 2.61 | 0.00 | 15 | 28 | 843 | 266 | 15 | 14 | 290 | 257 | 0 | 14 | 553 | 9 |
+ | llama-13b-turbomind | 2.09 | 4.17 | 0.00 | 24 | 31 | 823 | 274 | 24 | 16 | 270 | 266 | 0 | 15 | 553 | 8 |
+ | llama-30b-turbomind | 3.48 | 6.78 | 0.17 | 40 | 41 | 780 | 291 | 39 | 25 | 226 | 286 | 1 | 16 | 554 | 5 |
+ | llama-65b-turbomind | 4.00 | 7.83 | 0.17 | 46 | 22 | 755 | 329 | 45 | 10 | 205 | 316 | 1 | 12 | 550 | 13 |
+ | llama-2-7b-turbomind | 0.78 | 1.57 | 0.00 | 9 | 28 | 825 | 290 | 9 | 16 | 274 | 277 | 0 | 12 | 551 | 13 |
+ | llama-2-13b-turbomind | 2.52 | 5.04 | 0.00 | 29 | 29 | 761 | 333 | 29 | 17 | 207 | 323 | 0 | 12 | 554 | 10 |
+ | llama-2-70b-turbomind | 5.04 | 9.57 | 0.52 | 58 | 47 | 684 | 363 | 55 | 28 | 140 | 353 | 3 | 19 | 544 | 10 |
+ | llama-3-8b-turbomind | 16.59 | 16.70 | 16.49 | 191 | 30 | 236 | 695 | 96 | 13 | 119 | 348 | 95 | 17 | 117 | 347 |
+ | llama-3-70b-turbomind | 38.49 | 38.43 | 38.54 | 443 | 2 | 120 | 587 | 221 | 2 | 58 | 295 | 222 | 0 | 62 | 292 |
+ | internlm2-1.8b-turbomind | 4.34 | 5.04 | 3.65 | 50 | 33 | 333 | 736 | 29 | 18 | 177 | 352 | 21 | 15 | 156 | 384 |
+ | internlm2-7b-turbomind | 12.16 | 12.52 | 11.81 | 140 | 41 | 166 | 805 | 72 | 23 | 92 | 389 | 68 | 18 | 74 | 416 |
+ | internlm2-20b-turbomind | 18.46 | 20.96 | 15.97 | 213 | 54 | 134 | 751 | 121 | 24 | 57 | 374 | 92 | 30 | 77 | 377 |
+ | qwen-1.8b-turbomind | 1.82 | 1.91 | 1.74 | 21 | 31 | 449 | 651 | 11 | 17 | 208 | 340 | 10 | 14 | 241 | 311 |
+ | qwen-7b-turbomind | 4.95 | 5.39 | 4.51 | 57 | 37 | 388 | 670 | 31 | 15 | 197 | 333 | 26 | 22 | 191 | 337 |
+ | qwen-14b-turbomind | 8.86 | 9.74 | 7.99 | 102 | 2 | 245 | 803 | 56 | 0 | 120 | 400 | 46 | 2 | 125 | 403 |
+ | qwen-72b-turbomind | 16.86 | 19.48 | 14.24 | 194 | 12 | 229 | 717 | 112 | 4 | 112 | 348 | 82 | 8 | 117 | 369 |
+ | qwen1.5-0.5b-hf | 0.87 | 0.52 | 1.22 | 10 | 29 | 499 | 614 | 3 | 10 | 259 | 304 | 7 | 19 | 240 | 310 |
+ | qwen1.5-1.8b-hf | 2.00 | 2.26 | 1.74 | 23 | 26 | 434 | 669 | 13 | 10 | 220 | 333 | 10 | 16 | 214 | 336 |
+ | qwen1.5-4b-hf | 5.65 | 6.96 | 4.34 | 65 | 37 | 349 | 701 | 40 | 19 | 161 | 356 | 25 | 18 | 188 | 345 |
+ | qwen1.5-7b-hf | 6.69 | 8.00 | 5.38 | 77 | 30 | 283 | 762 | 46 | 12 | 124 | 394 | 31 | 18 | 159 | 368 |
+ | qwen1.5-14b-hf | 12.69 | 13.74 | 11.63 | 146 | 43 | 232 | 731 | 79 | 22 | 122 | 353 | 67 | 21 | 110 | 378 |
+ | qwen1.5-32b-hf | 14.34 | 16.70 | 11.98 | 165 | 45 | 191 | 751 | 96 | 18 | 88 | 374 | 69 | 27 | 103 | 377 |
+ | qwen1.5-72b-hf | 15.29 | 15.65 | 14.93 | 176 | 11 | 242 | 723 | 90 | 7 | 118 | 361 | 86 | 4 | 124 | 362 |
+ | qwen1.5-moe-a2-7b-hf | 9.56 | 10.09 | 9.03 | 110 | 10 | 272 | 760 | 58 | 5 | 129 | 384 | 52 | 5 | 143 | 376 |
+ | mistral-7b-v0.1-hf | 11.38 | 11.83 | 10.94 | 131 | 30 | 221 | 770 | 68 | 11 | 100 | 397 | 63 | 19 | 121 | 373 |
+ | mistral-7b-v0.2-hf | 11.38 | 11.13 | 11.63 | 131 | 2 | 259 | 760 | 64 | 2 | 124 | 386 | 67 | 0 | 135 | 374 |
+ | mixtral-8x7b-v0.1-hf | 21.11 | 21.39 | 20.83 | 243 | 7 | 165 | 737 | 123 | 4 | 76 | 373 | 120 | 3 | 89 | 364 |
+ | mixtral-8x22b-v0.1-hf | 30.97 | 31.22 | 30.73 | 357 | 6 | 131 | 658 | 180 | 3 | 66 | 327 | 177 | 3 | 65 | 331 |
+ | yi-6b-hf | 2.43 | 2.78 | 2.08 | 28 | 7 | 456 | 661 | 16 | 2 | 214 | 344 | 12 | 5 | 242 | 317 |
+ | yi-34b-hf | 8.25 | 8.35 | 8.16 | 95 | 8 | 319 | 730 | 48 | 5 | 163 | 360 | 47 | 3 | 156 | 370 |
+ | deepseek-7b-base-hf | 5.30 | 5.22 | 5.38 | 61 | 7 | 325 | 759 | 30 | 4 | 165 | 377 | 31 | 3 | 160 | 382 |
+ | deepseek-67b-base-hf | 26.50 | 26.96 | 26.04 | 305 | 9 | 202 | 636 | 155 | 4 | 105 | 312 | 150 | 5 | 97 | 324 |
+
+ ## Chat Models
+
+ | model | lcbench/pass@1 | en/pass@1 | cn/pass@1 | lcbench/pass | lcbench/timeout | lcbench/failed | lcbench/wrong_answer | en/pass | en/timeout | en/failed | en/wrong_answer | cn/pass | cn/timeout | cn/failed | cn/wrong_answer |
+ |:---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
+ | qwen1.5-0.5b-chat-hf | 0.00 | 0.00 | 0.00 | 0 | 0 | 1152 | 0 | 0 | 0 | 576 | 0 | 0 | 0 | 576 | 0 |
+ | qwen1.5-1.8b-chat-hf | 1.65 | 1.57 | 1.74 | 19 | 5 | 603 | 525 | 9 | 2 | 298 | 267 | 10 | 3 | 305 | 258 |
+ | qwen1.5-4b-chat-hf | 5.56 | 5.22 | 5.90 | 64 | 17 | 484 | 587 | 30 | 8 | 242 | 296 | 34 | 9 | 242 | 291 |
+ | qwen1.5-7b-chat-hf | 8.78 | 9.57 | 7.99 | 101 | 25 | 333 | 693 | 55 | 12 | 151 | 358 | 46 | 13 | 182 | 335 |
+ | qwen1.5-14b-chat-hf | 14.42 | 16.52 | 12.33 | 166 | 18 | 222 | 746 | 95 | 10 | 110 | 361 | 71 | 8 | 112 | 385 |
+ | qwen1.5-32b-chat-hf | 10.78 | 13.04 | 8.51 | 124 | 15 | 516 | 497 | 75 | 10 | 195 | 296 | 49 | 5 | 321 | 201 |
+ | qwen1.5-72b-chat-hf | 18.77 | 18.78 | 18.75 | 216 | 23 | 164 | 749 | 108 | 12 | 89 | 367 | 108 | 11 | 75 | 382 |
+ | qwen1.5-110b-chat-hf | 34.58 | 34.43 | 34.72 | 399 | 20 | 176 | 557 | 199 | 12 | 85 | 280 | 200 | 8 | 91 | 277 |
+ | internlm2-chat-1.8b-hf | 4.52 | 5.04 | 3.99 | 52 | 10 | 364 | 726 | 29 | 4 | 172 | 371 | 23 | 6 | 192 | 355 |
+ | internlm2-chat-1.8b-sft-hf | 3.56 | 3.83 | 3.30 | 41 | 12 | 403 | 696 | 22 | 6 | 211 | 337 | 19 | 6 | 192 | 359 |
+ | internlm2-chat-7b-hf | 14.60 | 13.74 | 15.45 | 168 | 12 | 238 | 734 | 79 | 7 | 142 | 348 | 89 | 5 | 96 | 386 |
+ | internlm2-chat-7b-sft-hf | 14.34 | 14.61 | 14.06 | 165 | 9 | 275 | 703 | 84 | 3 | 174 | 315 | 81 | 6 | 101 | 388 |
+ | internlm2-chat-20b-hf | 19.64 | 20.00 | 19.27 | 226 | 11 | 191 | 724 | 115 | 7 | 83 | 371 | 111 | 4 | 108 | 353 |
+ | internlm2-chat-20b-sft-hf | 20.55 | 19.91 | 21.18 | 237 | 11 | 195 | 709 | 115 | 6 | 94 | 361 | 122 | 5 | 101 | 348 |
+ | llama-3-8b-instruct-hf | 28.50 | 29.04 | 27.95 | 328 | 17 | 95 | 712 | 167 | 7 | 44 | 358 | 161 | 10 | 51 | 354 |
+ | llama-3-70b-instruct-hf | 45.44 | 46.09 | 44.79 | 523 | 8 | 52 | 569 | 265 | 2 | 25 | 284 | 258 | 6 | 27 | 285 |
+ | llama-3-8b-instruct-lmdeploy | 29.02 | 29.39 | 28.65 | 334 | 19 | 94 | 705 | 169 | 11 | 42 | 354 | 165 | 8 | 52 | 351 |
+ | llama-3-70b-instruct-lmdeploy | 44.66 | 46.78 | 42.53 | 514 | 11 | 44 | 583 | 269 | 5 | 19 | 283 | 245 | 6 | 25 | 300 |
+ | mistral-7b-instruct-v0.1-hf | 9.82 | 10.78 | 8.85 | 113 | 17 | 316 | 706 | 62 | 9 | 152 | 353 | 51 | 8 | 164 | 353 |
+ | mistral-7b-instruct-v0.2-hf | 7.90 | 6.26 | 9.55 | 91 | 8 | 572 | 481 | 36 | 4 | 345 | 191 | 55 | 4 | 227 | 290 |
+ | mixtral-8x7b-instruct-v0.1-hf | 16.29 | 15.91 | 16.67 | 188 | 13 | 370 | 581 | 92 | 6 | 241 | 237 | 96 | 7 | 129 | 344 |
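As a sanity check on these tables, the four outcome counts in every row partition the same set of runs: 576 per language, 1152 combined (the intro's figure of 581 questions per language suggests a handful are excluded from these runs). For example, taking llama-3-70b-turbomind from the base-model table:

```python
# Row-level consistency check for the LCBench tables (llama-3-70b-turbomind).
row = {
    'lcbench': dict(passed=443, timeout=2, failed=120, wrong_answer=587),
    'en': dict(passed=221, timeout=2, failed=58, wrong_answer=295),
    'cn': dict(passed=222, timeout=0, failed=62, wrong_answer=292),
}
assert sum(row['lcbench'].values()) == 1152
assert sum(row['en'].values()) == 576 and sum(row['cn'].values()) == 576
for outcome in ('passed', 'timeout', 'failed', 'wrong_answer'):
    # English and Chinese outcomes add up to the combined lcbench column
    assert row['en'][outcome] + row['cn'][outcome] == row['lcbench'][outcome]
```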
opencompass/configs/datasets/LCBench/lcbench_gen.py ADDED
@@ -0,0 +1,4 @@
+ from mmengine.config import read_base
+
+ with read_base():
+     from .lcbench_gen_5ff288 import LCBench_datasets  # noqa: F401, F403
opencompass/configs/datasets/LCBench/lcbench_gen_5ff288.py ADDED
@@ -0,0 +1,107 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import LCDataset, LCPassKEvaluator
+
+ LC_reader_cfg = dict(
+     input_columns=['text', 'test_list'], output_column='test_column')
+
+
+ LC_en_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(
+                     role='HUMAN',
+                     prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
+                 ),
+                 dict(
+                     role='BOT',
+                     prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n    x, y = min(x, y), max(x, y)\n    A = [0] * n\n    for i in range(1, n + 1):\n        A[0] += 2 \n        A[min(i - 1, abs(i - y) + x)] -= 1 \n        A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n        A[min(abs(i - x), abs(y - i) + 1)] += 1 \n        A[min(abs(i - x) + 1, abs(y - i))] += 1 \n        r = max(x - i, 0) + max(i - y, 0)\n        A[r + (y - x + 0) // 2] -= 1 \n        A[r + (y - x + 1) // 2] -= 1 \n    return list(accumulate(A))' \n[DONE] \n\n "
+                 ),
+                 dict(
+                     role='HUMAN',
+                     prompt="You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with [\"a\",\"b\",\"c\"], we need to push the key one time to type \"a\", two times to type \"b\", and three times to type \"c\" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
+                 ),
+                 dict(
+                     role='BOT',
+                     prompt="[BEGIN]\n 'def minimumPushes(word):\n    letter_counts = {}\n    for c in word:\n        letter_counts[c] = letter_counts.get(c, 0) + 1\n    counts = list(letter_counts.values())\n    counts.sort(reverse=True)\n    ans, row = 0, 1\n    for i in range(len(counts)):\n        if i > 7 and i % 8 == 0:\n            row += 1\n        ans += row * counts[i]\n    return ans' \n[DONE] \n\n "
+                 ),
+                 dict(
+                     role='HUMAN',
+                     prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"
+                 ),
+                 dict(
+                     role='BOT',
+                     prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n    def check(v):\n        A = list(map(int, bin(v)[2:]))\n        n = len(A)\n        res = p = 0\n        for i,v in enumerate(A):\n            if v == 1:\n                l = n - i - 1\n                res += (p << l) + ((l // x) << (l - 1) if l else 0)\n            if (n - i) % x == 0:\n                p += v\n        return res + p\n    l, r = 1, 10 ** 15\n    while l < r:\n        mid = (l + r + 1) // 2\n        if check(mid) <= k:\n            l = mid\n        else:\n            r = mid - 1\n    return l' \n[DONE] \n\n "
+                 ),
+                 dict(
+                     role='HUMAN',
+                     prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'
+                 ),
+                 dict(role='BOT', prompt='[BEGIN]\n'),
+             ], )),
+     retriever=dict(type=ZeroRetriever),
46
+ inferencer=dict(type=GenInferencer, max_out_len=512))
47
+
48
+
49
+ LC_cn_infer_cfg = dict(
50
+ prompt_template=dict(
51
+ type=PromptTemplate,
52
+ template=dict(
53
+ round=[
54
+ dict(
55
+ role='HUMAN',
56
+ prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k(1 <= k <= n),你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意,x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
57
+ ),
58
+ dict(
59
+ role='BOT',
60
+ prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
61
+ ),
62
+ dict(
63
+ role='HUMAN',
64
+ prompt="你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word,由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 [\"a\",\"b\",\"c\"],我们需要按一次键来输入 \"a\",按两次键来输入 \"b\",按三次键来输入 \"c\"。\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1,*,# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
65
+ ),
66
+ dict(
67
+ role='BOT',
68
+ prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
69
+ ),
70
+ dict(
71
+ role='HUMAN',
72
+ prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'
73
+ ),
74
+ dict(
75
+ role='BOT',
76
+ prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
77
+ ),
78
+ dict(
79
+ role='HUMAN',
80
+ prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'
81
+ ),
82
+ dict(role='BOT', prompt='[BEGIN]\n'),
83
+ ], )),
84
+ retriever=dict(type=ZeroRetriever),
85
+ inferencer=dict(type=GenInferencer, max_out_len=512))
86
+
87
+
88
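+ # Pass@k scoring over the generated samples; predictions are read from the model's BOT turns.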
+ LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
89
+
90
+ LCBench_datasets = [
91
+ dict(
92
+ type=LCDataset,
93
+ abbr='lcbench_en',
94
+ path='./data/LCBench2023/LCBench2023.jsonl',
95
+ num_repeats=1,
96
+ reader_cfg=LC_reader_cfg,
97
+ infer_cfg=LC_en_infer_cfg,
98
+ eval_cfg=LC_eval_cfg),
99
+ dict(
100
+ type=LCDataset,
101
+ abbr='lcbench_cn',
102
+ path='./data/LCBench2023/LCBench2023_cn.jsonl',
103
+ num_repeats=1,
104
+ reader_cfg=LC_reader_cfg,
105
+ infer_cfg=LC_cn_infer_cfg,
106
+ eval_cfg=LC_eval_cfg)
107
+ ]
opencompass/configs/datasets/LCBench/lcbench_levels_gen_bb665f.py ADDED
@@ -0,0 +1,77 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import LCDataset, LCPassKEvaluator
5
+
6
+ LC_difficulties_list = ['EASY', 'MEDIUM', 'HARD']
7
+ LC_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column')
8
+
9
+
10
+ LC_en_infer_cfg = dict(
11
+ prompt_template=dict(
12
+ type=PromptTemplate,
13
+ template=dict(
14
+ round=[
15
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
16
+ dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
17
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with ["a","b","c"], we need to push the key one time to type "a", two times to type "b", and three times to type "c" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
18
+ dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
19
+ dict(role='HUMAN', prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"),
20
+ dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
21
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
22
+ dict(role='BOT', prompt='[BEGIN]\n'),
23
+ ],
24
+ ),
25
+ ),
26
+ retriever=dict(type=ZeroRetriever),
27
+ inferencer=dict(type=GenInferencer, max_out_len=512),
28
+ )
29
+
30
+
31
+ LC_cn_infer_cfg = dict(
32
+ prompt_template=dict(
33
+ type=PromptTemplate,
34
+ template=dict(
35
+ round=[
36
+ dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k(1 <= k <= n),你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意,x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'),
37
+ dict(role='BOT', prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "),
38
+ dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word,由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 ["a","b","c"],我们需要按一次键来输入 "a",按两次键来输入 "b",按三次键来输入 "c"。\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1,*,# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes("abcde") == 5 \n assert minimumPushes("xyzxyzxyzxyz") == 12 \n assert minimumPushes("aabbccddeeffgghhiiiiii") == 24 \n'),
39
+ dict(role='BOT', prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "),
40
+ dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'),
41
+ dict(role='BOT', prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "),
42
+ dict(role='HUMAN', prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'),
43
+ dict(role='BOT', prompt='[BEGIN]\n'),
44
+ ],
45
+ ),
46
+ ),
47
+ retriever=dict(type=ZeroRetriever),
48
+ inferencer=dict(type=GenInferencer, max_out_len=512),
49
+ )
50
+
51
+
52
+ LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
53
+
54
+ LCBench_datasets = []
55
+ for difficulty in LC_difficulties_list:
56
+ LCBench_datasets.append(
57
+ dict(
58
+ type=LCDataset,
59
+ abbr='lcbench_en-' + difficulty,
60
+ path='data/LCBench2023/LCBench2023.jsonl',
61
+ difficulty=difficulty,
62
+ reader_cfg=LC_reader_cfg,
63
+ infer_cfg=LC_en_infer_cfg,
64
+ eval_cfg=LC_eval_cfg,
65
+ )
66
+ )
67
+ LCBench_datasets.append(
68
+ dict(
69
+ type=LCDataset,
70
+ abbr='lcbench_cn-' + difficulty,
71
+ path='data/LCBench2023/LCBench2023_cn.jsonl',
72
+ difficulty=difficulty,
73
+ reader_cfg=LC_reader_cfg,
74
+ infer_cfg=LC_cn_infer_cfg,
75
+ eval_cfg=LC_eval_cfg,
76
+ )
77
+ )
opencompass/configs/datasets/LCBench/lcbench_repeat10_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .lcbench_repeat10_gen_5ff288 import LCBench_repeat10_datasets # noqa: F401, F403
opencompass/configs/datasets/LCBench/lcbench_repeat10_gen_5ff288.py ADDED
@@ -0,0 +1,106 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import LCDataset, LCPassKEvaluator
5
+
6
+ LC_reader_cfg = dict(
7
+ input_columns=['text', 'test_list'], output_column='test_column')
8
+
9
+
10
+ LC_en_infer_cfg = dict(
11
+ prompt_template=dict(
12
+ type=PromptTemplate,
13
+ template=dict(
14
+ round=[
15
+ dict(
16
+ role='HUMAN',
17
+ prompt='You are an expert Python programmer, and here is your task: You are given three positive integers n, x, and y.\nIn a city, there exist houses numbered 1 to n connected by n streets. There is a street connecting the house numbered i with the house numbered i + 1 for all 1 <= i <= n - 1 . An additional street connects the house numbered x with the house numbered y.\nFor each k, such that 1 <= k <= n, you need to find the number of pairs of houses (house1, house2) such that the minimum number of streets that need to be traveled to reach house2 from house1 is k.\nReturn a 1-indexed array result of length n where result[k] represents the total number of pairs of houses such that the minimum streets required to reach one house from the other is k.\nNote that x and y can be equal. Your code should pass these tests:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
18
+ ),
19
+ dict(
20
+ role='BOT',
21
+ prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
22
+ ),
23
+ dict(
24
+ role='HUMAN',
25
+ prompt="You are an expert Python programmer, and here is your task: You are given a string word containing lowercase English letters.\nTelephone keypads have keys mapped with distinct collections of lowercase English letters, which can be used to form words by pushing them. For example, the key 2 is mapped with [\"a\",\"b\",\"c\"], we need to push the key one time to type \"a\", two times to type \"b\", and three times to type \"c\" .\nIt is allowed to remap the keys numbered 2 to 9 to distinct collections of letters. The keys can be remapped to any amount of letters, but each letter must be mapped to exactly one key. You need to find the minimum number of times the keys will be pushed to type the string word.\nReturn the minimum number of pushes needed to type word after remapping the keys.\nAn example mapping of letters to keys on a telephone keypad is given below. Note that 1, *, #, and 0 do not map to any letters. Your code should pass these tests:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
26
+ ),
27
+ dict(
28
+ role='BOT',
29
+ prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
30
+ ),
31
+ dict(
32
+ role='HUMAN',
33
+ prompt="You are an expert Python programmer, and here is your task: You are given an integer k and an integer x.\nConsider s is the 1-indexed binary representation of an integer num. The price of a number num is the number of i's such that i % x == 0 and s[i] is a set bit.\nReturn the greatest integer num such that the sum of prices of all numbers from 1 to num is less than or equal to k.\nNote:\nIn the binary representation of a number set bit is a bit of value 1.\nThe binary representation of a number will be indexed from right to left. For example, if s == 11100, s[4] == 1 and s[2] == 0. Your code should pass these tests:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n"
34
+ ),
35
+ dict(
36
+ role='BOT',
37
+ prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
38
+ ),
39
+ dict(
40
+ role='HUMAN',
41
+ prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'
42
+ ),
43
+ dict(role='BOT', prompt='[BEGIN]\n'),
44
+ ], )),
45
+ retriever=dict(type=ZeroRetriever),
46
+ inferencer=dict(type=GenInferencer, max_out_len=512))
47
+
48
+
49
+ LC_cn_infer_cfg = dict(
50
+ prompt_template=dict(
51
+ type=PromptTemplate,
52
+ template=dict(
53
+ round=[
54
+ dict(
55
+ role='HUMAN',
56
+ prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你三个 正整数 n 、x 和 y 。\n在城市中,存在编号从 1 到 n 的房屋,由 n 条街道相连。对所有 1 <= i < n ,都存在一条街道连接编号为 i 的房屋与编号为 i + 1 的房屋。另存在一条街道连接编号为 x 的房屋与编号为 y 的房屋。\n对于每个 k(1 <= k <= n),你需要找出所有满足要求的 房屋对 [house1, house2] ,即从 house1 到 house2 需要经过的 最少 街道数为 k 。\n返回一个下标从 1 开始且长度为 n 的数组 result ,其中 result[k] 表示所有满足要求的房屋对的数量,即从一个房屋到另一个房屋需要经过的 最少 街道数为 k 。\n注意,x 与 y 可以 相等。你的代码需要通过以下测试:\n\n assert countOfPairs(n = 3, x = 1, y = 3) == [6,0,0]\n assert countOfPairs(n = 5, x = 2, y = 4) == [10,8,2,0,0] \n assert countOfPairs(n = 4, x = 1, y = 1) == [6,4,2,0] \n'
57
+ ),
58
+ dict(
59
+ role='BOT',
60
+ prompt="[BEGIN]\n 'from itertools import accumulate\ndef countOfPairs(n, x, y):\n x, y = min(x, y), max(x, y)\n A = [0] * n\n for i in range(1, n + 1):\n A[0] += 2 \n A[min(i - 1, abs(i - y) + x)] -= 1 \n A[min(n - i, abs(i - x) + 1 + n - y)] -= 1 \n A[min(abs(i - x), abs(y - i) + 1)] += 1 \n A[min(abs(i - x) + 1, abs(y - i))] += 1 \n r = max(x - i, 0) + max(i - y, 0)\n A[r + (y - x + 0) // 2] -= 1 \n A[r + (y - x + 1) // 2] -= 1 \n return list(accumulate(A))' \n[DONE] \n\n "
61
+ ),
62
+ dict(
63
+ role='HUMAN',
64
+ prompt="你是一名专业的 Python 程序员,下面是你的任务: 给你一个字符串 word,由 不同 小写英文字母组成。\n电话键盘上的按键与 不同 小写英文字母集合相映射,可以通过按压按键来组成单词。例如,按键 2 对应 [\"a\",\"b\",\"c\"],我们需要按一次键来输入 \"a\",按两次键来输入 \"b\",按三次键来输入 \"c\"。\n现在允许你将编号为 2 到 9 的按键重新映射到 不同 字母集合。每个按键可以映射到 任意数量 的字母,但每个字母 必须 恰好 映射到 一个 按键上。你需要找到输入字符串 word 所需的 最少 按键次数。\n返回重新映射按键后输入 word 所需的 最少 按键次数。\n下面给出了一种电话键盘上字母到按键的映射作为示例。注意 1,*,# 和 0 不 对应任何字母。你的代码需要通过以下测试:\n\n assert minimumPushes(\"abcde\") == 5 \n assert minimumPushes(\"xyzxyzxyzxyz\") == 12 \n assert minimumPushes(\"aabbccddeeffgghhiiiiii\") == 24 \n"
65
+ ),
66
+ dict(
67
+ role='BOT',
68
+ prompt="[BEGIN]\n 'def minimumPushes(word):\n letter_counts = {}\n for c in word:\n letter_counts[c] = letter_counts.get(c, 0) + 1\n counts = list(letter_counts.values())\n counts.sort(reverse=True)\n ans, row = 0, 1\n for i in range(len(counts)):\n if i > 7 and i % 8 == 0:\n row += 1\n ans += row * counts[i]\n return ans' \n[DONE] \n\n "
69
+ ),
70
+ dict(
71
+ role='HUMAN',
72
+ prompt='你是一名专业的 Python 程序员,下面是你的任务: 给你一个整数 k 和一个整数 x 。\n令 s 为整数 num 的下标从 1 开始的二进制表示。我们说一个整数 num 的 价值 是满足 i % x == 0 且 s[i] 是 设置位 的 i 的数目。\n请你返回 最大 整数 num ,满足从 1 到 num 的所有整数的 价值 和小于等于 k 。\n注意:\n一个整数二进制表示下 设置位 是值为 1 的数位。\n一个整数的二进制表示下标从右到左编号,比方说如果 s == 11100 ,那么 s[4] == 1 且 s[2] == 0。你的代码需要通过以下测试:\n\n assert findMaximumNumber(k = 9, x = 1) == 6 \n assert findMaximumNumber(k = 7, x = 2) == 9 \n'
73
+ ),
74
+ dict(
75
+ role='BOT',
76
+ prompt="[BEGIN]\n 'def findMaximumNumber(k, x):\n def check(v):\n A = list(map(int, bin(v)[2:]))\n n = len(A)\n res = p = 0\n for i,v in enumerate(A):\n if v == 1:\n l = n - i - 1\n res += (p << l) + ((l // x) << (l - 1) if l else 0)\n if (n - i) % x == 0:\n p += v\n return res + p\n l, r = 1, 10 ** 15\n while l < r:\n mid = (l + r + 1) // 2\n if check(mid) <= k:\n l = mid\n else:\n r = mid - 1\n return l' \n[DONE] \n\n "
77
+ ),
78
+ dict(
79
+ role='HUMAN',
80
+ prompt='你是一名专业的 Python 程序员,下面是你的任务: {text} 你的代码需要通过以下测试:\n\n {test_list} \n'
81
+ ),
82
+ dict(role='BOT', prompt='[BEGIN]\n'),
83
+ ], )),
84
+ retriever=dict(type=ZeroRetriever),
85
+ inferencer=dict(type=GenInferencer, max_out_len=512))
86
+
87
+ LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
88
+
89
+ LCBench_repeat10_datasets = [
90
+ dict(
91
+ type=LCDataset,
92
+ abbr='lcbench_en_repeat10',
93
+ path='./data/LCBench2023/LCBench2023.jsonl',
94
+ num_repeats=10,
95
+ reader_cfg=LC_reader_cfg,
96
+ infer_cfg=LC_en_infer_cfg,
97
+ eval_cfg=LC_eval_cfg),
98
+ dict(
99
+ type=LCDataset,
100
+ abbr='lcbench_cn_repeat10',
101
+ path='./data/LCBench2023/LCBench2023_cn.jsonl',
102
+ num_repeats=10,
103
+ reader_cfg=LC_reader_cfg,
104
+ infer_cfg=LC_cn_infer_cfg,
105
+ eval_cfg=LC_eval_cfg)
106
+ ]
opencompass/configs/datasets/SVAMP/svamp_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .svamp_gen_fb25e4 import svamp_datasets # noqa: F401, F403
opencompass/configs/datasets/SVAMP/svamp_gen_fb25e4.py ADDED
@@ -0,0 +1,36 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import SVAMPDataset, gsm8k_postprocess, Gsm8kEvaluator
5
+
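+ # Four-shot chain-of-thought prompt; gsm8k_postprocess extracts the final number from the generation for exact-match scoring.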
6
+ svamp_infer_cfg = dict(
7
+ prompt_template=dict(
8
+ type=PromptTemplate,
9
+ template=dict(
10
+ round=[
11
+ dict(role='HUMAN', prompt="Question: There are 87 oranges and 290 bananas in Philip's collection. If the bananas are organized into 2 groups and oranges are organized into 93 groups How big is each group of bananas?\nLet's think step by step\nAnswer:"),
12
+ dict(role='BOT', prompt='To find the size of each group of bananas, we divide the total number of bananas (290) by the number of groups (2): 290 / 2 = 145. Therefore, each group of bananas contains 145 bananas. The answer is 145.\n'),
13
+ dict(role='HUMAN', prompt="Question: Marco and his dad went strawberry picking. Marco's dad's strawberries weighed 11 pounds. If together their strawberries weighed 30 pounds. How much did Marco's strawberries weigh?\nLet's think step by step\nAnswer:"),
14
+ dict(role='BOT', prompt="To find Marco's strawberries' weight, we subtract his dad's strawberries' weight (11 pounds) from the total weight of their strawberries (30 pounds): 30 - 11 = 19. Therefore, Marco's strawberries weighed 19 pounds. The answer is 19.\n"),
15
+ dict(role='HUMAN', prompt="Question: Edward spent $ 6 to buy 2 books each book costing him the same amount of money. Now he has $ 12. How much did each book cost?\nLet's think step by step\nAnswer:"),
16
+ dict(role='BOT', prompt='To find the cost of each book, we subtract the initial amount of money Edward had ($6) from the current amount of money he has ($12) and divide it by the number of books (2): (12 - 6) / 2 = 6 / 2 = 3 Therefore, each book cost $3. The answer is 3.\n'),
17
+ dict(role='HUMAN', prompt="Question: Frank was reading through his favorite book. The book had 3 chapters, each with the same number of pages. It has a total of 594 pages. It took Frank 607 days to finish the book. How many pages are in each chapter?\nLet's think step by step\nAnswer:"),
18
+ dict(role='BOT', prompt='To find the number of pages in each chapter, we divide the total number of pages in the book (594) by the number of chapters (3): 594 / 3 = 198. Therefore, each chapter has 198 pages. The answer is 198.\n'),
19
+ dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"),
20
+ ],
21
+ )),
22
+ retriever=dict(type=ZeroRetriever),
23
+ inferencer=dict(type=GenInferencer, max_out_len=512))
24
+
25
+ svamp_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
26
+ pred_postprocessor=dict(type=gsm8k_postprocess))
27
+
28
+ svamp_datasets = [
29
+ dict(
30
+ abbr='svamp',
31
+ type=SVAMPDataset,
32
+ path='./data/svamp/test.jsonl',
33
+ reader_cfg=dict(input_columns=['question'], output_column='answer'),
34
+ infer_cfg=svamp_infer_cfg,
35
+ eval_cfg=svamp_eval_cfg)
36
+ ]
opencompass/configs/datasets/game24/game24_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .game24_gen_52a460 import game24_datasets # noqa: F401, F403
opencompass/configs/datasets/game24/game24_gen_52a460.py ADDED
@@ -0,0 +1,34 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import ToTInferencer
4
+ from opencompass.datasets import (Game24Dataset, game24_postprocess,
5
+ Game24Evaluator, Game24PromptWrapper)
6
+
7
+ generation_kwargs = dict(do_sample=False, temperature=0.7)
8
+
9
+ game24_reader_cfg = dict(
10
+ input_columns=['input'],
11
+ output_column='output')
12
+
13
+ game24_infer_cfg = dict(
14
+ prompt_template=dict(
15
+ type=PromptTemplate,
16
+ template='{input}'),
17
+ retriever=dict(type=ZeroRetriever),
18
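+ # Tree-of-Thoughts search: at each step the model proposes candidate next steps
+ # ('propose'), scores them with a value prompt ('value'), and the greedy selector
+ # keeps the n_select_sample highest-valued branches.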
+ inferencer=dict(type=ToTInferencer, generation_kwargs=generation_kwargs, method_generate='propose',
19
+ method_evaluate='value', method_select='greedy', n_evaluate_sample=3, n_select_sample=5, prompt_wrapper=dict(type=Game24PromptWrapper)))
20
+
21
+ game24_eval_cfg = dict(
22
+ evaluator=dict(type=Game24Evaluator),
23
+ pred_postprocessor=dict(type=game24_postprocess),
24
+ )
25
+
26
+ game24_datasets = [
27
+ dict(
28
+ abbr='game24',
29
+ type=Game24Dataset,
30
+ path='./data/game24/game24.csv',
31
+ reader_cfg=game24_reader_cfg,
32
+ infer_cfg=game24_infer_cfg,
33
+ eval_cfg=game24_eval_cfg)
34
+ ]
opencompass/configs/datasets/humanevalx/humanevalx_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .humanevalx_gen_620cfa import humanevalx_datasets # noqa: F401, F403
opencompass/configs/datasets/humanevalx/humanevalx_gen_0af626.py ADDED
@@ -0,0 +1,60 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
5
+
6
+ humanevalx_reader_cfg = dict(
7
+ input_columns=['prompt'], output_column='task_id', train_split='test')
8
+
9
+ # This prompt is used for WizardLMCode series
10
+ # You can use 620cfa for basic generation
11
+ humanevalx_infer_cfg = {
12
+ lang: dict(
13
+ prompt_template=dict(
14
+ type=PromptTemplate,
15
+ template=dict(round=[
16
+ dict(
17
+ role='HUMAN',
18
+ prompt=
19
+ f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
20
+
21
+
22
+ ### Instruction:
23
+ Create a {lang} script for this problem:
24
+ {{prompt}}
25
+
26
+ ### Response:"""),
27
+ ])),
28
+ retriever=dict(type=ZeroRetriever),
29
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
30
+ for lang in ['python', 'cpp', 'go', 'java', 'js']
31
+ }
32
+
33
+ humanevalx_eval_cfg_dict = {
34
+ lang: dict(
35
+ evaluator=dict(
36
+ type=HumanevalXEvaluator,
37
+ language=lang,
38
+ ip_address=
39
+ 'localhost', # replace with your code_eval_server ip_address and port
40
+ port=5001
41
+ ), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
42
+ pred_role='BOT')
43
+ for lang in ['python', 'cpp', 'go', 'java', 'js'
44
+ ] # rust is not supported yet
45
+ }
46
+
47
+ # Please download the needed `xx.jsonl.gz` from
48
+ # https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
49
+ # and move them into `data/humanevalx/` folder
50
+ humanevalx_datasets = [
51
+ dict(
52
+ type=HumanevalXDataset,
53
+ abbr=f'humanevalx-{lang}',
54
+ language=lang,
55
+ path='./data/humanevalx',
56
+ reader_cfg=humanevalx_reader_cfg,
57
+ infer_cfg=humanevalx_infer_cfg[lang],
58
+ eval_cfg=humanevalx_eval_cfg_dict[lang])
59
+ for lang in ['python', 'cpp', 'go', 'java', 'js']
60
+ ]
opencompass/configs/datasets/humanevalx/humanevalx_gen_620cfa.py ADDED
@@ -0,0 +1,41 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
5
+
6
+ humanevalx_reader_cfg = dict(
7
+ input_columns=['prompt'], output_column='declaration', train_split='test')
8
+
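+ # Completion-style evaluation: the dataset's function header and docstring are passed through unchanged via '{prompt}'.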
9
+ humanevalx_infer_cfg = dict(
10
+ prompt_template=dict(
11
+ type=PromptTemplate,
12
+ template='{prompt}'),
13
+ retriever=dict(type=ZeroRetriever),
14
+ inferencer=dict(type=GenInferencer, max_out_len=1024))
15
+
16
+ humanevalx_eval_cfg_dict = {
17
+ lang : dict(
18
+ evaluator=dict(
19
+ type=HumanevalXEvaluator,
20
+ language=lang,
21
+ ip_address=
22
+ 'localhost', # replace with your code_eval_server ip_address and port
23
+ port=5001), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
24
+ pred_role='BOT')
25
+ for lang in ['python', 'cpp', 'go', 'java', 'js'] # rust is not supported yet
26
+ }
27
+
28
+ # Please download the needed `xx.jsonl.gz` from
29
+ # https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx
30
+ # and move them into `data/humanevalx/` folder
31
+ humanevalx_datasets = [
32
+ dict(
33
+ type=HumanevalXDataset,
34
+ abbr=f'humanevalx-{lang}',
35
+ language=lang,
36
+ path='./data/humanevalx',
37
+ reader_cfg=humanevalx_reader_cfg,
38
+ infer_cfg=humanevalx_infer_cfg,
39
+ eval_cfg=humanevalx_eval_cfg_dict[lang])
40
+ for lang in ['python', 'cpp', 'go', 'java', 'js']
41
+ ]
opencompass/configs/datasets/infinitebench/infinitebench.py ADDED
@@ -0,0 +1,17 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .infinitebenchcodedebug.infinitebench_codedebug_gen import InfiniteBench_codedebug_datasets
5
+ from .infinitebenchcoderun.infinitebench_coderun_gen import InfiniteBench_coderun_datasets
6
+ from .infinitebenchendia.infinitebench_endia_gen import InfiniteBench_endia_datasets
7
+ from .infinitebenchenmc.infinitebench_enmc_gen import InfiniteBench_enmc_datasets
8
+ from .infinitebenchenqa.infinitebench_enqa_gen import InfiniteBench_enqa_datasets
9
+ from .infinitebenchensum.infinitebench_ensum_gen import InfiniteBench_ensum_datasets
10
+ from .infinitebenchmathcalc.infinitebench_mathcalc_gen import InfiniteBench_mathcalc_datasets
11
+ from .infinitebenchmathfind.infinitebench_mathfind_gen import InfiniteBench_mathfind_datasets
12
+ from .infinitebenchretrievekv.infinitebench_retrievekv_gen import InfiniteBench_retrievekv_datasets
13
+ from .infinitebenchretrievenumber.infinitebench_retrievenumber_gen import InfiniteBench_retrievenumber_datasets
14
+ from .infinitebenchretrievepasskey.infinitebench_retrievepasskey_gen import InfiniteBench_retrievepasskey_datasets
15
+ from .infinitebenchzhqa.infinitebench_zhqa_gen import InfiniteBench_zhqa_datasets
16
+
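+ # Gather every list defined above whose name ends in '_datasets' into one flat list.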
17
+ infinitebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
opencompass/configs/datasets/lveval/lveval.md ADDED
@@ -0,0 +1,165 @@
1
+ # LVEval
2
+ ## Introduction
3
+ The following introduction comes from [LVEval](https://github.com/infinigence/LVEval):
4
+
5
+ ```
6
+ LV-Eval是一个具备5个长度等级(16k、32k、64k、128k和256k)、最大文本测试长度达到256k的长文本评测基准。LV-Eval的平均文本长度达到102,380字,最小/最大文本长度为11,896/387,406字。LV-Eval主要有两类评测任务——单跳QA和多跳QA,共包含11个涵盖中英文的评测数据子集。LV-Eval设计时引入3个关键技术:干扰事实插入(Confusing Facts Insertion,CFI)提高挑战性,关键词和短语替换(Keyword and Phrase Replacement,KPR)减少信息泄漏,以及基于关键词召回的评测指标(Answer Keywords,AK,指代结合答案关键词和字词黑名单的评价指标)提高评测数值客观性。我们希望LV-Eval为未来长文本大语言模型的研究发展提供有价值的性能参考。
7
+ LV-Eval is a challenging long-context benchmark with five length levels (16k, 32k, 64k, 128k, and 256k) reaching up to 256k words. The average number of words is 102,380, and the Min/Max number of words is 11,896/387,406. LV-Eval features two main tasks, single-hop QA and multi-hop QA, comprising 11 bilingual datasets. The design of LV-Eval has incorporated three key techniques, namely confusing facts insertion (CFI), keyword and phrase replacement (KPR), and keyword-recall-based metrics (AK, short for metrics with Answer Keywords and word blacklist) design, which jointly provide a challenging, mitigated-knowledge-leakage, and more accurate evaluation of the long-context capability of LLMs. We anticipate that LV-Eval will serve as a valuable resource for supporting future research on long-context LLMs.
8
+ ```
9
+
10
+ ## Official link
11
+
12
+ ### Paper
13
+
14
+ [_LV_-Eval: A Balanced Long-Context Benchmark with 5 Length Levels Up to 256K](https://arxiv.org/abs/2402.05136)
15
+
16
+ ### Repository
17
+
18
+ [LVEval](https://github.com/infinigence/LVEval)
19
+
20
+ ## Use cases
21
+
22
+ In evaluation scripts, add the LVEval datasets just like any other dataset by importing:
23
+ ```
24
+ from .datasets.lveval.lveval import LVEval_datasets as datasets
25
+ ```
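+ 
+ A minimal end-to-end config sketch (the model import below is a placeholder; swap in whichever model config your OpenCompass setup provides):
+ 
+ ```
+ from mmengine.config import read_base
+ 
+ with read_base():
+     from .datasets.lveval.lveval import LVEval_datasets as datasets
+     # placeholder model config; replace with the model under evaluation
+     from .models.chatglm.hf_chatglm3_6b_32k import models
+ ```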
26
+
27
+ ## Examples
28
+ Input example I (from lic_mixup datasets):
29
+ ```
30
+ 请根据下面给定的文章回答问题,问题和答案只与其中一篇文章有关。
31
+
32
+ 文章:......文章 9\n\n标题:腐质酸\n内容:腐植酸是自然界中广泛存在的大分子有机物质,广泛应用于农林牧、石油、化工、建材、医药卫生、环保等各个领域。横跨几十个行业。特别是眼下提倡生态农业建设、无公害农业生产、绿色食品、无污染环保产品等,更使\"腐植酸\"备受推崇,事实证明,人类的生活和生存离不开腐植酸,它的确是一个发展中的有希望的朝阳产业,属于一个新型的特殊行业......
33
+
34
+ 请现在基于上述文章回答下面的问题,问题和答案只与其中一篇文章有关。
35
+
36
+ 问题:中国的文学受到印度哪些方面的影响?
37
+ 回答:
38
+ ```
39
+ Output example I (from chatglm3-6b-32k):
40
+ ```
41
+ 中国文学自印度文学大量吸收营养,在佛教东流之后,从语汇到修辞,从题材到体裁,即便审美取向也深受佛教与印度文学的感染。
42
+ ```
43
+ Input example II (from factrecall_zh datasets):
44
+ ```
45
+ 请基于给定的文章回答下述问题。
46
+
47
+ 文章:......庚子年间,贝多芬,乃一德裔美籍学士,研究于物理理学。彼其良图,探求相对论、量子力学,尤有大进。质能等价公式 E=mc²,千古独步,声名于当世。诺贝尔物理学奖、以资尊荣,兹矣荣耀之大典。论其学术,涉时空能量,影响深远,以其义非常人,广为当世所知,声名播于天下,实乃现代物理学之奠基者......
48
+
49
+ 现在请基于上述文章回答下面的问题。
50
+
51
+ 问题:被世人广泛推崇为现代物理学奠基人的科学家叫什么名字?
52
+ 回答:
53
+ ```
54
+ Output example II (from chatglm3-6b-32k):
55
+ ```
56
+ 贝多芬
57
+ ```
58
+ ## Evaluation results
59
+
60
+ ```
61
+ dataset version metric mode bluelm-7b-chat-32k-hf
62
+ ----------------------------------------- --------- ------------- ------ -----------------------
63
+ ---------------------------------------- - - - -
64
+ --------- LVEval All --------- - - - -
65
+ ---------------------------------------- - - - -
66
+ LVEval_qa - naive_average gen 12.00
67
+ ---------------------------------------- - - - -
68
+ --------- LVEval Tasks All --------- - - - -
69
+ ---------------------------------------- - - - -
70
+ LVEval_single_hop_qa - naive_average gen 15.11
71
+ LVEval_single_hop_cqa - naive_average gen 9.21
72
+ LVEval_multi_hop_qa - naive_average gen 6.99
73
+ LVEval_multi_hop_cqa - naive_average gen 9.90
74
+ LVEval_factrecall_cqa - naive_average gen 21.28
75
+ ---------------------------------------- - - - -
76
+ --------- LVEval Datasets All --------- - - - -
77
+ ---------------------------------------- - - - -
78
+ LVEval_loogle_SD_mixup - naive_average gen 12.81
79
+ LVEval_cmrc_mixup - naive_average gen 17.41
80
+ LVEval_multifieldqa_en_mixup - naive_average gen 7.10
81
+ LVEval_multifieldqa_zh_mixup - naive_average gen 11.31
82
+ LVEval_dureader_mixup - naive_average gen 13.19
83
+ LVEval_loogle_CR_mixup - naive_average gen 5.17
84
+ LVEval_loogle_MIR_mixup - naive_average gen 2.60
85
+ LVEval_hotpotwikiqa_mixup - naive_average gen 10.20
86
+ LVEval_lic_mixup - naive_average gen 9.60
87
+ LVEval_factrecall_en - naive_average gen 23.67
88
+ LVEval_factrecall_zh - naive_average gen 18.90
89
+ ---------------------------------------- - - - -
90
+ --------- LVEval Single_Hop QA --------- - - - -
91
+ ---------------------------------------- - - - -
92
+ LVEval_loogle_SD_mixup_16k 83bc25 LVEval_f1 gen 35.05
93
+ LVEval_loogle_SD_mixup_32k 83bc25 LVEval_f1 gen 13.37
94
+ LVEval_loogle_SD_mixup_64k 83bc25 LVEval_f1 gen 6.32
95
+ LVEval_loogle_SD_mixup_128k 83bc25 LVEval_f1 gen 5.28
96
+ LVEval_loogle_SD_mixup_256k 83bc25 LVEval_f1 gen 4.00
97
+ ---------------------------------------- - - - -
98
+ LVEval_cmrc_mixup_16k 8bac4e LVEval_f1 gen 46.45
99
+ LVEval_cmrc_mixup_32k 8bac4e LVEval_f1 gen 19.41
100
+ LVEval_cmrc_mixup_64k 8bac4e LVEval_f1 gen 11.10
101
+ LVEval_cmrc_mixup_128k 8bac4e LVEval_f1 gen 5.89
102
+ LVEval_cmrc_mixup_256k 8bac4e LVEval_f1 gen 4.22
103
+ ---------------------------------------- - - - -
104
+ --------- LVEval Single_Hop CQA --------- - - - -
105
+ ---------------------------------------- - - - -
106
+ LVEval_multifieldqa_en_mixup_16k 83bc25 LVEval_f1 gen 12.28
107
+ LVEval_multifieldqa_en_mixup_32k 83bc25 LVEval_f1 gen 4.64
108
+ LVEval_multifieldqa_en_mixup_64k 83bc25 LVEval_f1 gen 8.30
109
+ LVEval_multifieldqa_en_mixup_128k 83bc25 LVEval_f1 gen 5.63
110
+ LVEval_multifieldqa_en_mixup_256k 83bc25 LVEval_f1 gen 4.64
111
+ ---------------------------------------- - - - -
112
+ LVEval_multifieldqa_zh_mixup_16k ac4a0d LVEval_f1 gen 22.30
113
+ LVEval_multifieldqa_zh_mixup_32k ac4a0d LVEval_f1 gen 17.46
114
+ LVEval_multifieldqa_zh_mixup_64k ac4a0d LVEval_f1 gen 6.27
115
+ LVEval_multifieldqa_zh_mixup_128k ac4a0d LVEval_f1 gen 5.84
116
+ LVEval_multifieldqa_zh_mixup_256k ac4a0d LVEval_f1 gen 4.71
117
+ ---------------------------------------- - - - -
118
+ --------- LVEval Multi_Hop QA --------- - - - -
119
+ ---------------------------------------- - - - -
120
+ LVEval_dureader_mixup_16k 8bac4e LVEval_rouge gen 18.04
121
+ LVEval_dureader_mixup_32k 8bac4e LVEval_rouge gen 18.33
122
+ LVEval_dureader_mixup_64k 8bac4e LVEval_rouge gen 12.56
123
+ LVEval_dureader_mixup_128k 8bac4e LVEval_rouge gen 10.33
124
+ LVEval_dureader_mixup_256k 8bac4e LVEval_rouge gen 6.69
125
+ ---------------------------------------- - - - -
126
+ LVEval_loogle_CR_mixup_16k 83bc25 LVEval_f1 gen 9.35
127
+ LVEval_loogle_CR_mixup_32k 83bc25 LVEval_f1 gen 7.42
128
+ LVEval_loogle_CR_mixup_64k 83bc25 LVEval_f1 gen 3.18
129
+ LVEval_loogle_CR_mixup_128k 83bc25 LVEval_f1 gen 2.65
130
+ LVEval_loogle_CR_mixup_256k 83bc25 LVEval_f1 gen 3.27
131
+ ---------------------------------------- - - - -
132
+ LVEval_loogle_MIR_mixup_16k 83bc25 LVEval_f1 gen 4.50
133
+ LVEval_loogle_MIR_mixup_32k 83bc25 LVEval_f1 gen 3.19
134
+ LVEval_loogle_MIR_mixup_64k 83bc25 LVEval_f1 gen 2.34
135
+ LVEval_loogle_MIR_mixup_128k 83bc25 LVEval_f1 gen 1.76
136
+ LVEval_loogle_MIR_mixup_256k 83bc25 LVEval_f1 gen 1.20
137
+ ---------------------------------------- - - - -
138
+ --------- LVEval Multi_Hop CQA --------- - - - -
139
+ ---------------------------------------- - - - -
140
+ LVEval_hotpotwikiqa_mixup_16k e3c368 LVEval_f1 gen 19.80
141
+ LVEval_hotpotwikiqa_mixup_32k e3c368 LVEval_f1 gen 12.59
142
+ LVEval_hotpotwikiqa_mixup_64k e3c368 LVEval_f1 gen 7.33
143
+ LVEval_hotpotwikiqa_mixup_128k e3c368 LVEval_f1 gen 7.85
144
+ LVEval_hotpotwikiqa_mixup_256k e3c368 LVEval_f1 gen 3.42
145
+ ---------------------------------------- - - - -
146
+ LVEval_lic_mixup_16k fdd540 LVEval_f1 gen 21.36
147
+ LVEval_lic_mixup_32k fdd540 LVEval_f1 gen 12.92
148
+ LVEval_lic_mixup_64k fdd540 LVEval_f1 gen 4.62
149
+ LVEval_lic_mixup_128k fdd540 LVEval_f1 gen 4.25
150
+ LVEval_lic_mixup_256k fdd540 LVEval_f1 gen 4.85
151
+ ---------------------------------------- - - - -
152
+ --------- LVEval Factrecall CQA --------- - - - -
153
+ ---------------------------------------- - - - -
154
+ LVEval_factrecall_en_16k fba966 f1 gen 58.33
155
+ LVEval_factrecall_en_32k fba966 f1 gen 32.17
156
+ LVEval_factrecall_en_64k fba966 f1 gen 15.33
157
+ LVEval_factrecall_en_128k fba966 f1 gen 8.50
158
+ LVEval_factrecall_en_256k fba966 f1 gen 4.00
159
+ ---------------------------------------- - - - -
160
+ LVEval_factrecall_zh_16k ef3320 f1 gen 20.00
161
+ LVEval_factrecall_zh_32k ef3320 f1 gen 38.00
162
+ LVEval_factrecall_zh_64k ef3320 f1 gen 20.50
163
+ LVEval_factrecall_zh_128k ef3320 f1 gen 11.00
164
+ LVEval_factrecall_zh_256k ef3320 f1 gen 5.00
165
+ ```
opencompass/configs/datasets/lveval/lveval.py ADDED
@@ -0,0 +1,38 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .lvevalcmrc_mixup.lveval_cmrc_mixup_gen import (
5
+ LVEval_cmrc_mixup_datasets,
6
+ )
7
+ from .lvevaldureader_mixup.lveval_dureader_mixup_gen import (
8
+ LVEval_dureader_mixup_datasets,
9
+ )
10
+ from .lvevalfactrecall_en.lveval_factrecall_en_gen import (
11
+ LVEval_factrecall_en_datasets,
12
+ )
13
+ from .lvevalfactrecall_zh.lveval_factrecall_zh_gen import (
14
+ LVEval_factrecall_zh_datasets,
15
+ )
16
+ from .lvevalhotpotwikiqa_mixup.lveval_hotpotwikiqa_mixup_gen import (
17
+ LVEval_hotpotwikiqa_mixup_datasets,
18
+ )
19
+ from .lvevallic_mixup.lveval_lic_mixup_gen import LVEval_lic_mixup_datasets
20
+ from .lvevalloogle_CR_mixup.lveval_loogle_CR_mixup_gen import (
21
+ LVEval_loogle_CR_mixup_datasets,
22
+ )
23
+ from .lvevalloogle_MIR_mixup.lveval_loogle_MIR_mixup_gen import (
24
+ LVEval_loogle_MIR_mixup_datasets,
25
+ )
26
+ from .lvevalloogle_SD_mixup.lveval_loogle_SD_mixup_gen import (
27
+ LVEval_loogle_SD_mixup_datasets,
28
+ )
29
+ from .lvevalmultifieldqa_en_mixup.lveval_multifieldqa_en_mixup_gen import (
30
+ LVEval_multifieldqa_en_mixup_datasets,
31
+ )
32
+ from .lvevalmultifieldqa_zh_mixup.lveval_multifieldqa_zh_mixup_gen import (
33
+ LVEval_multifieldqa_zh_mixup_datasets,
34
+ )
35
+
36
+ LVEval_datasets = sum(
37
+ (v for k, v in locals().items() if k.endswith('_datasets')), []
38
+ )
opencompass/configs/datasets/math401/math401_gen.py ADDED
@@ -0,0 +1,4 @@
1
+ from mmengine.config import read_base
2
+
3
+ with read_base():
4
+ from .math401_gen_ab5f39 import math401_datasets # noqa: F401, F403
opencompass/configs/datasets/math401/math401_gen_ab5f39.py ADDED
@@ -0,0 +1,47 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.openicl.icl_evaluator import AccEvaluator
5
+ from opencompass.datasets import MathBenchDataset, Math401Evaluator, mathbench_postprocess
6
+
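+ # Four-shot arithmetic prompt: two decimal subtractions and two integer multiplications as worked examples.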
7
+ cloze_prompt = [
8
+ dict(role='HUMAN', prompt='Q: Calculate 2.9-0.11.'),
9
+ dict(role='BOT', prompt='A: Let\'s think step by step, 2.9 - 0.11 equals 2.7900. The answer is 2.7900.\n'),
10
+ dict(role='HUMAN', prompt='Q: Calculate 0.15-0.032.'),
11
+ dict(role='BOT', prompt='A: Let\'s think step by step, 0.15 - 0.032 equals 0.1180. The answer is 0.1180.\n'),
12
+ dict(role='HUMAN', prompt='Q: Calculate 78*64.'),
13
+ dict(role='BOT', prompt='A: Let\'s think step by step, 78 multiplied by 64 equals 4992. The answer is 4992.\n'),
14
+ dict(role='HUMAN', prompt='Q: Calculate 62×42.'),
15
+ dict(role='BOT', prompt='A: Let\'s think step by step, 62 multiplied by 42 equals 2604. The answer is 2604.\n'),
16
+ dict(role='HUMAN', prompt='Q: Calculate {question}'),
17
+ dict(role='BOT', prompt='A: {answer}\n')]
18
+
19
+ math401_infer_cfg = dict(
20
+ prompt_template=dict(
21
+ type=PromptTemplate,
22
+ template=dict(
23
+ round=cloze_prompt,
24
+ ),
25
+ ),
26
+ retriever=dict(type=ZeroRetriever),
27
+ inferencer=dict(type=GenInferencer, max_out_len=512),
28
+ )
29
+
30
+ math401_eval_cfg = dict(
31
+ evaluator=dict(type=Math401Evaluator),
32
+ pred_postprocessor=dict(type=mathbench_postprocess, name='en'))
33
+
34
+ math401_datasets = [
35
+ dict(
36
+ abbr='math401',
37
+ type=MathBenchDataset,
38
+ path='./data/math401/',
39
+ with_circular=False,
40
+ name='cloze_en',
41
+ reader_cfg=dict(
42
+ input_columns=['question'],
43
+ output_column='answer'
44
+ ),
45
+ infer_cfg=math401_infer_cfg,
46
+ eval_cfg=math401_eval_cfg,
47
+ )]
opencompass/configs/datasets/mbpp/README.md ADDED
@@ -0,0 +1,69 @@
1
+ # MBPP
2
+
3
+ ```bash
4
+ python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug
5
+ python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
6
+ ```
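+ 
+ In the tables below, `pass@1` is the percentage of problems solved; `pass`, `timeout`, `failed`, and `wrong_answer` are raw counts over the 257 sanitized MBPP problems (each row sums to 257).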
7
+
8
+ ## Base Models
9
+
10
+ | model | pass@1 | pass | timeout | failed | wrong_answer |
11
+ |:------------------------:|---------:|-------:|----------:|---------:|---------------:|
12
+ | llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 |
13
+ | llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 |
14
+ | llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 |
15
+ | llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 |
16
+ | llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 |
17
+ | llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 |
18
+ | llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 |
19
+ | llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 |
20
+ | llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 |
21
+ | internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 |
22
+ | internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 |
23
+ | internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 |
24
+ | qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 |
25
+ | qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 |
26
+ | qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 |
27
+ | qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 |
28
+ | qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 |
29
+ | qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 |
30
+ | qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 |
31
+ | qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 |
32
+ | qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 |
33
+ | qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 |
34
+ | qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 |
35
+ | qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 |
36
+ | mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 |
37
+ | mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 |
38
+ | mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 |
39
+ | mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 |
40
+ | yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 |
41
+ | yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 |
42
+ | deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 |
43
+ | deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 |
44
+
45
+ ## Chat Models
46
+
47
+ | model | pass@1 | pass | timeout | failed | wrong_answer |
48
+ |:-----------------------------:|---------:|-------:|----------:|---------:|---------------:|
49
+ | qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 |
50
+ | qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 |
51
+ | qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 |
52
+ | qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 |
53
+ | qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 |
54
+ | qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 |
55
+ | qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 |
56
+ | qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 |
57
+ | internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 |
58
+ | internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 |
59
+ | internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 |
60
+ | internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 |
61
+ | internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 |
62
+ | internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 |
63
+ | llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 |
64
+ | llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 |
65
+ | llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 |
66
+ | llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 |
67
+ | mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 |
68
+ | mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 |
69
+ | mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 |
opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py ADDED
@@ -0,0 +1,42 @@
1
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
2
+ from opencompass.openicl.icl_retriever import ZeroRetriever
3
+ from opencompass.openicl.icl_inferencer import GenInferencer
4
+ from opencompass.datasets import MBPPDataset, MBPPEvaluator
5
+
6
+ mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
7
+
8
+ mbpp_infer_cfg = dict(
9
+ prompt_template=dict(
10
+ type=PromptTemplate,
11
+ template=dict(
12
+ round=[
13
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'),
14
+ dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
15
+
16
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'),
17
+ dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
18
+
19
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'),
20
+ dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
21
+
22
+ dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
23
+ dict(role='BOT', prompt='[BEGIN]\n'),
24
+ ],
25
+ ),
26
+ ),
27
+ retriever=dict(type=ZeroRetriever),
28
+ inferencer=dict(type=GenInferencer, max_out_len=512),
29
+ )
30
+
31
+ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
32
+
33
+ mbpp_datasets = [
34
+ dict(
35
+ type=MBPPDataset,
36
+ abbr='mbpp',
37
+ path='./data/mbpp/mbpp.jsonl',
38
+ reader_cfg=mbpp_reader_cfg,
39
+ infer_cfg=mbpp_infer_cfg,
40
+ eval_cfg=mbpp_eval_cfg,
41
+ )
42
+ ]
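These dataset configs are not launched directly; they are composed into a top-level OpenCompass run config. A minimal sketch of that usage follows — the file name `eval_mbpp_demo.py` and the empty model list are placeholders, not part of this commit:

```python
# Hypothetical top-level run config eval_mbpp_demo.py (sketch only).
# It pulls in mbpp_datasets from the config above via mmengine's read_base.
from mmengine.config import read_base

with read_base():
    from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets

datasets = [*mbpp_datasets]
models = []  # fill in model configs, then launch with: python run.py eval_mbpp_demo.py
```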
opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_6590b0.py ADDED
@@ -0,0 +1,28 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MBPPDataset, MBPPEvaluator
+
+ mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
+
+ mbpp_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n[BEGIN]\n",
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator))
+
+ mbpp_datasets = [
+     dict(
+         type=MBPPDataset,
+         abbr='mbpp',
+         path='./data/mbpp/mbpp.jsonl',
+         reader_cfg=mbpp_reader_cfg,
+         infer_cfg=mbpp_infer_cfg,
+         eval_cfg=mbpp_eval_cfg,
+     )
+ ]
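The two configs above illustrate the two template forms these MBPP files alternate between. The sketch below (illustrative only, prompt text elided) contrasts them: a role-tagged dialogue for chat models versus one flat completion string, both ending in `[BEGIN]\n` for the model to continue.

```python
# Illustrative contrast of the two PromptTemplate forms (prompts elided).
chat_template = dict(round=[
    dict(role='HUMAN', prompt='<task + tests>'),
    dict(role='BOT', prompt='[BEGIN]\n'),   # model completes after this turn
])
base_template = '<few-shot examples><task + tests>[BEGIN]\n'  # single string
```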
opencompass/configs/datasets/mbpp/deprecated_mbpp_gen_caa7ab.py ADDED
@@ -0,0 +1,42 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MBPPDataset, MBPPEvaluator
+
+ mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
+
+ mbpp_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n'),
+                 dict(role='BOT', prompt='[BEGIN]\ndef similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)\n[DONE] \n\n '),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n'),
+                 dict(role='BOT', prompt='[BEGIN]\nimport math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result\n[DONE] \n\n '),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n'),
+                 dict(role='BOT', prompt='[BEGIN]\nimport heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums\n[DONE] \n\n '),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n\nYour code should start with a [BEGIN] tag and end with a [DONE] tag.\n'),
+                 dict(role='BOT', prompt='[BEGIN]\n'),
+             ],
+         ),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
+
+ mbpp_datasets = [
+     dict(
+         type=MBPPDataset,
+         abbr='mbpp',
+         path='./data/mbpp/mbpp.jsonl',
+         reader_cfg=mbpp_reader_cfg,
+         infer_cfg=mbpp_infer_cfg,
+         eval_cfg=mbpp_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py ADDED
@@ -0,0 +1,45 @@
+ # This config is used for pass@k evaluation with dataset repetition,
+ # for models that cannot generate multiple responses to a single input.
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import MBPPDatasetV2, MBPPPassKEvaluator
+
+ mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column')
+
+ mbpp_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'),
+                 dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'),
+                 dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'),
+                 dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
+                 dict(role='BOT', prompt='[BEGIN]\n'),
+             ],
+         ),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT')
+
+ mbpp_datasets = [
+     dict(
+         type=MBPPDatasetV2,
+         abbr='mbpp_repeat10',
+         path='./data/mbpp/mbpp.jsonl',
+         num_repeats=10,
+         reader_cfg=mbpp_reader_cfg,
+         infer_cfg=mbpp_infer_cfg,
+         eval_cfg=mbpp_eval_cfg,
+     )
+ ]
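With `num_repeats=10`, each problem is duplicated ten times, so one completion per prompt yields n = 10 samples per task for the pass@k evaluator. A minimal sketch of the standard unbiased pass@k estimator (Chen et al., 2021) that repeat-style configs of this kind are conventionally scored with — not necessarily the exact internals of `MBPPPassKEvaluator`:

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate: n samples per task, c of them passing."""
    if n - c < k:
        return 1.0  # too few failures to fill a size-k subset with no pass
    # 1 - P(all k drawn samples fail), computed as a stable product
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# e.g. with 10 repeats per task and 3 passing samples: pass_at_k(10, 3, 1) == 0.3
```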
opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_1e1056.py ADDED
@@ -0,0 +1,42 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
+
+ sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
+
+ sanitized_mbpp_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n',),
+                 dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n ",),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n',),
+                 dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n ",),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n',),
+                 dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n ",),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n',),
+                 dict(role='BOT', prompt='[BEGIN]\n'),
+             ],
+         ),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
+
+ sanitized_mbpp_datasets = [
+     dict(
+         type=SanitizedMBPPDataset,
+         abbr='sanitized_mbpp',
+         path='./data/mbpp/sanitized-mbpp.jsonl',
+         reader_cfg=sanitized_mbpp_reader_cfg,
+         infer_cfg=sanitized_mbpp_infer_cfg,
+         eval_cfg=sanitized_mbpp_eval_cfg,
+     )
+ ]
opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_cb43ef.py ADDED
@@ -0,0 +1,81 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
+
+ sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
+
+ sanitized_mbpp_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template='''\
+ You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:
+
+ assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
+ assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)
+ assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)
+
+ [BEGIN]
+ 'def similar_elements(test_tup1, test_tup2):
+     res = tuple(set(test_tup1) & set(test_tup2))
+     return (res)'
+ [DONE]
+
+
+
+ You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:
+
+ assert is_not_prime(2) == False
+ assert is_not_prime(10) == True
+ assert is_not_prime(35) == True
+
+ [BEGIN]
+ 'import math
+ def is_not_prime(n):
+     result = False
+     for i in range(2,int(math.sqrt(n)) + 1):
+         if n % i == 0:
+             result = True
+     return result'
+ [DONE]
+
+
+
+ You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:
+
+ assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]
+ assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]
+ assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]
+
+ [BEGIN]
+ 'import heapq as hq
+ def heap_queue_largest(nums,n):
+     largest_nums = hq.nlargest(n, nums)
+     return largest_nums'
+ [DONE]
+
+
+
+ You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:
+
+ {test_list}
+
+ [BEGIN]
+ '''
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
+
+ sanitized_mbpp_datasets = [
+     dict(
+         type=SanitizedMBPPDataset,
+         abbr='sanitized_mbpp',
+         path='./data/mbpp/sanitized-mbpp.jsonl',
+         reader_cfg=sanitized_mbpp_reader_cfg,
+         infer_cfg=sanitized_mbpp_infer_cfg,
+         eval_cfg=sanitized_mbpp_eval_cfg,
+     )
+ ]
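In every template variant above, `{text}` and `{test_list}` are per-sample fields supplied from the reader columns. Purely as an illustration of the substitution (plain `str.format`, not the library's internal templating code; the task and tests below are hypothetical):

```python
# Illustration only: how {text}/{test_list} placeholders get filled per sample.
tail = ('You are an expert Python programmer, and here is your task: {text} '
        'Your code should pass these tests:\n\n{test_list}\n\n[BEGIN]\n')
prompt = tail.format(
    text='Write a function to add two numbers.',  # hypothetical task
    test_list='assert add(1, 2) == 3',            # hypothetical tests
)
print(prompt)  # final prompt the model is asked to continue after [BEGIN]
```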
opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_passk_gen_1e1056.py ADDED
@@ -0,0 +1,42 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator
+
+ sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column')
+
+ sanitized_mbpp_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'),
+                 dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'),
+                 dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'),
+                 dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
+                 dict(role='BOT', prompt='[BEGIN]\n'),
+             ],
+         ),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT')
+
+ sanitized_mbpp_datasets = [
+     dict(
+         type=SanitizedMBPPDataset,
+         abbr='sanitized_mbpp_passk',
+         path='./data/mbpp/sanitized-mbpp.jsonl',
+         reader_cfg=sanitized_mbpp_reader_cfg,
+         infer_cfg=sanitized_mbpp_infer_cfg,
+         eval_cfg=sanitized_mbpp_eval_cfg,
+     )
+ ]
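This `_passk` variant keeps each problem once and relies on drawing multiple completions per prompt at inference time, whereas the `_repeat10` variant below duplicates problems instead. A sketch of the accounting (names hypothetical, not library API):

```python
# Two routes to n samples per task for pass@k scoring (sketch only):
n_samples_per_task = 10
dataset_copies, completions_per_prompt = 1, n_samples_per_task   # _passk style
dataset_copies, completions_per_prompt = n_samples_per_task, 1   # _repeat10 style
assert dataset_copies * completions_per_prompt == n_samples_per_task
```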
opencompass/configs/datasets/mbpp/deprecated_sanitized_mbpp_repeat10_gen_1e1056.py ADDED
@@ -0,0 +1,43 @@
+ from opencompass.openicl.icl_prompt_template import PromptTemplate
+ from opencompass.openicl.icl_retriever import ZeroRetriever
+ from opencompass.openicl.icl_inferencer import GenInferencer
+ from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator
+
+ sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_column')
+
+ sanitized_mbpp_infer_cfg = dict(
+     prompt_template=dict(
+         type=PromptTemplate,
+         template=dict(
+             round=[
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'),
+                 dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n'),
+                 dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'),
+                 dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
+
+                 dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'),
+                 dict(role='BOT', prompt='[BEGIN]\n'),
+             ],
+         ),
+     ),
+     retriever=dict(type=ZeroRetriever),
+     inferencer=dict(type=GenInferencer, max_out_len=512),
+ )
+
+ sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role='BOT')
+
+ sanitized_mbpp_datasets = [
+     dict(
+         type=SanitizedMBPPDataset,
+         abbr='sanitized_mbpp_repeat10',
+         path='./data/mbpp/sanitized-mbpp.jsonl',
+         num_repeats=10,
+         reader_cfg=sanitized_mbpp_reader_cfg,
+         infer_cfg=sanitized_mbpp_infer_cfg,
+         eval_cfg=sanitized_mbpp_eval_cfg,
+     )
+ ]