zswzswzsw committed on
Commit
933d496
·
verified ·
1 Parent(s): b36749c

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. arena_datasets.py +181 -0
arena_datasets.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+ from datasets import load_dataset
3
+ import json
4
+ from tqdm import tqdm
# Build two JSONL SFT corpora (math and science) by merging several public
# Hugging Face datasets into one common record layout.
#
# Every emitted line is a JSON object with the keys, in this order:
#   loss_mask, topic, is_business, ref_answer, messages
# Records are de-duplicated across ALL sources (math and science share one
# registry) on the first DEDUP_PREFIX_LEN characters of the first message.

# Output locations (one JSON object per line).
MATH_OUTPUT_PATH = "/apdcephfs_gy2/share_303094202/bazzfeng/data/math_sft_bigbig.jsonl"
SCIENCE_OUTPUT_PATH = "/apdcephfs_gy2/share_303094202/bazzfeng/data/science_sft_bigbig.jsonl"

# Topic labels written verbatim into the records (runtime data, not comments).
TOPIC_MATH = "数学"
TOPIC_SCIENCE = "科学"

# Number of leading characters of the first message used as the dedup key.
DEDUP_PREFIX_LEN = 50

WEBINSTRUCT_SPLITS = ("train", "test")
POLYMATH_LANGUAGES = ['ar', 'bn', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'pt', 'ru', 'sw', 'te', 'th', 'vi', 'zh']
POLYMATH_SPLITS = ("top", "high", "medium", "low")
OPENMATH_SPLITS = ("cot", "tir", "genselect")
MEDICAL_CONFIGS = ['en', 'en_mix', 'zh', 'zh_mix']

# NOTE: 'nvidia/Nemotron-CrossThink' was disabled (wrapped in a string literal)
# in the original script and is intentionally still not loaded. For reference,
# it mapped split 'train_math' -> math file and 'train_qa' -> science file,
# with the question at item["meta_data"]["question"] and the answer at
# item['reward_model']['ground_truth'].


def build_record(topic, messages, ref_answer):
    """Return one SFT record in the common output layout.

    Args:
        topic: Topic label stored under "topic".
        messages: List of {"content", "role"} dicts forming the conversation.
        ref_answer: Reference answer stored under "ref_answer".

    Returns:
        A new dict whose key order matches the original script's output.
    """
    return {
        "loss_mask": [0, 1],
        "topic": topic,
        "is_business": 0,
        "ref_answer": ref_answer,
        "messages": messages,
    }


def qa_messages(question, answer):
    """Return a two-turn conversation: user question, assistant answer."""
    return [
        {"content": question, "role": "user"},
        {"content": answer, "role": "assistant"},
    ]


def write_unique(out, seen, record):
    """Write *record* as one JSON line unless its dedup key was already seen.

    The key is the first DEDUP_PREFIX_LEN characters of the first message's
    content. The key is registered on every successful write, so each source
    is de-duplicated both internally and against all earlier sources.

    Args:
        out: Writable text stream.
        seen: Mutable set of dedup keys; updated in place.
        record: Record as produced by build_record().

    Returns:
        True if the record was written, False if it was skipped as a duplicate.
    """
    key = record["messages"][0]["content"][:DEDUP_PREFIX_LEN]
    if key in seen:
        return False
    seen.add(key)
    out.write(json.dumps(record, ensure_ascii=False) + "\n")
    return True


def main():
    """Download every source dataset and write the math/science JSONL files."""
    seen = set()

    # Explicit UTF-8: records are dumped with ensure_ascii=False and contain
    # non-ASCII text, so the platform default encoding must not be relied on.
    # The context manager guarantees both files are flushed and closed.
    with open(MATH_OUTPUT_PATH, "w", encoding="utf-8") as math_out, \
            open(SCIENCE_OUTPUT_PATH, "w", encoding="utf-8") as science_out:

        # WebInstruct-verified: 'Mathematics' rows go to the math file, every
        # other category to the science file. The original checked the dedup
        # registry here but never added to it, so the check could never fire;
        # write_unique() registers the key.
        mydata = load_dataset('TIGER-Lab/WebInstruct-verified')
        for split in WEBINSTRUCT_SPLITS:
            for item in mydata[split]:
                is_math = item['category'] == 'Mathematics'
                answer = item['answer']
                record = build_record(
                    TOPIC_MATH if is_math else TOPIC_SCIENCE,
                    qa_messages(item['question'], answer),
                    answer,
                )
                write_unique(math_out if is_math else science_out, seen, record)

        # Skywork-OR1: the prompt is already a message list; append the ground
        # truth as the assistant turn. Copy the list so the dataset row itself
        # is not mutated.
        mydata = load_dataset('Skywork/Skywork-OR1-RL-Data')
        for item in mydata['math']:
            answer = item['reward_model']['ground_truth']
            messages = list(item['prompt'])
            messages.append({"content": answer, "role": "assistant"})
            write_unique(math_out, seen, build_record(TOPIC_MATH, messages, answer))

        # DeepMath-103K.
        mydata = load_dataset('zwhe99/DeepMath-103K')
        for item in mydata['train']:
            answer = item['final_answer']
            record = build_record(TOPIC_MATH, qa_messages(item["question"], answer), answer)
            write_unique(math_out, seen, record)

        # PolyMath: one dataset config per language, four difficulty splits each.
        for lan in POLYMATH_LANGUAGES:
            mydata = load_dataset('Qwen/PolyMath', lan)
            for split in POLYMATH_SPLITS:
                for item in mydata[split]:
                    answer = item['answer']
                    record = build_record(TOPIC_MATH, qa_messages(item["question"], answer), answer)
                    write_unique(math_out, seen, record)

        # OpenMathReasoning: the assistant turn is the generated solution,
        # while ref_answer holds the expected final answer.
        mydata = load_dataset('nvidia/OpenMathReasoning')
        for split in OPENMATH_SPLITS:
            for item in tqdm(mydata[split]):
                record = build_record(
                    TOPIC_MATH,
                    qa_messages(item["problem"], item['generated_solution']),
                    item['expected_answer'],
                )
                write_unique(math_out, seen, record)

        # medical-o1-reasoning-SFT: every config goes to the science file.
        for lan in MEDICAL_CONFIGS:
            mydata = load_dataset('FreedomIntelligence/medical-o1-reasoning-SFT', lan)
            for item in mydata['train']:
                response = item['Response']
                record = build_record(TOPIC_SCIENCE, qa_messages(item['Question'], response), response)
                write_unique(science_out, seen, record)


if __name__ == "__main__":
    main()