# Hugging Face file-viewer header captured together with the script (not code):
# collect_math_data_code / arena_datasets.py
# zswzswzsw's picture
# Upload folder using huggingface_hub
# 933d496 verified
# raw
# history blame
# 8.49 kB
import datasets
from datasets import load_dataset
import json
from tqdm import tqdm
# Output sinks: `fw` receives math samples, `fw2` receives science samples.
# The encoding is pinned to UTF-8 because records are serialized with
# ensure_ascii=False and contain non-ASCII text (e.g. the "topic" values);
# without it the write would depend on the platform's default encoding.
fw = open("/apdcephfs_gy2/share_303094202/bazzfeng/data/math_sft_bigbig.jsonl", "w+", encoding="utf-8")
fw2 = open("/apdcephfs_gy2/share_303094202/bazzfeng/data/science_sft_bigbig.jsonl", "w+", encoding="utf-8")
# Dedup registry shared by the dataset loops: keys are the first 50 characters
# of a question, values are a dummy 1.
udict = {}
# TIGER-Lab/WebInstruct-verified: both the 'train' and 'test' splits are
# exported. Rows whose 'category' is 'Mathematics' go to the math file (fw);
# every other category goes to the science file (fw2).
mydata = load_dataset('TIGER-Lab/WebInstruct-verified')
for split in ('train', 'test'):
    for item in mydata[split]:
        is_math = item['category'] == 'Mathematics'
        answer = item['answer']
        new_d = {
            "loss_mask": [0, 1],
            "topic": "数学" if is_math else "科学",
            "is_business": 0,
        }
        new_d['ref_answer'] = answer
        new_d['messages'] = [
            {"content": item['question'], "role": "user"},
            {"content": answer, "role": "assistant"},
        ]
        # Questions are deduplicated on their first 50 characters.
        dedup_key = new_d['messages'][0]['content'][:50]
        if dedup_key in udict:
            continue
        # Register the prefix so that later rows of this dataset, and any
        # other loop that consults udict, are deduplicated against it.
        # Without this registration the membership test above can never
        # match on rows coming from this dataset.
        udict[dedup_key] = 1
        out = fw if is_math else fw2
        out.write(json.dumps(new_d, ensure_ascii=False) + "\n")
# Skywork/Skywork-OR1-RL-Data: export the 'math' split to the math file (fw).
mydata = load_dataset('Skywork/Skywork-OR1-RL-Data')
# Disabled alternative output target, kept for reference:
#fw = open("/apdcephfs_gy2/share_303094202/bazzfeng/data/skywork_deepmath.jsonl", "w+")
for row in mydata['math']:
    truth = row['reward_model']['ground_truth']
    # The dataset's own prompt messages are reused and the ground truth is
    # appended to them as the assistant turn.
    # NOTE(review): loss_mask has two entries, so this presumably expects
    # 'prompt' to hold exactly one message -- confirm against the dataset.
    conversation = row['prompt']
    conversation.append({"content": truth, "role": "assistant"})
    record = {
        "loss_mask": [0, 1],
        "topic": "数学",
        "is_business": 0,
        "ref_answer": truth,
        "messages": conversation,
    }
    # Dedup on the first 50 characters of the first message.
    prefix = conversation[0]['content'][:50]
    if prefix not in udict:
        udict[prefix] = 1
        fw.write(json.dumps(record, ensure_ascii=False) + "\n")
# zwhe99/DeepMath-103K: export the 'train' split to the math file (fw), using
# 'final_answer' both as the reference answer and as the assistant turn.
mydata = load_dataset('zwhe99/DeepMath-103K')
for row in mydata['train']:
    final = row['final_answer']
    record = {
        "loss_mask": [0, 1],
        "topic": "数学",
        "is_business": 0,
        "ref_answer": final,
        "messages": [
            {"content": row["question"], "role": "user"},
            {"content": final, "role": "assistant"},
        ],
    }
    # Dedup on the first 50 characters of the question.
    prefix = record['messages'][0]['content'][:50]
    if prefix not in udict:
        udict[prefix] = 1
        fw.write(json.dumps(record, ensure_ascii=False) + "\n")
# Qwen/PolyMath: for every language config, export the four difficulty splits
# to the math file (fw). The splits are visited in the order top, high,
# medium, low for each language before moving on to the next language.
lans = ['ar', 'bn', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'pt', 'ru', 'sw', 'te', 'th', 'vi', 'zh']
for lan in lans:
    mydata = load_dataset('Qwen/PolyMath', lan)
    for level in ('top', 'high', 'medium', 'low'):
        for item in mydata[level]:
            answer = item['answer']
            new_d = {"loss_mask": [0, 1], "topic": "数学", "is_business": 0}
            new_d['ref_answer'] = answer
            new_d['messages'] = [
                {"content": item["question"], "role": "user"},
                {"content": answer, "role": "assistant"},
            ]
            # Dedup on the first 50 characters of the question.
            dedup_key = new_d['messages'][0]['content'][:50]
            if dedup_key in udict:
                continue
            udict[dedup_key] = 1
            fw.write(json.dumps(new_d, ensure_ascii=False) + "\n")
# nvidia/OpenMathReasoning: export the 'cot', 'tir' and 'genselect' splits (in
# that order) to the math file (fw). The assistant turn is the generated
# solution, while 'ref_answer' holds the expected answer.
mydata = load_dataset('nvidia/OpenMathReasoning')
for split in ('cot', 'tir', 'genselect'):
    for item in tqdm(mydata[split]):
        answer = item['generated_solution']
        new_d = {"loss_mask": [0, 1], "topic": "数学", "is_business": 0}
        new_d['ref_answer'] = item['expected_answer']
        new_d['messages'] = [
            {"content": item["problem"], "role": "user"},
            {"content": answer, "role": "assistant"},
        ]
        # Dedup on the first 50 characters of the problem, so only the first
        # row seen for a given problem prefix is written.
        dedup_key = new_d['messages'][0]['content'][:50]
        if dedup_key in udict:
            continue
        udict[dedup_key] = 1
        fw.write(json.dumps(new_d, ensure_ascii=False) + "\n")
# Disabled code: the nvidia/Nemotron-CrossThink ingestion below is wrapped in a
# bare triple-quoted string, so it is never executed.
"""
mydata = load_dataset('nvidia/Nemotron-CrossThink')
for item in mydata['train_math']:
answer = item['reward_model']['ground_truth']
new_d = {"loss_mask": [0,1], "topic": "数学", "is_business": 0}
new_d['ref_answer'] = answer
new_d['messages'] = [{"content": item["meta_data"]["question"], "role": "user"}, {"content": answer, "role": "assistant"}]
if new_d['messages'][0]['content'][:50] in udict:
continue
else:
udict[new_d['messages'][0]['content'][:50]] = 1
fw.write(json.dumps(new_d, ensure_ascii=False)+"\n")
for item in mydata['train_qa']:
answer = item['reward_model']['ground_truth']
new_d = {"loss_mask": [0,1], "topic": "科学", "is_business": 0}
new_d['ref_answer'] = answer
new_d['messages'] = [{"content": item["meta_data"]["question"], "role": "user"}, {"content": answer, "role": "assistant"}]
if new_d['messages'][0]['content'][:50] in udict:
continue
else:
udict[new_d['messages'][0]['content'][:50]] = 1
fw2.write(json.dumps(new_d, ensure_ascii=False)+"\n")
"""
# FreedomIntelligence/medical-o1-reasoning-SFT: for every language config,
# export the 'train' split to the science file (fw2). 'Response' is used both
# as the reference answer and as the assistant turn.
lans = ['en','en_mix','zh','zh_mix']
for lan in lans:
    mydata = load_dataset('FreedomIntelligence/medical-o1-reasoning-SFT', lan)
    for item in mydata['train']:
        new_d = {"loss_mask": [0, 1], "topic": "科学", "is_business": 0}
        new_d['ref_answer'] = item['Response']
        new_d['messages'] = [
            {"content": item['Question'], "role": "user"},
            {"content": item['Response'], "role": "assistant"},
        ]
        # Dedup on the first 50 characters of the question.
        dedup_key = new_d['messages'][0]['content'][:50]
        if dedup_key in udict:
            continue
        udict[dedup_key] = 1
        fw2.write(json.dumps(new_d, ensure_ascii=False) + "\n")

# This is the last ingestion step visible in the script: close both output
# files explicitly so buffered records are flushed to disk rather than relying
# on interpreter shutdown to do it.
fw.close()
fw2.close()