"""Build math and science SFT JSONL files from several Hugging Face datasets.

Every source example is converted to one JSON object per line with the keys
``loss_mask``, ``topic``, ``is_business``, ``ref_answer`` and ``messages``.
Math examples go to ``MATH_PATH`` and science examples go to ``SCIENCE_PATH``.

Examples are de-duplicated across *all* sources and both output files, keyed
on the first ``DEDUP_PREFIX_LEN`` characters of the first message's content.
"""
import json

import datasets
from datasets import load_dataset
from tqdm import tqdm

MATH_PATH = "/apdcephfs_gy2/share_303094202/bazzfeng/data/math_sft_bigbig.jsonl"
SCIENCE_PATH = "/apdcephfs_gy2/share_303094202/bazzfeng/data/science_sft_bigbig.jsonl"

MATH_TOPIC = "数学"
SCIENCE_TOPIC = "科学"

# Number of leading characters of the first message used as the dedup key.
DEDUP_PREFIX_LEN = 50

POLYMATH_LANGS = [
    'ar', 'bn', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja',
    'ko', 'ms', 'pt', 'ru', 'sw', 'te', 'th', 'vi', 'zh',
]
POLYMATH_SPLITS = ['top', 'high', 'medium', 'low']
OPENMATH_SPLITS = ['cot', 'tir', 'genselect']
MEDICAL_CONFIGS = ['en', 'en_mix', 'zh', 'zh_mix']


def _qa_messages(question, answer):
    """Return a two-turn chat: the user question and the assistant answer."""
    return [
        {"content": question, "role": "user"},
        {"content": answer, "role": "assistant"},
    ]


def _build_record(topic, ref_answer, messages):
    """Return one output record; key order matches the emitted JSON layout."""
    return {
        "loss_mask": [0, 1],
        "topic": topic,
        "is_business": 0,
        "ref_answer": ref_answer,
        "messages": messages,
    }


def _emit(record, out, seen):
    """Write *record* to *out* as one JSON line unless its key was seen.

    The key is the first ``DEDUP_PREFIX_LEN`` characters of the first
    message's content. New keys are added to *seen*.

    Returns:
        True if the record was written, False if it was skipped as a duplicate.
    """
    key = record["messages"][0]["content"][:DEDUP_PREFIX_LEN]
    if key in seen:
        return False
    seen.add(key)
    out.write(json.dumps(record, ensure_ascii=False) + "\n")
    return True


def _write_webinstruct(math_out, science_out, seen):
    """Route WebInstruct-verified train/test items by their ``category``."""
    data = load_dataset('TIGER-Lab/WebInstruct-verified')
    for split in ('train', 'test'):
        for item in data[split]:
            is_math = item['category'] == 'Mathematics'
            answer = item['answer']
            record = _build_record(
                MATH_TOPIC if is_math else SCIENCE_TOPIC,
                answer,
                _qa_messages(item['question'], answer),
            )
            # The original loops checked the dedup table but never recorded
            # the key, so the check could never fire; _emit records it.
            _emit(record, math_out if is_math else science_out, seen)


def _write_skywork(math_out, seen):
    """Append the ground-truth answer to each Skywork-OR1 math prompt."""
    data = load_dataset('Skywork/Skywork-OR1-RL-Data')
    for item in data['math']:
        answer = item['reward_model']['ground_truth']
        # Assumes item['prompt'] is a list of chat-message dicts whose first
        # entry has a 'content' string -- TODO confirm against the dataset.
        # Copy so the dataset row itself is not mutated.
        messages = list(item['prompt'])
        messages.append({"content": answer, "role": "assistant"})
        _emit(_build_record(MATH_TOPIC, answer, messages), math_out, seen)


def _write_deepmath(math_out, seen):
    """Write DeepMath-103K train items (question -> final_answer)."""
    data = load_dataset('zwhe99/DeepMath-103K')
    for item in data['train']:
        answer = item['final_answer']
        record = _build_record(
            MATH_TOPIC, answer, _qa_messages(item["question"], answer)
        )
        _emit(record, math_out, seen)


def _write_polymath(math_out, seen):
    """Write every PolyMath language config and difficulty split."""
    for lang in POLYMATH_LANGS:
        data = load_dataset('Qwen/PolyMath', lang)
        for split in POLYMATH_SPLITS:
            for item in data[split]:
                answer = item['answer']
                record = _build_record(
                    MATH_TOPIC, answer, _qa_messages(item["question"], answer)
                )
                _emit(record, math_out, seen)


def _write_openmathreasoning(math_out, seen):
    """Write OpenMathReasoning items.

    The assistant turn is the generated solution, while ``ref_answer`` holds
    the expected answer.
    """
    data = load_dataset('nvidia/OpenMathReasoning')
    for split in OPENMATH_SPLITS:
        for item in tqdm(data[split]):
            record = _build_record(
                MATH_TOPIC,
                item['expected_answer'],
                _qa_messages(item["problem"], item['generated_solution']),
            )
            _emit(record, math_out, seen)


def _write_medical(science_out, seen):
    """Write medical-o1-reasoning-SFT train items as science examples."""
    for config in MEDICAL_CONFIGS:
        data = load_dataset('FreedomIntelligence/medical-o1-reasoning-SFT', config)
        for item in data['train']:
            response = item['Response']
            record = _build_record(
                SCIENCE_TOPIC, response, _qa_messages(item['Question'], response)
            )
            _emit(record, science_out, seen)


def main():
    """Download every source dataset and write the two JSONL output files."""
    # One table shared by all sources and both files, as in the original.
    seen = set()

    # Explicit UTF-8: records are dumped with ensure_ascii=False and contain
    # non-ASCII text, so the locale default encoding must not be relied on.
    # The context manager guarantees both files are flushed and closed.
    with open(MATH_PATH, "w", encoding="utf-8") as math_out, \
            open(SCIENCE_PATH, "w", encoding="utf-8") as science_out:
        _write_webinstruct(math_out, science_out, seen)
        _write_skywork(math_out, seen)
        _write_deepmath(math_out, seen)
        _write_polymath(math_out, seen)
        _write_openmathreasoning(math_out, seen)

        # NOTE: 'nvidia/Nemotron-CrossThink' is intentionally disabled (it was
        # disabled in the original script). Its mapping was:
        #   split 'train_math' -> math file, split 'train_qa' -> science file
        #   question = item["meta_data"]["question"]
        #   answer   = item['reward_model']['ground_truth']
        #   (answer used both as ref_answer and as the assistant turn)

        _write_medical(science_out, seen)


if __name__ == "__main__":
    main()