|
import atexit
import json

import datasets
from datasets import load_dataset
from tqdm import tqdm
|
# Output sinks: one JSONL file for math records (fw) and one for science
# records (fw2). Records are written with ensure_ascii=False and contain
# non-ASCII text, so the encoding is pinned to UTF-8 instead of depending on
# the platform locale.
fw = open("/apdcephfs_gy2/share_303094202/bazzfeng/data/math_sft_bigbig.jsonl", "w+", encoding="utf-8")
fw2 = open("/apdcephfs_gy2/share_303094202/bazzfeng/data/science_sft_bigbig.jsonl", "w+", encoding="utf-8")

# Both handles stay open for the whole script; make sure buffered data is
# flushed and the files are closed when the interpreter exits.
atexit.register(fw.close)
atexit.register(fw2.close)

# Dedup registry: keys are the first 50 characters of a user prompt that has
# already been written; the value is a dummy marker.
udict = {}
|
|
|
# TIGER-Lab/WebInstruct-verified: examples whose category is 'Mathematics' go
# to the math file, everything else goes to the science file. Prompts are
# deduplicated on their first 50 characters via `udict`.
mydata = load_dataset('TIGER-Lab/WebInstruct-verified')
for split in ('train', 'test'):
    for item in mydata[split]:
        is_math = item['category'] == 'Mathematics'
        answer = item['answer']
        new_d = {
            "loss_mask": [0, 1],
            "topic": "数学" if is_math else "科学",
            "is_business": 0,
            "ref_answer": answer,
            "messages": [
                {"content": item['question'], "role": "user"},
                {"content": answer, "role": "assistant"},
            ],
        }
        key = new_d['messages'][0]['content'][:50]
        if key in udict:
            continue
        # Register the prompt. Previously this block only *checked* udict and
        # never added to it, so the check could not fire for this dataset and
        # later datasets were not deduplicated against it.
        udict[key] = 1
        out = fw if is_math else fw2
        out.write(json.dumps(new_d, ensure_ascii=False) + "\n")
|
|
|
# Skywork/Skywork-OR1-RL-Data ('math' split): reuse the dataset's own prompt
# messages and append the ground truth as the assistant turn. All records go
# to the math file, deduplicated on the first 50 characters of the first
# prompt message.
mydata = load_dataset('Skywork/Skywork-OR1-RL-Data')
for item in mydata['math']:
    answer = item['reward_model']['ground_truth']
    new_d = {
        "loss_mask": [0, 1],
        "topic": "数学",
        "is_business": 0,
        "ref_answer": answer,
        "messages": [*item['prompt'], {"content": answer, "role": "assistant"}],
    }
    key = new_d['messages'][0]['content'][:50]
    if key not in udict:
        udict[key] = 1
        fw.write(json.dumps(new_d, ensure_ascii=False) + "\n")
|
|
|
# zwhe99/DeepMath-103K ('train' split): question -> final_answer pairs, written
# to the math file and deduplicated on the first 50 characters of the question.
mydata = load_dataset('zwhe99/DeepMath-103K')
for item in mydata['train']:
    answer = item['final_answer']
    new_d = {
        "loss_mask": [0, 1],
        "topic": "数学",
        "is_business": 0,
        "ref_answer": answer,
        "messages": [
            {"content": item["question"], "role": "user"},
            {"content": answer, "role": "assistant"},
        ],
    }
    key = new_d['messages'][0]['content'][:50]
    if key not in udict:
        udict[key] = 1
        fw.write(json.dumps(new_d, ensure_ascii=False) + "\n")
|
|
|
|
|
|
|
|
|
# Qwen/PolyMath: for every language config, walk the four splits in the fixed
# order top -> high -> medium -> low and write question/answer pairs to the
# math file. The order matters because deduplication (first 50 characters of
# the question) keeps whichever occurrence is seen first.
lans = ['ar', 'bn', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'pt', 'ru', 'sw', 'te', 'th', 'vi', 'zh']
for lan in lans:
    mydata = load_dataset('Qwen/PolyMath', lan)
    for split in ('top', 'high', 'medium', 'low'):
        for item in mydata[split]:
            answer = item['answer']
            new_d = {
                "loss_mask": [0, 1],
                "topic": "数学",
                "is_business": 0,
                "ref_answer": answer,
                "messages": [
                    {"content": item["question"], "role": "user"},
                    {"content": answer, "role": "assistant"},
                ],
            }
            key = new_d['messages'][0]['content'][:50]
            if key not in udict:
                udict[key] = 1
                fw.write(json.dumps(new_d, ensure_ascii=False) + "\n")
|
|
|
|
|
# nvidia/OpenMathReasoning: walk the 'cot', 'tir' and 'genselect' splits in
# that order. The assistant turn is the generated solution, while ref_answer
# holds the expected answer. Records go to the math file, deduplicated on the
# first 50 characters of the problem.
mydata = load_dataset('nvidia/OpenMathReasoning')
for split in ('cot', 'tir', 'genselect'):
    for item in tqdm(mydata[split]):
        answer = item['generated_solution']
        new_d = {
            "loss_mask": [0, 1],
            "topic": "数学",
            "is_business": 0,
            "ref_answer": item['expected_answer'],
            "messages": [
                {"content": item["problem"], "role": "user"},
                {"content": answer, "role": "assistant"},
            ],
        }
        key = new_d['messages'][0]['content'][:50]
        if key not in udict:
            udict[key] = 1
            fw.write(json.dumps(new_d, ensure_ascii=False) + "\n")
|
|
|
|

# Disabled: ingestion of nvidia/Nemotron-CrossThink. The code is wrapped in a
# bare string literal, so it is never executed.
"""
mydata = load_dataset('nvidia/Nemotron-CrossThink')
for item in mydata['train_math']:
    answer = item['reward_model']['ground_truth']
    new_d = {"loss_mask": [0,1], "topic": "数学", "is_business": 0}
    new_d['ref_answer'] = answer
    new_d['messages'] = [{"content": item["meta_data"]["question"], "role": "user"}, {"content": answer, "role": "assistant"}]
    if new_d['messages'][0]['content'][:50] in udict:
        continue
    else:
        udict[new_d['messages'][0]['content'][:50]] = 1
        fw.write(json.dumps(new_d, ensure_ascii=False)+"\n")
for item in mydata['train_qa']:
    answer = item['reward_model']['ground_truth']
    new_d = {"loss_mask": [0,1], "topic": "科学", "is_business": 0}
    new_d['ref_answer'] = answer
    new_d['messages'] = [{"content": item["meta_data"]["question"], "role": "user"}, {"content": answer, "role": "assistant"}]
    if new_d['messages'][0]['content'][:50] in udict:
        continue
    else:
        udict[new_d['messages'][0]['content'][:50]] = 1
        fw2.write(json.dumps(new_d, ensure_ascii=False)+"\n")
"""
|
|
|
# FreedomIntelligence/medical-o1-reasoning-SFT: for each config, write
# Question/Response pairs from the 'train' split to the science file,
# deduplicated on the first 50 characters of the question.
lans = ['en', 'en_mix', 'zh', 'zh_mix']
for lan in lans:
    mydata = load_dataset('FreedomIntelligence/medical-o1-reasoning-SFT', lan)
    for item in mydata['train']:
        new_d = {
            "loss_mask": [0, 1],
            "topic": "科学",
            "is_business": 0,
            "ref_answer": item['Response'],
            "messages": [
                {"content": item['Question'], "role": "user"},
                {"content": item['Response'], "role": "assistant"},
            ],
        }
        key = new_d['messages'][0]['content'][:50]
        if key not in udict:
            udict[key] = 1
            fw2.write(json.dumps(new_d, ensure_ascii=False) + "\n")
|
|
|
|
|
|
|
|