{
"model_name": "Qwen/Qwen2.5-3B",
"dataset_name": "nvidia/OpenMathInstruct-2",
"max_length": 2048,
"max_samples": 100000,
"batch_size": 3,
"gradient_accumulation_steps": 16,
"learning_rate": 2e-05,
"num_epochs": 3,
"warmup_steps": 200,
"weight_decay": 0.01,
"seed": 42,
"max_checkpoints": 3,
"save_steps": 10000,
"eval_steps": 10000,
"output_dir": "./qwen_math_fp8_model",
"fp8_backend": "msamp",
"msamp_opt_level": "O2",
"te_fp8_format": "HYBRID",
"te_amax_history_len": 32,
"te_amax_compute_algo": "max",
"use_generated_solution": true,
"solution_field": "generated_solution",
"use_wandb": true,
"wandb_project": "qwen-math-fp8",
"wandb_entity": null,
"wandb_run_name": null,
"wandb_tags": [
"fp8",
"qwen",
"math"
],
"wandb_notes": "",
"wandb_resume": false,
"wandb_watch_model": false,
"wandb_watch_freq": 1000,
"wandb_log_freq": 10,
"wandb_log_model": false
}