|
{ |
|
"best_global_step": 925, |
|
"best_metric": 3.4105401039123535, |
|
"best_model_checkpoint": "./qwen3moe_tinystories_sft_global_balance/checkpoint-925", |
|
"epoch": 0.9991915925626516, |
|
"eval_steps": 25, |
|
"global_step": 927, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.026946914578280787, |
|
"eval_loss": 10.00758171081543, |
|
"eval_runtime": 119.3793, |
|
"eval_samples_per_second": 104.708, |
|
"eval_steps_per_second": 6.551, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.05389382915656157, |
|
"eval_loss": 8.949356079101562, |
|
"eval_runtime": 118.7415, |
|
"eval_samples_per_second": 105.271, |
|
"eval_steps_per_second": 6.586, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08084074373484236, |
|
"eval_loss": 7.982501029968262, |
|
"eval_runtime": 118.833, |
|
"eval_samples_per_second": 105.19, |
|
"eval_steps_per_second": 6.581, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.10778765831312315, |
|
"grad_norm": 1888245.875, |
|
"learning_rate": 4.99936149886953e-05, |
|
"loss": 8.9553, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10778765831312315, |
|
"eval_loss": 6.839597702026367, |
|
"eval_runtime": 119.3075, |
|
"eval_samples_per_second": 104.771, |
|
"eval_steps_per_second": 6.554, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.13473457289140395, |
|
"eval_loss": 5.939998149871826, |
|
"eval_runtime": 118.9201, |
|
"eval_samples_per_second": 105.113, |
|
"eval_steps_per_second": 6.576, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.16168148746968472, |
|
"eval_loss": 5.392858982086182, |
|
"eval_runtime": 118.6142, |
|
"eval_samples_per_second": 105.384, |
|
"eval_steps_per_second": 6.593, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.18862840204796552, |
|
"eval_loss": 5.082247734069824, |
|
"eval_runtime": 118.3215, |
|
"eval_samples_per_second": 105.644, |
|
"eval_steps_per_second": 6.609, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.2155753166262463, |
|
"grad_norm": 1668651.875, |
|
"learning_rate": 4.8033420018832464e-05, |
|
"loss": 5.5612, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2155753166262463, |
|
"eval_loss": 4.85417366027832, |
|
"eval_runtime": 118.337, |
|
"eval_samples_per_second": 105.631, |
|
"eval_steps_per_second": 6.608, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2425222312045271, |
|
"eval_loss": 4.635244846343994, |
|
"eval_runtime": 118.9352, |
|
"eval_samples_per_second": 105.099, |
|
"eval_steps_per_second": 6.575, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2694691457828079, |
|
"eval_loss": 4.44834566116333, |
|
"eval_runtime": 119.5901, |
|
"eval_samples_per_second": 104.524, |
|
"eval_steps_per_second": 6.539, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.29641606036108864, |
|
"eval_loss": 4.313559532165527, |
|
"eval_runtime": 119.4227, |
|
"eval_samples_per_second": 104.67, |
|
"eval_steps_per_second": 6.548, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.32336297493936944, |
|
"grad_norm": 1884736.25, |
|
"learning_rate": 4.28433581375477e-05, |
|
"loss": 4.4829, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.32336297493936944, |
|
"eval_loss": 4.1981892585754395, |
|
"eval_runtime": 118.721, |
|
"eval_samples_per_second": 105.289, |
|
"eval_steps_per_second": 6.587, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.35030988951765024, |
|
"eval_loss": 4.093081951141357, |
|
"eval_runtime": 119.1798, |
|
"eval_samples_per_second": 104.884, |
|
"eval_steps_per_second": 6.562, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.37725680409593104, |
|
"eval_loss": 3.997279405593872, |
|
"eval_runtime": 118.6147, |
|
"eval_samples_per_second": 105.383, |
|
"eval_steps_per_second": 6.593, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4042037186742118, |
|
"eval_loss": 3.9050540924072266, |
|
"eval_runtime": 119.2145, |
|
"eval_samples_per_second": 104.853, |
|
"eval_steps_per_second": 6.56, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.4311506332524926, |
|
"grad_norm": 1581947.375, |
|
"learning_rate": 3.515120703156264e-05, |
|
"loss": 4.0105, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4311506332524926, |
|
"eval_loss": 3.8343873023986816, |
|
"eval_runtime": 119.1988, |
|
"eval_samples_per_second": 104.867, |
|
"eval_steps_per_second": 6.56, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4580975478307734, |
|
"eval_loss": 3.7698652744293213, |
|
"eval_runtime": 119.2112, |
|
"eval_samples_per_second": 104.856, |
|
"eval_steps_per_second": 6.56, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.4850444624090542, |
|
"eval_loss": 3.7166643142700195, |
|
"eval_runtime": 118.5648, |
|
"eval_samples_per_second": 105.428, |
|
"eval_steps_per_second": 6.596, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.511991376987335, |
|
"eval_loss": 3.672928810119629, |
|
"eval_runtime": 118.4066, |
|
"eval_samples_per_second": 105.568, |
|
"eval_steps_per_second": 6.604, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5389382915656158, |
|
"grad_norm": 1558245.625, |
|
"learning_rate": 2.6035600456288573e-05, |
|
"loss": 3.7286, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5389382915656158, |
|
"eval_loss": 3.632558822631836, |
|
"eval_runtime": 118.4745, |
|
"eval_samples_per_second": 105.508, |
|
"eval_steps_per_second": 6.601, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5658852061438965, |
|
"eval_loss": 3.5953712463378906, |
|
"eval_runtime": 118.8897, |
|
"eval_samples_per_second": 105.139, |
|
"eval_steps_per_second": 6.578, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5928321207221773, |
|
"eval_loss": 3.5634474754333496, |
|
"eval_runtime": 118.8661, |
|
"eval_samples_per_second": 105.16, |
|
"eval_steps_per_second": 6.579, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6197790353004581, |
|
"eval_loss": 3.5363874435424805, |
|
"eval_runtime": 118.8906, |
|
"eval_samples_per_second": 105.139, |
|
"eval_steps_per_second": 6.577, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.6467259498787389, |
|
"grad_norm": 2365406.75, |
|
"learning_rate": 1.677477655573303e-05, |
|
"loss": 3.5716, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6467259498787389, |
|
"eval_loss": 3.5141489505767822, |
|
"eval_runtime": 119.2494, |
|
"eval_samples_per_second": 104.822, |
|
"eval_steps_per_second": 6.558, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6736728644570197, |
|
"eval_loss": 3.4942786693573, |
|
"eval_runtime": 119.25, |
|
"eval_samples_per_second": 104.822, |
|
"eval_steps_per_second": 6.558, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.7006197790353005, |
|
"eval_loss": 3.4765748977661133, |
|
"eval_runtime": 119.4591, |
|
"eval_samples_per_second": 104.638, |
|
"eval_steps_per_second": 6.546, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7275666936135813, |
|
"eval_loss": 3.4616143703460693, |
|
"eval_runtime": 119.3357, |
|
"eval_samples_per_second": 104.746, |
|
"eval_steps_per_second": 6.553, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.7545136081918621, |
|
"grad_norm": 1150363.875, |
|
"learning_rate": 8.667336608579487e-06, |
|
"loss": 3.4819, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7545136081918621, |
|
"eval_loss": 3.44948410987854, |
|
"eval_runtime": 119.0441, |
|
"eval_samples_per_second": 105.003, |
|
"eval_steps_per_second": 6.569, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7814605227701428, |
|
"eval_loss": 3.4393043518066406, |
|
"eval_runtime": 119.2449, |
|
"eval_samples_per_second": 104.826, |
|
"eval_steps_per_second": 6.558, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.8084074373484236, |
|
"eval_loss": 3.4311933517456055, |
|
"eval_runtime": 119.1259, |
|
"eval_samples_per_second": 104.931, |
|
"eval_steps_per_second": 6.564, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8353543519267044, |
|
"eval_loss": 3.4240500926971436, |
|
"eval_runtime": 119.3094, |
|
"eval_samples_per_second": 104.77, |
|
"eval_steps_per_second": 6.554, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.8623012665049852, |
|
"grad_norm": 915389.0, |
|
"learning_rate": 2.8501483487659216e-06, |
|
"loss": 3.441, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8623012665049852, |
|
"eval_loss": 3.4186859130859375, |
|
"eval_runtime": 119.0028, |
|
"eval_samples_per_second": 105.04, |
|
"eval_steps_per_second": 6.571, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.889248181083266, |
|
"eval_loss": 3.4150524139404297, |
|
"eval_runtime": 119.1835, |
|
"eval_samples_per_second": 104.88, |
|
"eval_steps_per_second": 6.561, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.9161950956615468, |
|
"eval_loss": 3.4126269817352295, |
|
"eval_runtime": 118.5462, |
|
"eval_samples_per_second": 105.444, |
|
"eval_steps_per_second": 6.597, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9431420102398276, |
|
"eval_loss": 3.411245107650757, |
|
"eval_runtime": 118.0843, |
|
"eval_samples_per_second": 105.857, |
|
"eval_steps_per_second": 6.622, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.9700889248181084, |
|
"grad_norm": 827701.4375, |
|
"learning_rate": 1.38928411621439e-07, |
|
"loss": 3.4196, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9700889248181084, |
|
"eval_loss": 3.410634994506836, |
|
"eval_runtime": 118.1249, |
|
"eval_samples_per_second": 105.82, |
|
"eval_steps_per_second": 6.62, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9970358393963891, |
|
"eval_loss": 3.4105401039123535, |
|
"eval_runtime": 118.8199, |
|
"eval_samples_per_second": 105.201, |
|
"eval_steps_per_second": 6.581, |
|
"step": 925 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 927, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.071708034936832e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|