{ "best_global_step": 925, "best_metric": 3.4105401039123535, "best_model_checkpoint": "./qwen3moe_tinystories_sft_global_balance/checkpoint-925", "epoch": 0.9991915925626516, "eval_steps": 25, "global_step": 927, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026946914578280787, "eval_loss": 10.00758171081543, "eval_runtime": 119.3793, "eval_samples_per_second": 104.708, "eval_steps_per_second": 6.551, "step": 25 }, { "epoch": 0.05389382915656157, "eval_loss": 8.949356079101562, "eval_runtime": 118.7415, "eval_samples_per_second": 105.271, "eval_steps_per_second": 6.586, "step": 50 }, { "epoch": 0.08084074373484236, "eval_loss": 7.982501029968262, "eval_runtime": 118.833, "eval_samples_per_second": 105.19, "eval_steps_per_second": 6.581, "step": 75 }, { "epoch": 0.10778765831312315, "grad_norm": 1888245.875, "learning_rate": 4.99936149886953e-05, "loss": 8.9553, "step": 100 }, { "epoch": 0.10778765831312315, "eval_loss": 6.839597702026367, "eval_runtime": 119.3075, "eval_samples_per_second": 104.771, "eval_steps_per_second": 6.554, "step": 100 }, { "epoch": 0.13473457289140395, "eval_loss": 5.939998149871826, "eval_runtime": 118.9201, "eval_samples_per_second": 105.113, "eval_steps_per_second": 6.576, "step": 125 }, { "epoch": 0.16168148746968472, "eval_loss": 5.392858982086182, "eval_runtime": 118.6142, "eval_samples_per_second": 105.384, "eval_steps_per_second": 6.593, "step": 150 }, { "epoch": 0.18862840204796552, "eval_loss": 5.082247734069824, "eval_runtime": 118.3215, "eval_samples_per_second": 105.644, "eval_steps_per_second": 6.609, "step": 175 }, { "epoch": 0.2155753166262463, "grad_norm": 1668651.875, "learning_rate": 4.8033420018832464e-05, "loss": 5.5612, "step": 200 }, { "epoch": 0.2155753166262463, "eval_loss": 4.85417366027832, "eval_runtime": 118.337, "eval_samples_per_second": 105.631, "eval_steps_per_second": 6.608, "step": 200 }, { "epoch": 0.2425222312045271, "eval_loss": 4.635244846343994, "eval_runtime": 118.9352, "eval_samples_per_second": 105.099, "eval_steps_per_second": 6.575, "step": 225 }, { "epoch": 0.2694691457828079, "eval_loss": 4.44834566116333, "eval_runtime": 119.5901, "eval_samples_per_second": 104.524, "eval_steps_per_second": 6.539, "step": 250 }, { "epoch": 0.29641606036108864, "eval_loss": 4.313559532165527, "eval_runtime": 119.4227, "eval_samples_per_second": 104.67, "eval_steps_per_second": 6.548, "step": 275 }, { "epoch": 0.32336297493936944, "grad_norm": 1884736.25, "learning_rate": 4.28433581375477e-05, "loss": 4.4829, "step": 300 }, { "epoch": 0.32336297493936944, "eval_loss": 4.1981892585754395, "eval_runtime": 118.721, "eval_samples_per_second": 105.289, "eval_steps_per_second": 6.587, "step": 300 }, { "epoch": 0.35030988951765024, "eval_loss": 4.093081951141357, "eval_runtime": 119.1798, "eval_samples_per_second": 104.884, "eval_steps_per_second": 6.562, "step": 325 }, { "epoch": 0.37725680409593104, "eval_loss": 3.997279405593872, "eval_runtime": 118.6147, "eval_samples_per_second": 105.383, "eval_steps_per_second": 6.593, "step": 350 }, { "epoch": 0.4042037186742118, "eval_loss": 3.9050540924072266, "eval_runtime": 119.2145, "eval_samples_per_second": 104.853, "eval_steps_per_second": 6.56, "step": 375 }, { "epoch": 0.4311506332524926, "grad_norm": 1581947.375, "learning_rate": 3.515120703156264e-05, "loss": 4.0105, "step": 400 }, { "epoch": 0.4311506332524926, "eval_loss": 3.8343873023986816, "eval_runtime": 119.1988, "eval_samples_per_second": 104.867, "eval_steps_per_second": 6.56, "step": 400 }, { "epoch": 0.4580975478307734, "eval_loss": 3.7698652744293213, "eval_runtime": 119.2112, "eval_samples_per_second": 104.856, "eval_steps_per_second": 6.56, "step": 425 }, { "epoch": 0.4850444624090542, "eval_loss": 3.7166643142700195, "eval_runtime": 118.5648, "eval_samples_per_second": 105.428, "eval_steps_per_second": 6.596, "step": 450 }, { "epoch": 0.511991376987335, "eval_loss": 3.672928810119629, "eval_runtime": 118.4066, "eval_samples_per_second": 105.568, "eval_steps_per_second": 6.604, "step": 475 }, { "epoch": 0.5389382915656158, "grad_norm": 1558245.625, "learning_rate": 2.6035600456288573e-05, "loss": 3.7286, "step": 500 }, { "epoch": 0.5389382915656158, "eval_loss": 3.632558822631836, "eval_runtime": 118.4745, "eval_samples_per_second": 105.508, "eval_steps_per_second": 6.601, "step": 500 }, { "epoch": 0.5658852061438965, "eval_loss": 3.5953712463378906, "eval_runtime": 118.8897, "eval_samples_per_second": 105.139, "eval_steps_per_second": 6.578, "step": 525 }, { "epoch": 0.5928321207221773, "eval_loss": 3.5634474754333496, "eval_runtime": 118.8661, "eval_samples_per_second": 105.16, "eval_steps_per_second": 6.579, "step": 550 }, { "epoch": 0.6197790353004581, "eval_loss": 3.5363874435424805, "eval_runtime": 118.8906, "eval_samples_per_second": 105.139, "eval_steps_per_second": 6.577, "step": 575 }, { "epoch": 0.6467259498787389, "grad_norm": 2365406.75, "learning_rate": 1.677477655573303e-05, "loss": 3.5716, "step": 600 }, { "epoch": 0.6467259498787389, "eval_loss": 3.5141489505767822, "eval_runtime": 119.2494, "eval_samples_per_second": 104.822, "eval_steps_per_second": 6.558, "step": 600 }, { "epoch": 0.6736728644570197, "eval_loss": 3.4942786693573, "eval_runtime": 119.25, "eval_samples_per_second": 104.822, "eval_steps_per_second": 6.558, "step": 625 }, { "epoch": 0.7006197790353005, "eval_loss": 3.4765748977661133, "eval_runtime": 119.4591, "eval_samples_per_second": 104.638, "eval_steps_per_second": 6.546, "step": 650 }, { "epoch": 0.7275666936135813, "eval_loss": 3.4616143703460693, "eval_runtime": 119.3357, "eval_samples_per_second": 104.746, "eval_steps_per_second": 6.553, "step": 675 }, { "epoch": 0.7545136081918621, "grad_norm": 1150363.875, "learning_rate": 8.667336608579487e-06, "loss": 3.4819, "step": 700 }, { "epoch": 0.7545136081918621, "eval_loss": 3.44948410987854, "eval_runtime": 119.0441, "eval_samples_per_second": 105.003, "eval_steps_per_second": 6.569, "step": 700 }, { "epoch": 0.7814605227701428, "eval_loss": 3.4393043518066406, "eval_runtime": 119.2449, "eval_samples_per_second": 104.826, "eval_steps_per_second": 6.558, "step": 725 }, { "epoch": 0.8084074373484236, "eval_loss": 3.4311933517456055, "eval_runtime": 119.1259, "eval_samples_per_second": 104.931, "eval_steps_per_second": 6.564, "step": 750 }, { "epoch": 0.8353543519267044, "eval_loss": 3.4240500926971436, "eval_runtime": 119.3094, "eval_samples_per_second": 104.77, "eval_steps_per_second": 6.554, "step": 775 }, { "epoch": 0.8623012665049852, "grad_norm": 915389.0, "learning_rate": 2.8501483487659216e-06, "loss": 3.441, "step": 800 }, { "epoch": 0.8623012665049852, "eval_loss": 3.4186859130859375, "eval_runtime": 119.0028, "eval_samples_per_second": 105.04, "eval_steps_per_second": 6.571, "step": 800 }, { "epoch": 0.889248181083266, "eval_loss": 3.4150524139404297, "eval_runtime": 119.1835, "eval_samples_per_second": 104.88, "eval_steps_per_second": 6.561, "step": 825 }, { "epoch": 0.9161950956615468, "eval_loss": 3.4126269817352295, "eval_runtime": 118.5462, "eval_samples_per_second": 105.444, "eval_steps_per_second": 6.597, "step": 850 }, { "epoch": 0.9431420102398276, "eval_loss": 3.411245107650757, "eval_runtime": 118.0843, "eval_samples_per_second": 105.857, "eval_steps_per_second": 6.622, "step": 875 }, { "epoch": 0.9700889248181084, "grad_norm": 827701.4375, "learning_rate": 1.38928411621439e-07, "loss": 3.4196, "step": 900 }, { "epoch": 0.9700889248181084, "eval_loss": 3.410634994506836, "eval_runtime": 118.1249, "eval_samples_per_second": 105.82, "eval_steps_per_second": 6.62, "step": 900 }, { "epoch": 0.9970358393963891, "eval_loss": 3.4105401039123535, "eval_runtime": 118.8199, "eval_samples_per_second": 105.201, "eval_steps_per_second": 6.581, "step": 925 } ], "logging_steps": 100, "max_steps": 927, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.071708034936832e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }