{ "best_global_step": 700, "best_metric": 5.510611534118652, "best_model_checkpoint": "./qwen3moe_tinystories_sft/checkpoint-700", "epoch": 0.9996631862579993, "eval_steps": 100, "global_step": 742, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13472549680026946, "grad_norm": 58221.64453125, "learning_rate": 3.3221476510067115e-05, "loss": 5.6966, "step": 100 }, { "epoch": 0.13472549680026946, "eval_loss": 10.694306373596191, "eval_runtime": 119.9061, "eval_samples_per_second": 41.699, "eval_steps_per_second": 2.61, "step": 100 }, { "epoch": 0.2694509936005389, "grad_norm": 53346.890625, "learning_rate": 4.5784148397976396e-05, "loss": 4.7677, "step": 200 }, { "epoch": 0.2694509936005389, "eval_loss": 8.538475036621094, "eval_runtime": 118.4911, "eval_samples_per_second": 42.197, "eval_steps_per_second": 2.642, "step": 200 }, { "epoch": 0.40417649040080833, "grad_norm": 60593.984375, "learning_rate": 3.735244519392918e-05, "loss": 3.8182, "step": 300 }, { "epoch": 0.40417649040080833, "eval_loss": 6.992630481719971, "eval_runtime": 122.3318, "eval_samples_per_second": 40.872, "eval_steps_per_second": 2.559, "step": 300 }, { "epoch": 0.5389019872010778, "grad_norm": 45884.55078125, "learning_rate": 2.8920741989881955e-05, "loss": 3.268, "step": 400 }, { "epoch": 0.5389019872010778, "eval_loss": 6.24953031539917, "eval_runtime": 122.2793, "eval_samples_per_second": 40.89, "eval_steps_per_second": 2.56, "step": 400 }, { "epoch": 0.6736274840013473, "grad_norm": 37802.12890625, "learning_rate": 2.048903878583474e-05, "loss": 2.9965, "step": 500 }, { "epoch": 0.6736274840013473, "eval_loss": 5.843188762664795, "eval_runtime": 121.956, "eval_samples_per_second": 40.998, "eval_steps_per_second": 2.567, "step": 500 }, { "epoch": 0.8083529808016167, "grad_norm": 34398.84375, "learning_rate": 1.205733558178752e-05, "loss": 2.8499, "step": 600 }, { "epoch": 0.8083529808016167, "eval_loss": 5.611515045166016, "eval_runtime": 121.7403, "eval_samples_per_second": 41.071, "eval_steps_per_second": 2.571, "step": 600 }, { "epoch": 0.9430784776018861, "grad_norm": 29294.10546875, "learning_rate": 3.625632377740304e-06, "loss": 2.7606, "step": 700 }, { "epoch": 0.9430784776018861, "eval_loss": 5.510611534118652, "eval_runtime": 121.6665, "eval_samples_per_second": 41.096, "eval_steps_per_second": 2.573, "step": 700 } ], "logging_steps": 100, "max_steps": 742, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0700859494762496e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }