{
  "best_global_step": 200,
  "best_metric": 0.27645987272262573,
  "best_model_checkpoint": "./model_weights/llama32-1b-re-kpath-baseline/checkpoint-200",
  "epoch": 0.2222222222222222,
  "eval_steps": 25,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005555555555555556,
      "grad_norm": 2.139066219329834,
      "learning_rate": 8.88888888888889e-06,
      "loss": 2.687,
      "step": 5
    },
    {
      "epoch": 0.011111111111111112,
      "grad_norm": 1.8011914491653442,
      "learning_rate": 2e-05,
      "loss": 2.8564,
      "step": 10
    },
    {
      "epoch": 0.016666666666666666,
      "grad_norm": 1.9902386665344238,
      "learning_rate": 3.111111111111111e-05,
      "loss": 2.8579,
      "step": 15
    },
    {
      "epoch": 0.022222222222222223,
      "grad_norm": 2.4330344200134277,
      "learning_rate": 4.222222222222222e-05,
      "loss": 2.7422,
      "step": 20
    },
    {
      "epoch": 0.027777777777777776,
      "grad_norm": 3.047966957092285,
      "learning_rate": 5.333333333333333e-05,
      "loss": 2.2828,
      "step": 25
    },
    {
      "epoch": 0.027777777777777776,
      "eval_loss": 2.117645263671875,
      "eval_runtime": 7.2642,
      "eval_samples_per_second": 27.532,
      "eval_steps_per_second": 27.532,
      "step": 25
    },
    {
      "epoch": 0.03333333333333333,
      "grad_norm": 2.9378368854522705,
      "learning_rate": 6.444444444444446e-05,
      "loss": 1.8589,
      "step": 30
    },
    {
      "epoch": 0.03888888888888889,
      "grad_norm": 3.6299028396606445,
      "learning_rate": 7.555555555555556e-05,
      "loss": 1.3353,
      "step": 35
    },
    {
      "epoch": 0.044444444444444446,
      "grad_norm": 2.8062655925750732,
      "learning_rate": 8.666666666666667e-05,
      "loss": 0.9788,
      "step": 40
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.1027402877807617,
      "learning_rate": 9.777777777777778e-05,
      "loss": 0.6529,
      "step": 45
    },
    {
      "epoch": 0.05555555555555555,
      "grad_norm": 4.867608547210693,
      "learning_rate": 0.00010888888888888889,
      "loss": 0.6028,
      "step": 50
    },
    {
      "epoch": 0.05555555555555555,
      "eval_loss": 0.5804749131202698,
      "eval_runtime": 7.2598,
      "eval_samples_per_second": 27.549,
      "eval_steps_per_second": 27.549,
      "step": 50
    },
    {
      "epoch": 0.06111111111111111,
      "grad_norm": 3.358004331588745,
      "learning_rate": 0.00012,
      "loss": 0.6569,
      "step": 55
    },
    {
      "epoch": 0.06666666666666667,
      "grad_norm": 3.5663840770721436,
      "learning_rate": 0.00013111111111111111,
      "loss": 0.4684,
      "step": 60
    },
    {
      "epoch": 0.07222222222222222,
      "grad_norm": 4.208554267883301,
      "learning_rate": 0.00014222222222222224,
      "loss": 0.5365,
      "step": 65
    },
    {
      "epoch": 0.07777777777777778,
      "grad_norm": 1.8781005144119263,
      "learning_rate": 0.00015333333333333334,
      "loss": 0.4501,
      "step": 70
    },
    {
      "epoch": 0.08333333333333333,
      "grad_norm": 1.7693791389465332,
      "learning_rate": 0.00016444444444444444,
      "loss": 0.414,
      "step": 75
    },
    {
      "epoch": 0.08333333333333333,
      "eval_loss": 0.42754867672920227,
      "eval_runtime": 7.3627,
      "eval_samples_per_second": 27.164,
      "eval_steps_per_second": 27.164,
      "step": 75
    },
    {
      "epoch": 0.08888888888888889,
      "grad_norm": 1.6324015855789185,
      "learning_rate": 0.00017555555555555556,
      "loss": 0.4589,
      "step": 80
    },
    {
      "epoch": 0.09444444444444444,
      "grad_norm": 1.6526750326156616,
      "learning_rate": 0.0001866666666666667,
      "loss": 0.3764,
      "step": 85
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.409635066986084,
      "learning_rate": 0.00019777777777777778,
      "loss": 0.4764,
      "step": 90
    },
    {
      "epoch": 0.10555555555555556,
      "grad_norm": 1.594348669052124,
      "learning_rate": 0.0001999972998023366,
      "loss": 0.3086,
      "step": 95
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 2.8289716243743896,
      "learning_rate": 0.0001999863304992469,
      "loss": 0.3952,
      "step": 100
    },
    {
      "epoch": 0.1111111111111111,
      "eval_loss": 0.34535014629364014,
      "eval_runtime": 7.6969,
      "eval_samples_per_second": 25.984,
      "eval_steps_per_second": 25.984,
      "step": 100
    },
    {
      "epoch": 0.11666666666666667,
      "grad_norm": 1.6983684301376343,
      "learning_rate": 0.00019996692425326533,
      "loss": 0.4116,
      "step": 105
    },
    {
      "epoch": 0.12222222222222222,
      "grad_norm": 1.1369361877441406,
      "learning_rate": 0.0001999390827019096,
      "loss": 0.3407,
      "step": 110
    },
    {
      "epoch": 0.12777777777777777,
      "grad_norm": 1.4411648511886597,
      "learning_rate": 0.0001999028081944766,
      "loss": 0.318,
      "step": 115
    },
    {
      "epoch": 0.13333333333333333,
      "grad_norm": 1.4671945571899414,
      "learning_rate": 0.00019985810379184426,
      "loss": 0.2887,
      "step": 120
    },
    {
      "epoch": 0.1388888888888889,
      "grad_norm": 1.9020946025848389,
      "learning_rate": 0.00019980497326621316,
      "loss": 0.4495,
      "step": 125
    },
    {
      "epoch": 0.1388888888888889,
      "eval_loss": 0.2982136905193329,
      "eval_runtime": 7.4623,
      "eval_samples_per_second": 26.801,
      "eval_steps_per_second": 26.801,
      "step": 125
    },
    {
      "epoch": 0.14444444444444443,
      "grad_norm": 2.5713489055633545,
      "learning_rate": 0.00019974342110078817,
      "loss": 0.3463,
      "step": 130
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.6249582767486572,
      "learning_rate": 0.00019967345248940034,
      "loss": 0.3235,
      "step": 135
    },
    {
      "epoch": 0.15555555555555556,
      "grad_norm": 0.5932977199554443,
      "learning_rate": 0.00019959507333606853,
      "loss": 0.3018,
      "step": 140
    },
    {
      "epoch": 0.16111111111111112,
      "grad_norm": 1.2969094514846802,
      "learning_rate": 0.00019950829025450114,
      "loss": 0.2389,
      "step": 145
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 2.421125888824463,
      "learning_rate": 0.00019941311056753826,
      "loss": 0.3315,
      "step": 150
    },
    {
      "epoch": 0.16666666666666666,
      "eval_loss": 0.2966170310974121,
      "eval_runtime": 7.463,
      "eval_samples_per_second": 26.799,
      "eval_steps_per_second": 26.799,
      "step": 150
    },
    {
      "epoch": 0.17222222222222222,
      "grad_norm": 2.2616477012634277,
      "learning_rate": 0.00019930954230653355,
      "loss": 0.3305,
      "step": 155
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 2.539074659347534,
      "learning_rate": 0.0001991975942106767,
      "loss": 0.351,
      "step": 160
    },
    {
      "epoch": 0.18333333333333332,
      "grad_norm": 1.5154845714569092,
      "learning_rate": 0.0001990772757262558,
      "loss": 0.2465,
      "step": 165
    },
    {
      "epoch": 0.18888888888888888,
      "grad_norm": 1.7008557319641113,
      "learning_rate": 0.00019894859700586047,
      "loss": 0.2776,
      "step": 170
    },
    {
      "epoch": 0.19444444444444445,
      "grad_norm": 0.9177167415618896,
      "learning_rate": 0.00019881156890752517,
      "loss": 0.2448,
      "step": 175
    },
    {
      "epoch": 0.19444444444444445,
      "eval_loss": 0.27951088547706604,
      "eval_runtime": 7.3091,
      "eval_samples_per_second": 27.363,
      "eval_steps_per_second": 27.363,
      "step": 175
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.8103035688400269,
      "learning_rate": 0.00019866620299381285,
      "loss": 0.27,
      "step": 180
    },
    {
      "epoch": 0.20555555555555555,
      "grad_norm": 2.015127658843994,
      "learning_rate": 0.0001985125115308393,
      "loss": 0.2856,
      "step": 185
    },
    {
      "epoch": 0.2111111111111111,
      "grad_norm": 1.7647919654846191,
      "learning_rate": 0.00019835050748723824,
      "loss": 0.312,
      "step": 190
    },
    {
      "epoch": 0.21666666666666667,
      "grad_norm": 1.8039538860321045,
      "learning_rate": 0.00019818020453306697,
      "loss": 0.3223,
      "step": 195
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 1.753201961517334,
      "learning_rate": 0.00019800161703865282,
      "loss": 0.2553,
      "step": 200
    },
    {
      "epoch": 0.2222222222222222,
      "eval_loss": 0.27645987272262573,
      "eval_runtime": 7.272,
      "eval_samples_per_second": 27.503,
      "eval_steps_per_second": 27.503,
      "step": 200
    }
  ],
  "logging_steps": 5,
  "max_steps": 1800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 200,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.005
      },
      "attributes": {
        "early_stopping_patience_counter": 1
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1202727577190400.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}