{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.68, "eval_steps": 500, "global_step": 48, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 21.84854475798701, "learning_rate": 2.0000000000000003e-06, "loss": 2.7666, "step": 1 }, { "epoch": 0.32, "grad_norm": 21.08410666706505, "learning_rate": 4.000000000000001e-06, "loss": 2.7148, "step": 2 }, { "epoch": 0.48, "grad_norm": 19.817509197835424, "learning_rate": 6e-06, "loss": 2.7169, "step": 3 }, { "epoch": 0.64, "grad_norm": 14.56433231107304, "learning_rate": 8.000000000000001e-06, "loss": 2.5238, "step": 4 }, { "epoch": 0.8, "grad_norm": 7.910921625273512, "learning_rate": 1e-05, "loss": 2.3043, "step": 5 }, { "epoch": 0.96, "grad_norm": 8.065606760172034, "learning_rate": 1.2e-05, "loss": 2.1385, "step": 6 }, { "epoch": 1.12, "grad_norm": 6.92294371298597, "learning_rate": 1.4000000000000001e-05, "loss": 1.8466, "step": 7 }, { "epoch": 1.28, "grad_norm": 7.374155243701679, "learning_rate": 1.6000000000000003e-05, "loss": 1.696, "step": 8 }, { "epoch": 1.44, "grad_norm": 5.529647167446027, "learning_rate": 1.8e-05, "loss": 1.3348, "step": 9 }, { "epoch": 1.6, "grad_norm": 5.470706825652522, "learning_rate": 2e-05, "loss": 1.2157, "step": 10 }, { "epoch": 1.76, "grad_norm": 5.744223049944512, "learning_rate": 2.2000000000000003e-05, "loss": 1.0832, "step": 11 }, { "epoch": 1.92, "grad_norm": 7.00854296530452, "learning_rate": 2.4e-05, "loss": 0.9879, "step": 12 }, { "epoch": 2.08, "grad_norm": 5.336567904924906, "learning_rate": 2.6000000000000002e-05, "loss": 0.5616, "step": 13 }, { "epoch": 2.24, "grad_norm": 4.476173984946472, "learning_rate": 2.8000000000000003e-05, "loss": 0.3954, "step": 14 }, { "epoch": 2.4, "grad_norm": 4.943428331011744, "learning_rate": 3e-05, "loss": 0.4176, "step": 15 }, { "epoch": 2.56, "grad_norm": 4.390162860701155, "learning_rate": 3.2000000000000005e-05, "loss": 0.3047, "step": 16 }, { "epoch": 2.7199999999999998, "grad_norm": 3.6439162726811514, "learning_rate": 3.4000000000000007e-05, "loss": 0.3502, "step": 17 }, { "epoch": 2.88, "grad_norm": 3.7835908637146614, "learning_rate": 3.6e-05, "loss": 0.4052, "step": 18 }, { "epoch": 3.04, "grad_norm": 3.045882254850862, "learning_rate": 3.8e-05, "loss": 0.299, "step": 19 }, { "epoch": 3.2, "grad_norm": 3.026990390481408, "learning_rate": 4e-05, "loss": 0.1532, "step": 20 }, { "epoch": 3.36, "grad_norm": 2.7854589808693317, "learning_rate": 4.2e-05, "loss": 0.1184, "step": 21 }, { "epoch": 3.52, "grad_norm": 3.934431288327914, "learning_rate": 4.4000000000000006e-05, "loss": 0.1605, "step": 22 }, { "epoch": 3.68, "grad_norm": 2.5384235711882246, "learning_rate": 4.600000000000001e-05, "loss": 0.1256, "step": 23 }, { "epoch": 3.84, "grad_norm": 3.0549054370297046, "learning_rate": 4.8e-05, "loss": 0.1509, "step": 24 }, { "epoch": 4.0, "grad_norm": 2.742034577764137, "learning_rate": 5e-05, "loss": 0.1955, "step": 25 }, { "epoch": 4.16, "grad_norm": 2.7052086450717185, "learning_rate": 5.2000000000000004e-05, "loss": 0.0883, "step": 26 }, { "epoch": 4.32, "grad_norm": 2.141311755890699, "learning_rate": 5.4000000000000005e-05, "loss": 0.0982, "step": 27 }, { "epoch": 4.48, "grad_norm": 2.405168716519016, "learning_rate": 5.6000000000000006e-05, "loss": 0.1167, "step": 28 }, { "epoch": 4.64, "grad_norm": 2.1040576646851976, "learning_rate": 5.8e-05, "loss": 0.1011, "step": 29 }, { "epoch": 4.8, "grad_norm": 2.1434366495348742, "learning_rate": 6e-05, "loss": 0.1093, "step": 30 }, { "epoch": 4.96, "grad_norm": 2.4885010419741356, "learning_rate": 6.2e-05, "loss": 0.126, "step": 31 }, { "epoch": 5.12, "grad_norm": 2.5185987231548705, "learning_rate": 6.400000000000001e-05, "loss": 0.0798, "step": 32 }, { "epoch": 5.28, "grad_norm": 2.4229963679451823, "learning_rate": 6.6e-05, "loss": 0.0777, "step": 33 }, { "epoch": 5.44, "grad_norm": 2.35757612847466, "learning_rate": 6.800000000000001e-05, "loss": 0.1035, "step": 34 }, { "epoch": 5.6, "grad_norm": 2.261180396997714, "learning_rate": 7e-05, "loss": 0.095, "step": 35 }, { "epoch": 5.76, "grad_norm": 2.397540189265323, "learning_rate": 7.2e-05, "loss": 0.1024, "step": 36 }, { "epoch": 5.92, "grad_norm": 2.440377463659988, "learning_rate": 7.4e-05, "loss": 0.1276, "step": 37 }, { "epoch": 6.08, "grad_norm": 1.9038257788340682, "learning_rate": 7.6e-05, "loss": 0.0839, "step": 38 }, { "epoch": 6.24, "grad_norm": 3.613615935736148, "learning_rate": 7.800000000000001e-05, "loss": 0.061, "step": 39 }, { "epoch": 6.4, "grad_norm": 2.225128405341641, "learning_rate": 8e-05, "loss": 0.0956, "step": 40 }, { "epoch": 6.5600000000000005, "grad_norm": 2.184522719061086, "learning_rate": 8.2e-05, "loss": 0.0882, "step": 41 }, { "epoch": 6.72, "grad_norm": 2.865012553917456, "learning_rate": 8.4e-05, "loss": 0.1075, "step": 42 }, { "epoch": 6.88, "grad_norm": 2.4858897371258255, "learning_rate": 8.6e-05, "loss": 0.1397, "step": 43 }, { "epoch": 7.04, "grad_norm": 2.5069104688577597, "learning_rate": 8.800000000000001e-05, "loss": 0.1138, "step": 44 }, { "epoch": 7.2, "grad_norm": 2.0377663889161544, "learning_rate": 9e-05, "loss": 0.0679, "step": 45 }, { "epoch": 7.36, "grad_norm": 1.7737003765568096, "learning_rate": 9.200000000000001e-05, "loss": 0.0625, "step": 46 }, { "epoch": 7.52, "grad_norm": 2.546223689211966, "learning_rate": 9.4e-05, "loss": 0.107, "step": 47 }, { "epoch": 7.68, "grad_norm": 2.629646137634436, "learning_rate": 9.6e-05, "loss": 0.1055, "step": 48 } ], "logging_steps": 1, "max_steps": 48, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 39991514234880.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }