{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9937888198757764, "eval_steps": 500, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024844720496894408, "grad_norm": 0.22360646249577132, "learning_rate": 0.0001, "loss": 0.3155, "step": 1 }, { "epoch": 0.049689440993788817, "grad_norm": 1.6354713380395525, "learning_rate": 0.0001, "loss": 0.4894, "step": 2 }, { "epoch": 0.07453416149068323, "grad_norm": 21.686343056691, "learning_rate": 0.0001, "loss": 5.1274, "step": 3 }, { "epoch": 0.09937888198757763, "grad_norm": 14.478086289344557, "learning_rate": 0.0001, "loss": 0.6788, "step": 4 }, { "epoch": 0.12422360248447205, "grad_norm": 10.40331474659814, "learning_rate": 0.0001, "loss": 0.7381, "step": 5 }, { "epoch": 0.14906832298136646, "grad_norm": 14.828952914633255, "learning_rate": 0.0001, "loss": 1.0551, "step": 6 }, { "epoch": 0.17391304347826086, "grad_norm": 1.0287292990470505, "learning_rate": 0.0001, "loss": 0.4616, "step": 7 }, { "epoch": 0.19875776397515527, "grad_norm": 4.424881730530812, "learning_rate": 0.0001, "loss": 0.7389, "step": 8 }, { "epoch": 0.2236024844720497, "grad_norm": 1.5788138903378481, "learning_rate": 0.0001, "loss": 0.5233, "step": 9 }, { "epoch": 0.2484472049689441, "grad_norm": 1.6155451789191035, "learning_rate": 0.0001, "loss": 0.4922, "step": 10 }, { "epoch": 0.2732919254658385, "grad_norm": 53.16385537396071, "learning_rate": 0.0001, "loss": 1.0959, "step": 11 }, { "epoch": 0.2981366459627329, "grad_norm": 2.0002784882087092, "learning_rate": 0.0001, "loss": 0.6879, "step": 12 }, { "epoch": 0.32298136645962733, "grad_norm": 1.1683199154602835, "learning_rate": 0.0001, "loss": 0.4369, "step": 13 }, { "epoch": 0.34782608695652173, "grad_norm": 1.033205019062668, "learning_rate": 0.0001, "loss": 0.3917, "step": 14 }, { "epoch": 0.37267080745341613, "grad_norm": 0.4412023447127328, "learning_rate": 0.0001, "loss": 0.3448, "step": 15 }, { "epoch": 0.39751552795031053, "grad_norm": 0.9578202521179224, "learning_rate": 0.0001, "loss": 0.3459, "step": 16 }, { "epoch": 0.422360248447205, "grad_norm": 2.289275193987449, "learning_rate": 0.0001, "loss": 0.534, "step": 17 }, { "epoch": 0.4472049689440994, "grad_norm": 0.6666094964330221, "learning_rate": 0.0001, "loss": 0.3535, "step": 18 }, { "epoch": 0.4720496894409938, "grad_norm": 0.7930206978533701, "learning_rate": 0.0001, "loss": 0.3496, "step": 19 }, { "epoch": 0.4968944099378882, "grad_norm": 0.3055608278925833, "learning_rate": 0.0001, "loss": 0.3339, "step": 20 }, { "epoch": 0.5217391304347826, "grad_norm": 0.29784796389686363, "learning_rate": 0.0001, "loss": 0.31, "step": 21 }, { "epoch": 0.546583850931677, "grad_norm": 0.21503207187750886, "learning_rate": 0.0001, "loss": 0.2922, "step": 22 }, { "epoch": 0.5714285714285714, "grad_norm": 0.15247931417135197, "learning_rate": 0.0001, "loss": 0.292, "step": 23 }, { "epoch": 0.5962732919254659, "grad_norm": 0.19200002221931217, "learning_rate": 0.0001, "loss": 0.2937, "step": 24 }, { "epoch": 0.6211180124223602, "grad_norm": 0.12937354448998015, "learning_rate": 0.0001, "loss": 0.2701, "step": 25 }, { "epoch": 0.6459627329192547, "grad_norm": 0.7041267128561529, "learning_rate": 0.0001, "loss": 0.2806, "step": 26 }, { "epoch": 0.6708074534161491, "grad_norm": 0.1703017837650162, "learning_rate": 0.0001, "loss": 0.2733, "step": 27 }, { "epoch": 0.6956521739130435, "grad_norm": 0.1670950080367148, "learning_rate": 0.0001, "loss": 0.2691, "step": 28 }, { "epoch": 0.7204968944099379, "grad_norm": 0.15191029267235273, "learning_rate": 0.0001, "loss": 0.2611, "step": 29 }, { "epoch": 0.7453416149068323, "grad_norm": 0.10352109095733353, "learning_rate": 0.0001, "loss": 0.2764, "step": 30 }, { "epoch": 0.7701863354037267, "grad_norm": 0.11621572352545091, "learning_rate": 0.0001, "loss": 0.259, "step": 31 }, { "epoch": 0.7950310559006211, "grad_norm": 0.1047835528239364, "learning_rate": 0.0001, "loss": 0.2507, "step": 32 }, { "epoch": 0.8198757763975155, "grad_norm": 0.10515270423855971, "learning_rate": 0.0001, "loss": 0.2743, "step": 33 }, { "epoch": 0.84472049689441, "grad_norm": 0.05663550910043551, "learning_rate": 0.0001, "loss": 0.2565, "step": 34 }, { "epoch": 0.8695652173913043, "grad_norm": 0.09940447058540708, "learning_rate": 0.0001, "loss": 0.2558, "step": 35 }, { "epoch": 0.8944099378881988, "grad_norm": 0.09817461399434704, "learning_rate": 0.0001, "loss": 0.2483, "step": 36 }, { "epoch": 0.9192546583850931, "grad_norm": 0.06027821970777435, "learning_rate": 0.0001, "loss": 0.2516, "step": 37 }, { "epoch": 0.9440993788819876, "grad_norm": 0.08188481055174467, "learning_rate": 0.0001, "loss": 0.2305, "step": 38 }, { "epoch": 0.968944099378882, "grad_norm": 0.09993015738582685, "learning_rate": 0.0001, "loss": 0.2559, "step": 39 }, { "epoch": 0.9937888198757764, "grad_norm": 0.09361624235588561, "learning_rate": 0.0001, "loss": 0.2556, "step": 40 }, { "epoch": 0.9937888198757764, "step": 40, "total_flos": 55331389440000.0, "train_loss": 0.5212820250540972, "train_runtime": 2252.6119, "train_samples_per_second": 3.42, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 40, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 55331389440000.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }