{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.151798357405834, "eval_steps": 354, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05664117813650524, "grad_norm": 0.7603653073310852, "learning_rate": 0.0001978110599078341, "loss": 0.9425, "step": 50 }, { "epoch": 0.11328235627301048, "grad_norm": 0.6873273849487305, "learning_rate": 0.00019205069124423964, "loss": 0.6078, "step": 100 }, { "epoch": 0.16992353440951571, "grad_norm": 0.6323167085647583, "learning_rate": 0.00018629032258064517, "loss": 0.6748, "step": 150 }, { "epoch": 0.22656471254602095, "grad_norm": 1.0095610618591309, "learning_rate": 0.0001805299539170507, "loss": 0.6594, "step": 200 }, { "epoch": 0.2832058906825262, "grad_norm": 0.5822212100028992, "learning_rate": 0.00017476958525345623, "loss": 0.6317, "step": 250 }, { "epoch": 0.33984706881903143, "grad_norm": 0.8490907549858093, "learning_rate": 0.00016900921658986176, "loss": 0.5742, "step": 300 }, { "epoch": 0.3964882469555367, "grad_norm": 0.6252707242965698, "learning_rate": 0.0001632488479262673, "loss": 0.5502, "step": 350 }, { "epoch": 0.4010195412064571, "eval_loss": 0.6019027233123779, "eval_runtime": 159.9351, "eval_samples_per_second": 9.81, "eval_steps_per_second": 2.457, "step": 354 }, { "epoch": 0.4531294250920419, "grad_norm": 0.656812310218811, "learning_rate": 0.00015748847926267282, "loss": 0.5686, "step": 400 }, { "epoch": 0.5097706032285472, "grad_norm": 0.7391073703765869, "learning_rate": 0.00015172811059907835, "loss": 0.5701, "step": 450 }, { "epoch": 0.5664117813650524, "grad_norm": 0.9210707545280457, "learning_rate": 0.00014596774193548388, "loss": 0.6397, "step": 500 }, { "epoch": 0.6230529595015576, "grad_norm": 0.8228403329849243, "learning_rate": 0.00014020737327188939, "loss": 0.5822, "step": 550 }, { "epoch": 0.6796941376380629, "grad_norm": 0.716748833656311, "learning_rate": 0.00013444700460829494, "loss": 0.5881, "step": 600 }, { "epoch": 0.7363353157745681, "grad_norm": 0.7144941091537476, "learning_rate": 0.00012868663594470047, "loss": 0.6032, "step": 650 }, { "epoch": 0.7929764939110734, "grad_norm": 1.016291618347168, "learning_rate": 0.000122926267281106, "loss": 0.6377, "step": 700 }, { "epoch": 0.8020390824129142, "eval_loss": 0.5828524827957153, "eval_runtime": 150.8754, "eval_samples_per_second": 10.399, "eval_steps_per_second": 2.605, "step": 708 }, { "epoch": 0.8496176720475785, "grad_norm": 1.0243154764175415, "learning_rate": 0.00011716589861751153, "loss": 0.6005, "step": 750 }, { "epoch": 0.9062588501840838, "grad_norm": 0.6541144251823425, "learning_rate": 0.00011140552995391706, "loss": 0.5723, "step": 800 }, { "epoch": 0.9629000283205891, "grad_norm": 1.0017038583755493, "learning_rate": 0.00010564516129032258, "loss": 0.5801, "step": 850 }, { "epoch": 1.0192580005664118, "grad_norm": 0.7527189254760742, "learning_rate": 9.988479262672812e-05, "loss": 0.5511, "step": 900 }, { "epoch": 1.075899178702917, "grad_norm": 0.7966899871826172, "learning_rate": 9.412442396313365e-05, "loss": 0.49, "step": 950 }, { "epoch": 1.1325403568394223, "grad_norm": 0.7110822796821594, "learning_rate": 8.836405529953917e-05, "loss": 0.4725, "step": 1000 }, { "epoch": 1.1891815349759276, "grad_norm": 0.7837777733802795, "learning_rate": 8.26036866359447e-05, "loss": 0.527, "step": 1050 }, { "epoch": 1.2027754177286887, "eval_loss": 0.5833637714385986, "eval_runtime": 150.6223, "eval_samples_per_second": 10.417, "eval_steps_per_second": 2.609, "step": 1062 }, { "epoch": 1.2458227131124326, "grad_norm": 0.8119267821311951, "learning_rate": 7.684331797235024e-05, "loss": 0.4892, "step": 1100 }, { "epoch": 1.302463891248938, "grad_norm": 0.8631129860877991, "learning_rate": 7.108294930875576e-05, "loss": 0.5124, "step": 1150 }, { "epoch": 1.3591050693854432, "grad_norm": 0.8685782551765442, "learning_rate": 6.532258064516129e-05, "loss": 0.4927, "step": 1200 }, { "epoch": 1.4157462475219484, "grad_norm": 0.8397710919380188, "learning_rate": 5.956221198156682e-05, "loss": 0.5125, "step": 1250 }, { "epoch": 1.4723874256584537, "grad_norm": 0.7606781721115112, "learning_rate": 5.3801843317972355e-05, "loss": 0.4826, "step": 1300 }, { "epoch": 1.529028603794959, "grad_norm": 1.1354798078536987, "learning_rate": 4.8041474654377885e-05, "loss": 0.5101, "step": 1350 }, { "epoch": 1.5856697819314642, "grad_norm": 1.28499174118042, "learning_rate": 4.228110599078341e-05, "loss": 0.4687, "step": 1400 }, { "epoch": 1.603794958935146, "eval_loss": 0.5791710615158081, "eval_runtime": 150.7437, "eval_samples_per_second": 10.408, "eval_steps_per_second": 2.607, "step": 1416 }, { "epoch": 1.6423109600679693, "grad_norm": 0.7527874708175659, "learning_rate": 3.6520737327188945e-05, "loss": 0.4992, "step": 1450 }, { "epoch": 1.6989521382044748, "grad_norm": 0.9351261854171753, "learning_rate": 3.076036866359447e-05, "loss": 0.4873, "step": 1500 }, { "epoch": 1.7555933163409798, "grad_norm": 1.0196998119354248, "learning_rate": 2.5e-05, "loss": 0.4946, "step": 1550 }, { "epoch": 1.812234494477485, "grad_norm": 0.9896508455276489, "learning_rate": 1.923963133640553e-05, "loss": 0.4895, "step": 1600 }, { "epoch": 1.8688756726139903, "grad_norm": 1.150964617729187, "learning_rate": 1.3479262672811061e-05, "loss": 0.5164, "step": 1650 }, { "epoch": 1.9255168507504956, "grad_norm": 0.8384917378425598, "learning_rate": 7.71889400921659e-06, "loss": 0.4914, "step": 1700 }, { "epoch": 1.9821580288870009, "grad_norm": 0.9231545329093933, "learning_rate": 1.9585253456221198e-06, "loss": 0.4939, "step": 1750 }, { "epoch": 2.0045312942509206, "eval_loss": 0.5844214558601379, "eval_runtime": 166.1298, "eval_samples_per_second": 9.444, "eval_steps_per_second": 2.366, "step": 1770 }, { "epoch": 2.0385160011328236, "grad_norm": 0.9673134088516235, "learning_rate": 0.00014248089741505446, "loss": 0.4303, "step": 1800 }, { "epoch": 2.0951571792693287, "grad_norm": 1.33237624168396, "learning_rate": 0.00014085514550479596, "loss": 0.4594, "step": 1850 }, { "epoch": 2.151798357405834, "grad_norm": 1.068943738937378, "learning_rate": 0.0001392293935945375, "loss": 0.4939, "step": 1900 } ], "logging_steps": 50, "max_steps": 6181, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.411721688746214e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }