{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.454262248654772, "eval_steps": 354, "global_step": 3050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05664117813650524, "grad_norm": 0.7603653073310852, "learning_rate": 0.0001978110599078341, "loss": 0.9425, "step": 50 }, { "epoch": 0.11328235627301048, "grad_norm": 0.6873273849487305, "learning_rate": 0.00019205069124423964, "loss": 0.6078, "step": 100 }, { "epoch": 0.16992353440951571, "grad_norm": 0.6323167085647583, "learning_rate": 0.00018629032258064517, "loss": 0.6748, "step": 150 }, { "epoch": 0.22656471254602095, "grad_norm": 1.0095610618591309, "learning_rate": 0.0001805299539170507, "loss": 0.6594, "step": 200 }, { "epoch": 0.2832058906825262, "grad_norm": 0.5822212100028992, "learning_rate": 0.00017476958525345623, "loss": 0.6317, "step": 250 }, { "epoch": 0.33984706881903143, "grad_norm": 0.8490907549858093, "learning_rate": 0.00016900921658986176, "loss": 0.5742, "step": 300 }, { "epoch": 0.3964882469555367, "grad_norm": 0.6252707242965698, "learning_rate": 0.0001632488479262673, "loss": 0.5502, "step": 350 }, { "epoch": 0.4010195412064571, "eval_loss": 0.6019027233123779, "eval_runtime": 159.9351, "eval_samples_per_second": 9.81, "eval_steps_per_second": 2.457, "step": 354 }, { "epoch": 0.4531294250920419, "grad_norm": 0.656812310218811, "learning_rate": 0.00015748847926267282, "loss": 0.5686, "step": 400 }, { "epoch": 0.5097706032285472, "grad_norm": 0.7391073703765869, "learning_rate": 0.00015172811059907835, "loss": 0.5701, "step": 450 }, { "epoch": 0.5664117813650524, "grad_norm": 0.9210707545280457, "learning_rate": 0.00014596774193548388, "loss": 0.6397, "step": 500 }, { "epoch": 0.6230529595015576, "grad_norm": 0.8228403329849243, "learning_rate": 0.00014020737327188939, "loss": 0.5822, "step": 550 }, { "epoch": 0.6796941376380629, "grad_norm": 0.716748833656311, "learning_rate": 0.00013444700460829494, "loss": 0.5881, "step": 600 }, { "epoch": 0.7363353157745681, "grad_norm": 0.7144941091537476, "learning_rate": 0.00012868663594470047, "loss": 0.6032, "step": 650 }, { "epoch": 0.7929764939110734, "grad_norm": 1.016291618347168, "learning_rate": 0.000122926267281106, "loss": 0.6377, "step": 700 }, { "epoch": 0.8020390824129142, "eval_loss": 0.5828524827957153, "eval_runtime": 150.8754, "eval_samples_per_second": 10.399, "eval_steps_per_second": 2.605, "step": 708 }, { "epoch": 0.8496176720475785, "grad_norm": 1.0243154764175415, "learning_rate": 0.00011716589861751153, "loss": 0.6005, "step": 750 }, { "epoch": 0.9062588501840838, "grad_norm": 0.6541144251823425, "learning_rate": 0.00011140552995391706, "loss": 0.5723, "step": 800 }, { "epoch": 0.9629000283205891, "grad_norm": 1.0017038583755493, "learning_rate": 0.00010564516129032258, "loss": 0.5801, "step": 850 }, { "epoch": 1.0192580005664118, "grad_norm": 0.7527189254760742, "learning_rate": 9.988479262672812e-05, "loss": 0.5511, "step": 900 }, { "epoch": 1.075899178702917, "grad_norm": 0.7966899871826172, "learning_rate": 9.412442396313365e-05, "loss": 0.49, "step": 950 }, { "epoch": 1.1325403568394223, "grad_norm": 0.7110822796821594, "learning_rate": 8.836405529953917e-05, "loss": 0.4725, "step": 1000 }, { "epoch": 1.1891815349759276, "grad_norm": 0.7837777733802795, "learning_rate": 8.26036866359447e-05, "loss": 0.527, "step": 1050 }, { "epoch": 1.2027754177286887, "eval_loss": 0.5833637714385986, "eval_runtime": 150.6223, "eval_samples_per_second": 10.417, "eval_steps_per_second": 2.609, "step": 1062 }, { "epoch": 1.2458227131124326, "grad_norm": 0.8119267821311951, "learning_rate": 7.684331797235024e-05, "loss": 0.4892, "step": 1100 }, { "epoch": 1.302463891248938, "grad_norm": 0.8631129860877991, "learning_rate": 7.108294930875576e-05, "loss": 0.5124, "step": 1150 }, { "epoch": 1.3591050693854432, "grad_norm": 0.8685782551765442, "learning_rate": 6.532258064516129e-05, "loss": 0.4927, "step": 1200 }, { "epoch": 1.4157462475219484, "grad_norm": 0.8397710919380188, "learning_rate": 5.956221198156682e-05, "loss": 0.5125, "step": 1250 }, { "epoch": 1.4723874256584537, "grad_norm": 0.7606781721115112, "learning_rate": 5.3801843317972355e-05, "loss": 0.4826, "step": 1300 }, { "epoch": 1.529028603794959, "grad_norm": 1.1354798078536987, "learning_rate": 4.8041474654377885e-05, "loss": 0.5101, "step": 1350 }, { "epoch": 1.5856697819314642, "grad_norm": 1.28499174118042, "learning_rate": 4.228110599078341e-05, "loss": 0.4687, "step": 1400 }, { "epoch": 1.603794958935146, "eval_loss": 0.5791710615158081, "eval_runtime": 150.7437, "eval_samples_per_second": 10.408, "eval_steps_per_second": 2.607, "step": 1416 }, { "epoch": 1.6423109600679693, "grad_norm": 0.7527874708175659, "learning_rate": 3.6520737327188945e-05, "loss": 0.4992, "step": 1450 }, { "epoch": 1.6989521382044748, "grad_norm": 0.9351261854171753, "learning_rate": 3.076036866359447e-05, "loss": 0.4873, "step": 1500 }, { "epoch": 1.7555933163409798, "grad_norm": 1.0196998119354248, "learning_rate": 2.5e-05, "loss": 0.4946, "step": 1550 }, { "epoch": 1.812234494477485, "grad_norm": 0.9896508455276489, "learning_rate": 1.923963133640553e-05, "loss": 0.4895, "step": 1600 }, { "epoch": 1.8688756726139903, "grad_norm": 1.150964617729187, "learning_rate": 1.3479262672811061e-05, "loss": 0.5164, "step": 1650 }, { "epoch": 1.9255168507504956, "grad_norm": 0.8384917378425598, "learning_rate": 7.71889400921659e-06, "loss": 0.4914, "step": 1700 }, { "epoch": 1.9821580288870009, "grad_norm": 0.9231545329093933, "learning_rate": 1.9585253456221198e-06, "loss": 0.4939, "step": 1750 }, { "epoch": 2.0045312942509206, "eval_loss": 0.5844214558601379, "eval_runtime": 166.1298, "eval_samples_per_second": 9.444, "eval_steps_per_second": 2.366, "step": 1770 }, { "epoch": 2.0385160011328236, "grad_norm": 0.9673134088516235, "learning_rate": 0.00014248089741505446, "loss": 0.4303, "step": 1800 }, { "epoch": 2.0951571792693287, "grad_norm": 1.33237624168396, "learning_rate": 0.00014085514550479596, "loss": 0.4594, "step": 1850 }, { "epoch": 2.151798357405834, "grad_norm": 1.068943738937378, "learning_rate": 0.0001392293935945375, "loss": 0.4939, "step": 1900 }, { "epoch": 2.208439535542339, "grad_norm": 1.1625093221664429, "learning_rate": 0.00013760364168427899, "loss": 0.4757, "step": 1950 }, { "epoch": 2.2650807136788447, "grad_norm": 1.080735683441162, "learning_rate": 0.00013597788977402048, "loss": 0.4724, "step": 2000 }, { "epoch": 2.3217218918153497, "grad_norm": 0.8823259472846985, "learning_rate": 0.00013435213786376198, "loss": 0.4821, "step": 2050 }, { "epoch": 2.378363069951855, "grad_norm": 1.0513312816619873, "learning_rate": 0.0001327263859535035, "loss": 0.479, "step": 2100 }, { "epoch": 2.4055508354573774, "eval_loss": 0.6049736738204956, "eval_runtime": 156.3245, "eval_samples_per_second": 10.037, "eval_steps_per_second": 2.514, "step": 2124 }, { "epoch": 2.4350042480883602, "grad_norm": 1.0902981758117676, "learning_rate": 0.000131100634043245, "loss": 0.4749, "step": 2150 }, { "epoch": 2.4916454262248653, "grad_norm": 0.9050194025039673, "learning_rate": 0.0001294748821329865, "loss": 0.4346, "step": 2200 }, { "epoch": 2.5482866043613708, "grad_norm": 1.0356699228286743, "learning_rate": 0.00012784913022272803, "loss": 0.4685, "step": 2250 }, { "epoch": 2.604927782497876, "grad_norm": 1.0071344375610352, "learning_rate": 0.00012622337831246953, "loss": 0.5, "step": 2300 }, { "epoch": 2.6615689606343813, "grad_norm": 1.0409235954284668, "learning_rate": 0.00012459762640221103, "loss": 0.4908, "step": 2350 }, { "epoch": 2.7182101387708864, "grad_norm": 0.8756324052810669, "learning_rate": 0.00012297187449195252, "loss": 0.4765, "step": 2400 }, { "epoch": 2.774851316907392, "grad_norm": 0.6662527918815613, "learning_rate": 0.00012134612258169405, "loss": 0.4524, "step": 2450 }, { "epoch": 2.8065703766638346, "eval_loss": 0.5904644727706909, "eval_runtime": 157.1505, "eval_samples_per_second": 9.984, "eval_steps_per_second": 2.501, "step": 2478 }, { "epoch": 2.831492495043897, "grad_norm": 1.0368993282318115, "learning_rate": 0.00011972037067143556, "loss": 0.4661, "step": 2500 }, { "epoch": 2.888133673180402, "grad_norm": 1.0678080320358276, "learning_rate": 0.00011809461876117705, "loss": 0.4812, "step": 2550 }, { "epoch": 2.9447748513169074, "grad_norm": 1.2212059497833252, "learning_rate": 0.00011646886685091856, "loss": 0.482, "step": 2600 }, { "epoch": 3.00113282356273, "grad_norm": 0.757663369178772, "learning_rate": 0.00011484311494066007, "loss": 0.5062, "step": 2650 }, { "epoch": 3.057774001699235, "grad_norm": 0.8512151837348938, "learning_rate": 0.00011321736303040155, "loss": 0.3651, "step": 2700 }, { "epoch": 3.1144151798357407, "grad_norm": 0.9610685706138611, "learning_rate": 0.00011159161112014307, "loss": 0.3626, "step": 2750 }, { "epoch": 3.1710563579722457, "grad_norm": 1.0879671573638916, "learning_rate": 0.00010996585920988458, "loss": 0.3637, "step": 2800 }, { "epoch": 3.207306711979609, "eval_loss": 0.6150493025779724, "eval_runtime": 157.6799, "eval_samples_per_second": 9.951, "eval_steps_per_second": 2.492, "step": 2832 }, { "epoch": 3.227697536108751, "grad_norm": 1.1237863302230835, "learning_rate": 0.00010834010729962609, "loss": 0.4015, "step": 2850 }, { "epoch": 3.2843387142452563, "grad_norm": 0.7940804958343506, "learning_rate": 0.00010671435538936759, "loss": 0.3572, "step": 2900 }, { "epoch": 3.3409798923817613, "grad_norm": 1.7029999494552612, "learning_rate": 0.0001050886034791091, "loss": 0.3902, "step": 2950 }, { "epoch": 3.397621070518267, "grad_norm": 0.8232925534248352, "learning_rate": 0.00010346285156885061, "loss": 0.3814, "step": 3000 }, { "epoch": 3.454262248654772, "grad_norm": 0.9208984375, "learning_rate": 0.0001018370996585921, "loss": 0.377, "step": 3050 } ], "logging_steps": 50, "max_steps": 6181, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0296992167681997e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }