{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998652991321415,
"eval_steps": 500,
"global_step": 6495,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015394384898108415,
"grad_norm": 1.9140625,
"learning_rate": 1.9692070823710548e-05,
"loss": 1.1459,
"num_input_tokens_seen": 3815960,
"step": 100
},
{
"epoch": 0.03078876979621683,
"grad_norm": 1.5703125,
"learning_rate": 1.9384141647421097e-05,
"loss": 1.0878,
"num_input_tokens_seen": 7588127,
"step": 200
},
{
"epoch": 0.04618315469432525,
"grad_norm": 1.625,
"learning_rate": 1.907621247113164e-05,
"loss": 1.0942,
"num_input_tokens_seen": 11373466,
"step": 300
},
{
"epoch": 0.06157753959243366,
"grad_norm": 1.6328125,
"learning_rate": 1.876828329484219e-05,
"loss": 1.0788,
"num_input_tokens_seen": 15148940,
"step": 400
},
{
"epoch": 0.07697192449054208,
"grad_norm": 1.703125,
"learning_rate": 1.8460354118552735e-05,
"loss": 1.0807,
"num_input_tokens_seen": 18941722,
"step": 500
},
{
"epoch": 0.0923663093886505,
"grad_norm": 1.765625,
"learning_rate": 1.815242494226328e-05,
"loss": 1.0628,
"num_input_tokens_seen": 22691439,
"step": 600
},
{
"epoch": 0.1077606942867589,
"grad_norm": 1.890625,
"learning_rate": 1.7844495765973827e-05,
"loss": 1.057,
"num_input_tokens_seen": 26519584,
"step": 700
},
{
"epoch": 0.12315507918486732,
"grad_norm": 1.59375,
"learning_rate": 1.7536566589684373e-05,
"loss": 1.0591,
"num_input_tokens_seen": 30335003,
"step": 800
},
{
"epoch": 0.13854946408297575,
"grad_norm": 1.5859375,
"learning_rate": 1.722863741339492e-05,
"loss": 1.0597,
"num_input_tokens_seen": 34122220,
"step": 900
},
{
"epoch": 0.15394384898108415,
"grad_norm": 2.140625,
"learning_rate": 1.6920708237105468e-05,
"loss": 1.0784,
"num_input_tokens_seen": 37937918,
"step": 1000
},
{
"epoch": 0.16933823387919256,
"grad_norm": 1.7265625,
"learning_rate": 1.6612779060816014e-05,
"loss": 1.0598,
"num_input_tokens_seen": 41762422,
"step": 1100
},
{
"epoch": 0.184732618777301,
"grad_norm": 1.65625,
"learning_rate": 1.630484988452656e-05,
"loss": 1.0658,
"num_input_tokens_seen": 45541351,
"step": 1200
},
{
"epoch": 0.2001270036754094,
"grad_norm": 1.9453125,
"learning_rate": 1.5996920708237106e-05,
"loss": 1.0557,
"num_input_tokens_seen": 49317373,
"step": 1300
},
{
"epoch": 0.2155213885735178,
"grad_norm": 1.765625,
"learning_rate": 1.5688991531947652e-05,
"loss": 1.0597,
"num_input_tokens_seen": 53117895,
"step": 1400
},
{
"epoch": 0.23091577347162623,
"grad_norm": 1.765625,
"learning_rate": 1.53810623556582e-05,
"loss": 1.0616,
"num_input_tokens_seen": 56839161,
"step": 1500
},
{
"epoch": 0.24631015836973463,
"grad_norm": 1.609375,
"learning_rate": 1.5073133179368746e-05,
"loss": 1.053,
"num_input_tokens_seen": 60623928,
"step": 1600
},
{
"epoch": 0.26170454326784304,
"grad_norm": 1.609375,
"learning_rate": 1.4765204003079292e-05,
"loss": 1.0483,
"num_input_tokens_seen": 64408460,
"step": 1700
},
{
"epoch": 0.2770989281659515,
"grad_norm": 1.6640625,
"learning_rate": 1.445727482678984e-05,
"loss": 1.049,
"num_input_tokens_seen": 68223027,
"step": 1800
},
{
"epoch": 0.2924933130640599,
"grad_norm": 1.953125,
"learning_rate": 1.4149345650500385e-05,
"loss": 1.0482,
"num_input_tokens_seen": 71994158,
"step": 1900
},
{
"epoch": 0.3078876979621683,
"grad_norm": 1.765625,
"learning_rate": 1.3841416474210933e-05,
"loss": 1.0497,
"num_input_tokens_seen": 75865760,
"step": 2000
},
{
"epoch": 0.3232820828602767,
"grad_norm": 2.015625,
"learning_rate": 1.3533487297921479e-05,
"loss": 1.0409,
"num_input_tokens_seen": 79623921,
"step": 2100
},
{
"epoch": 0.3386764677583851,
"grad_norm": 1.7890625,
"learning_rate": 1.3225558121632025e-05,
"loss": 1.0501,
"num_input_tokens_seen": 83411874,
"step": 2200
},
{
"epoch": 0.3540708526564935,
"grad_norm": 1.6328125,
"learning_rate": 1.2917628945342572e-05,
"loss": 1.0542,
"num_input_tokens_seen": 87182489,
"step": 2300
},
{
"epoch": 0.369465237554602,
"grad_norm": 1.8671875,
"learning_rate": 1.2609699769053118e-05,
"loss": 1.0482,
"num_input_tokens_seen": 90989837,
"step": 2400
},
{
"epoch": 0.3848596224527104,
"grad_norm": 1.9375,
"learning_rate": 1.2301770592763664e-05,
"loss": 1.0404,
"num_input_tokens_seen": 94853551,
"step": 2500
},
{
"epoch": 0.4002540073508188,
"grad_norm": 2.015625,
"learning_rate": 1.1993841416474212e-05,
"loss": 1.0401,
"num_input_tokens_seen": 98649900,
"step": 2600
},
{
"epoch": 0.4156483922489272,
"grad_norm": 1.9609375,
"learning_rate": 1.1685912240184758e-05,
"loss": 1.0453,
"num_input_tokens_seen": 102455430,
"step": 2700
},
{
"epoch": 0.4310427771470356,
"grad_norm": 1.75,
"learning_rate": 1.1377983063895306e-05,
"loss": 1.0335,
"num_input_tokens_seen": 106279858,
"step": 2800
},
{
"epoch": 0.44643716204514405,
"grad_norm": 1.6484375,
"learning_rate": 1.1070053887605852e-05,
"loss": 1.0486,
"num_input_tokens_seen": 110042375,
"step": 2900
},
{
"epoch": 0.46183154694325246,
"grad_norm": 1.703125,
"learning_rate": 1.0762124711316398e-05,
"loss": 1.0347,
"num_input_tokens_seen": 113836688,
"step": 3000
},
{
"epoch": 0.47722593184136086,
"grad_norm": 1.7265625,
"learning_rate": 1.0454195535026945e-05,
"loss": 1.041,
"num_input_tokens_seen": 117581458,
"step": 3100
},
{
"epoch": 0.49262031673946927,
"grad_norm": 1.8359375,
"learning_rate": 1.0146266358737491e-05,
"loss": 1.0357,
"num_input_tokens_seen": 121341275,
"step": 3200
},
{
"epoch": 0.5080147016375777,
"grad_norm": 1.546875,
"learning_rate": 9.838337182448037e-06,
"loss": 1.0374,
"num_input_tokens_seen": 125088162,
"step": 3300
},
{
"epoch": 0.5234090865356861,
"grad_norm": 1.4453125,
"learning_rate": 9.530408006158585e-06,
"loss": 1.0255,
"num_input_tokens_seen": 128901749,
"step": 3400
},
{
"epoch": 0.5388034714337945,
"grad_norm": 1.46875,
"learning_rate": 9.22247882986913e-06,
"loss": 1.0282,
"num_input_tokens_seen": 132736866,
"step": 3500
},
{
"epoch": 0.554197856331903,
"grad_norm": 1.6953125,
"learning_rate": 8.914549653579677e-06,
"loss": 1.0398,
"num_input_tokens_seen": 136595429,
"step": 3600
},
{
"epoch": 0.5695922412300114,
"grad_norm": 1.7578125,
"learning_rate": 8.606620477290224e-06,
"loss": 1.025,
"num_input_tokens_seen": 140426462,
"step": 3700
},
{
"epoch": 0.5849866261281198,
"grad_norm": 1.5859375,
"learning_rate": 8.29869130100077e-06,
"loss": 1.0291,
"num_input_tokens_seen": 144234914,
"step": 3800
},
{
"epoch": 0.6003810110262282,
"grad_norm": 2.453125,
"learning_rate": 7.990762124711316e-06,
"loss": 1.0233,
"num_input_tokens_seen": 148032058,
"step": 3900
},
{
"epoch": 0.6157753959243366,
"grad_norm": 1.734375,
"learning_rate": 7.682832948421864e-06,
"loss": 1.0347,
"num_input_tokens_seen": 151814536,
"step": 4000
},
{
"epoch": 0.631169780822445,
"grad_norm": 1.7578125,
"learning_rate": 7.37490377213241e-06,
"loss": 1.0457,
"num_input_tokens_seen": 155598931,
"step": 4100
},
{
"epoch": 0.6465641657205534,
"grad_norm": 1.453125,
"learning_rate": 7.066974595842957e-06,
"loss": 1.0473,
"num_input_tokens_seen": 159326528,
"step": 4200
},
{
"epoch": 0.6619585506186618,
"grad_norm": 1.7109375,
"learning_rate": 6.7590454195535035e-06,
"loss": 1.0482,
"num_input_tokens_seen": 163084806,
"step": 4300
},
{
"epoch": 0.6773529355167702,
"grad_norm": 1.734375,
"learning_rate": 6.4511162432640495e-06,
"loss": 1.0267,
"num_input_tokens_seen": 166891971,
"step": 4400
},
{
"epoch": 0.6927473204148786,
"grad_norm": 1.703125,
"learning_rate": 6.143187066974596e-06,
"loss": 1.0165,
"num_input_tokens_seen": 170700927,
"step": 4500
},
{
"epoch": 0.708141705312987,
"grad_norm": 1.7421875,
"learning_rate": 5.835257890685143e-06,
"loss": 1.0386,
"num_input_tokens_seen": 174509215,
"step": 4600
},
{
"epoch": 0.7235360902110956,
"grad_norm": 1.859375,
"learning_rate": 5.52732871439569e-06,
"loss": 1.0286,
"num_input_tokens_seen": 178355672,
"step": 4700
},
{
"epoch": 0.738930475109204,
"grad_norm": 1.609375,
"learning_rate": 5.219399538106236e-06,
"loss": 1.0199,
"num_input_tokens_seen": 182172598,
"step": 4800
},
{
"epoch": 0.7543248600073124,
"grad_norm": 1.7734375,
"learning_rate": 4.911470361816783e-06,
"loss": 1.023,
"num_input_tokens_seen": 185945174,
"step": 4900
},
{
"epoch": 0.7697192449054208,
"grad_norm": 1.4765625,
"learning_rate": 4.6035411855273295e-06,
"loss": 1.0317,
"num_input_tokens_seen": 189723364,
"step": 5000
},
{
"epoch": 0.7851136298035292,
"grad_norm": 1.53125,
"learning_rate": 4.2956120092378755e-06,
"loss": 1.0262,
"num_input_tokens_seen": 193515488,
"step": 5100
},
{
"epoch": 0.8005080147016376,
"grad_norm": 1.7109375,
"learning_rate": 3.987682832948422e-06,
"loss": 1.0276,
"num_input_tokens_seen": 197294619,
"step": 5200
},
{
"epoch": 0.815902399599746,
"grad_norm": 1.6640625,
"learning_rate": 3.6797536566589687e-06,
"loss": 1.0141,
"num_input_tokens_seen": 201079998,
"step": 5300
},
{
"epoch": 0.8312967844978544,
"grad_norm": 1.875,
"learning_rate": 3.3718244803695155e-06,
"loss": 1.0406,
"num_input_tokens_seen": 204892665,
"step": 5400
},
{
"epoch": 0.8466911693959628,
"grad_norm": 1.75,
"learning_rate": 3.063895304080062e-06,
"loss": 1.0144,
"num_input_tokens_seen": 208692509,
"step": 5500
},
{
"epoch": 0.8620855542940712,
"grad_norm": 1.625,
"learning_rate": 2.7559661277906087e-06,
"loss": 1.0328,
"num_input_tokens_seen": 212482079,
"step": 5600
},
{
"epoch": 0.8774799391921797,
"grad_norm": 1.4453125,
"learning_rate": 2.4480369515011547e-06,
"loss": 1.0158,
"num_input_tokens_seen": 216265581,
"step": 5700
},
{
"epoch": 0.8928743240902881,
"grad_norm": 1.421875,
"learning_rate": 2.1401077752117015e-06,
"loss": 1.0376,
"num_input_tokens_seen": 220058453,
"step": 5800
},
{
"epoch": 0.9082687089883965,
"grad_norm": 1.875,
"learning_rate": 1.8321785989222479e-06,
"loss": 1.0116,
"num_input_tokens_seen": 223849666,
"step": 5900
},
{
"epoch": 0.9236630938865049,
"grad_norm": 1.671875,
"learning_rate": 1.5242494226327945e-06,
"loss": 1.0319,
"num_input_tokens_seen": 227586204,
"step": 6000
},
{
"epoch": 0.9390574787846133,
"grad_norm": 1.578125,
"learning_rate": 1.216320246343341e-06,
"loss": 1.0248,
"num_input_tokens_seen": 231383644,
"step": 6100
},
{
"epoch": 0.9544518636827217,
"grad_norm": 1.71875,
"learning_rate": 9.083910700538877e-07,
"loss": 1.0252,
"num_input_tokens_seen": 235171998,
"step": 6200
},
{
"epoch": 0.9698462485808301,
"grad_norm": 1.59375,
"learning_rate": 6.004618937644343e-07,
"loss": 1.033,
"num_input_tokens_seen": 238903492,
"step": 6300
},
{
"epoch": 0.9852406334789385,
"grad_norm": 1.671875,
"learning_rate": 2.9253271747498076e-07,
"loss": 1.0206,
"num_input_tokens_seen": 242682149,
"step": 6400
},
{
"epoch": 0.9998652991321415,
"num_input_tokens_seen": 246267812,
"step": 6495,
"total_flos": 1.1089328326722978e+19,
"train_loss": 1.0435700872478528,
"train_runtime": 38249.836,
"train_samples_per_second": 5.434,
"train_steps_per_second": 0.17,
"train_tokens_per_second": 1606.468
}
],
"logging_steps": 100,
"max_steps": 6495,
"num_input_tokens_seen": 246267812,
"num_train_epochs": 1,
"save_steps": 800,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1089328326722978e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}