{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9998652991321415,
  "eval_steps": 500,
  "global_step": 6495,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015394384898108415,
      "grad_norm": 1.9140625,
      "learning_rate": 1.9692070823710548e-05,
      "loss": 1.1459,
      "num_input_tokens_seen": 3815960,
      "step": 100
    },
    {
      "epoch": 0.03078876979621683,
      "grad_norm": 1.5703125,
      "learning_rate": 1.9384141647421097e-05,
      "loss": 1.0878,
      "num_input_tokens_seen": 7588127,
      "step": 200
    },
    {
      "epoch": 0.04618315469432525,
      "grad_norm": 1.625,
      "learning_rate": 1.907621247113164e-05,
      "loss": 1.0942,
      "num_input_tokens_seen": 11373466,
      "step": 300
    },
    {
      "epoch": 0.06157753959243366,
      "grad_norm": 1.6328125,
      "learning_rate": 1.876828329484219e-05,
      "loss": 1.0788,
      "num_input_tokens_seen": 15148940,
      "step": 400
    },
    {
      "epoch": 0.07697192449054208,
      "grad_norm": 1.703125,
      "learning_rate": 1.8460354118552735e-05,
      "loss": 1.0807,
      "num_input_tokens_seen": 18941722,
      "step": 500
    },
    {
      "epoch": 0.0923663093886505,
      "grad_norm": 1.765625,
      "learning_rate": 1.815242494226328e-05,
      "loss": 1.0628,
      "num_input_tokens_seen": 22691439,
      "step": 600
    },
    {
      "epoch": 0.1077606942867589,
      "grad_norm": 1.890625,
      "learning_rate": 1.7844495765973827e-05,
      "loss": 1.057,
      "num_input_tokens_seen": 26519584,
      "step": 700
    },
    {
      "epoch": 0.12315507918486732,
      "grad_norm": 1.59375,
      "learning_rate": 1.7536566589684373e-05,
      "loss": 1.0591,
      "num_input_tokens_seen": 30335003,
      "step": 800
    },
    {
      "epoch": 0.13854946408297575,
      "grad_norm": 1.5859375,
      "learning_rate": 1.722863741339492e-05,
      "loss": 1.0597,
      "num_input_tokens_seen": 34122220,
      "step": 900
    },
    {
      "epoch": 0.15394384898108415,
      "grad_norm": 2.140625,
      "learning_rate": 1.6920708237105468e-05,
      "loss": 1.0784,
      "num_input_tokens_seen": 37937918,
      "step": 1000
    },
    {
      "epoch": 0.16933823387919256,
      "grad_norm": 1.7265625,
      "learning_rate": 1.6612779060816014e-05,
      "loss": 1.0598,
      "num_input_tokens_seen": 41762422,
      "step": 1100
    },
    {
      "epoch": 0.184732618777301,
      "grad_norm": 1.65625,
      "learning_rate": 1.630484988452656e-05,
      "loss": 1.0658,
      "num_input_tokens_seen": 45541351,
      "step": 1200
    },
    {
      "epoch": 0.2001270036754094,
      "grad_norm": 1.9453125,
      "learning_rate": 1.5996920708237106e-05,
      "loss": 1.0557,
      "num_input_tokens_seen": 49317373,
      "step": 1300
    },
    {
      "epoch": 0.2155213885735178,
      "grad_norm": 1.765625,
      "learning_rate": 1.5688991531947652e-05,
      "loss": 1.0597,
      "num_input_tokens_seen": 53117895,
      "step": 1400
    },
    {
      "epoch": 0.23091577347162623,
      "grad_norm": 1.765625,
      "learning_rate": 1.53810623556582e-05,
      "loss": 1.0616,
      "num_input_tokens_seen": 56839161,
      "step": 1500
    },
    {
      "epoch": 0.24631015836973463,
      "grad_norm": 1.609375,
      "learning_rate": 1.5073133179368746e-05,
      "loss": 1.053,
      "num_input_tokens_seen": 60623928,
      "step": 1600
    },
    {
      "epoch": 0.26170454326784304,
      "grad_norm": 1.609375,
      "learning_rate": 1.4765204003079292e-05,
      "loss": 1.0483,
      "num_input_tokens_seen": 64408460,
      "step": 1700
    },
    {
      "epoch": 0.2770989281659515,
      "grad_norm": 1.6640625,
      "learning_rate": 1.445727482678984e-05,
      "loss": 1.049,
      "num_input_tokens_seen": 68223027,
      "step": 1800
    },
    {
      "epoch": 0.2924933130640599,
      "grad_norm": 1.953125,
      "learning_rate": 1.4149345650500385e-05,
      "loss": 1.0482,
      "num_input_tokens_seen": 71994158,
      "step": 1900
    },
    {
      "epoch": 0.3078876979621683,
      "grad_norm": 1.765625,
      "learning_rate": 1.3841416474210933e-05,
      "loss": 1.0497,
      "num_input_tokens_seen": 75865760,
      "step": 2000
    },
    {
      "epoch": 0.3232820828602767,
      "grad_norm": 2.015625,
      "learning_rate": 1.3533487297921479e-05,
      "loss": 1.0409,
      "num_input_tokens_seen": 79623921,
      "step": 2100
    },
    {
      "epoch": 0.3386764677583851,
      "grad_norm": 1.7890625,
      "learning_rate": 1.3225558121632025e-05,
      "loss": 1.0501,
      "num_input_tokens_seen": 83411874,
      "step": 2200
    },
    {
      "epoch": 0.3540708526564935,
      "grad_norm": 1.6328125,
      "learning_rate": 1.2917628945342572e-05,
      "loss": 1.0542,
      "num_input_tokens_seen": 87182489,
      "step": 2300
    },
    {
      "epoch": 0.369465237554602,
      "grad_norm": 1.8671875,
      "learning_rate": 1.2609699769053118e-05,
      "loss": 1.0482,
      "num_input_tokens_seen": 90989837,
      "step": 2400
    },
    {
      "epoch": 0.3848596224527104,
      "grad_norm": 1.9375,
      "learning_rate": 1.2301770592763664e-05,
      "loss": 1.0404,
      "num_input_tokens_seen": 94853551,
      "step": 2500
    },
    {
      "epoch": 0.4002540073508188,
      "grad_norm": 2.015625,
      "learning_rate": 1.1993841416474212e-05,
      "loss": 1.0401,
      "num_input_tokens_seen": 98649900,
      "step": 2600
    },
    {
      "epoch": 0.4156483922489272,
      "grad_norm": 1.9609375,
      "learning_rate": 1.1685912240184758e-05,
      "loss": 1.0453,
      "num_input_tokens_seen": 102455430,
      "step": 2700
    },
    {
      "epoch": 0.4310427771470356,
      "grad_norm": 1.75,
      "learning_rate": 1.1377983063895306e-05,
      "loss": 1.0335,
      "num_input_tokens_seen": 106279858,
      "step": 2800
    },
    {
      "epoch": 0.44643716204514405,
      "grad_norm": 1.6484375,
      "learning_rate": 1.1070053887605852e-05,
      "loss": 1.0486,
      "num_input_tokens_seen": 110042375,
      "step": 2900
    },
    {
      "epoch": 0.46183154694325246,
      "grad_norm": 1.703125,
      "learning_rate": 1.0762124711316398e-05,
      "loss": 1.0347,
      "num_input_tokens_seen": 113836688,
      "step": 3000
    },
    {
      "epoch": 0.47722593184136086,
      "grad_norm": 1.7265625,
      "learning_rate": 1.0454195535026945e-05,
      "loss": 1.041,
      "num_input_tokens_seen": 117581458,
      "step": 3100
    },
    {
      "epoch": 0.49262031673946927,
      "grad_norm": 1.8359375,
      "learning_rate": 1.0146266358737491e-05,
      "loss": 1.0357,
      "num_input_tokens_seen": 121341275,
      "step": 3200
    },
    {
      "epoch": 0.5080147016375777,
      "grad_norm": 1.546875,
      "learning_rate": 9.838337182448037e-06,
      "loss": 1.0374,
      "num_input_tokens_seen": 125088162,
      "step": 3300
    },
    {
      "epoch": 0.5234090865356861,
      "grad_norm": 1.4453125,
      "learning_rate": 9.530408006158585e-06,
      "loss": 1.0255,
      "num_input_tokens_seen": 128901749,
      "step": 3400
    },
    {
      "epoch": 0.5388034714337945,
      "grad_norm": 1.46875,
      "learning_rate": 9.22247882986913e-06,
      "loss": 1.0282,
      "num_input_tokens_seen": 132736866,
      "step": 3500
    },
    {
      "epoch": 0.554197856331903,
      "grad_norm": 1.6953125,
      "learning_rate": 8.914549653579677e-06,
      "loss": 1.0398,
      "num_input_tokens_seen": 136595429,
      "step": 3600
    },
    {
      "epoch": 0.5695922412300114,
      "grad_norm": 1.7578125,
      "learning_rate": 8.606620477290224e-06,
      "loss": 1.025,
      "num_input_tokens_seen": 140426462,
      "step": 3700
    },
    {
      "epoch": 0.5849866261281198,
      "grad_norm": 1.5859375,
      "learning_rate": 8.29869130100077e-06,
      "loss": 1.0291,
      "num_input_tokens_seen": 144234914,
      "step": 3800
    },
    {
      "epoch": 0.6003810110262282,
      "grad_norm": 2.453125,
      "learning_rate": 7.990762124711316e-06,
      "loss": 1.0233,
      "num_input_tokens_seen": 148032058,
      "step": 3900
    },
    {
      "epoch": 0.6157753959243366,
      "grad_norm": 1.734375,
      "learning_rate": 7.682832948421864e-06,
      "loss": 1.0347,
      "num_input_tokens_seen": 151814536,
      "step": 4000
    },
    {
      "epoch": 0.631169780822445,
      "grad_norm": 1.7578125,
      "learning_rate": 7.37490377213241e-06,
      "loss": 1.0457,
      "num_input_tokens_seen": 155598931,
      "step": 4100
    },
    {
      "epoch": 0.6465641657205534,
      "grad_norm": 1.453125,
      "learning_rate": 7.066974595842957e-06,
      "loss": 1.0473,
      "num_input_tokens_seen": 159326528,
      "step": 4200
    },
    {
      "epoch": 0.6619585506186618,
      "grad_norm": 1.7109375,
      "learning_rate": 6.7590454195535035e-06,
      "loss": 1.0482,
      "num_input_tokens_seen": 163084806,
      "step": 4300
    },
    {
      "epoch": 0.6773529355167702,
      "grad_norm": 1.734375,
      "learning_rate": 6.4511162432640495e-06,
      "loss": 1.0267,
      "num_input_tokens_seen": 166891971,
      "step": 4400
    },
    {
      "epoch": 0.6927473204148786,
      "grad_norm": 1.703125,
      "learning_rate": 6.143187066974596e-06,
      "loss": 1.0165,
      "num_input_tokens_seen": 170700927,
      "step": 4500
    },
    {
      "epoch": 0.708141705312987,
      "grad_norm": 1.7421875,
      "learning_rate": 5.835257890685143e-06,
      "loss": 1.0386,
      "num_input_tokens_seen": 174509215,
      "step": 4600
    },
    {
      "epoch": 0.7235360902110956,
      "grad_norm": 1.859375,
      "learning_rate": 5.52732871439569e-06,
      "loss": 1.0286,
      "num_input_tokens_seen": 178355672,
      "step": 4700
    },
    {
      "epoch": 0.738930475109204,
      "grad_norm": 1.609375,
      "learning_rate": 5.219399538106236e-06,
      "loss": 1.0199,
      "num_input_tokens_seen": 182172598,
      "step": 4800
    },
    {
      "epoch": 0.7543248600073124,
      "grad_norm": 1.7734375,
      "learning_rate": 4.911470361816783e-06,
      "loss": 1.023,
      "num_input_tokens_seen": 185945174,
      "step": 4900
    },
    {
      "epoch": 0.7697192449054208,
      "grad_norm": 1.4765625,
      "learning_rate": 4.6035411855273295e-06,
      "loss": 1.0317,
      "num_input_tokens_seen": 189723364,
      "step": 5000
    },
    {
      "epoch": 0.7851136298035292,
      "grad_norm": 1.53125,
      "learning_rate": 4.2956120092378755e-06,
      "loss": 1.0262,
      "num_input_tokens_seen": 193515488,
      "step": 5100
    },
    {
      "epoch": 0.8005080147016376,
      "grad_norm": 1.7109375,
      "learning_rate": 3.987682832948422e-06,
      "loss": 1.0276,
      "num_input_tokens_seen": 197294619,
      "step": 5200
    },
    {
      "epoch": 0.815902399599746,
      "grad_norm": 1.6640625,
      "learning_rate": 3.6797536566589687e-06,
      "loss": 1.0141,
      "num_input_tokens_seen": 201079998,
      "step": 5300
    },
    {
      "epoch": 0.8312967844978544,
      "grad_norm": 1.875,
      "learning_rate": 3.3718244803695155e-06,
      "loss": 1.0406,
      "num_input_tokens_seen": 204892665,
      "step": 5400
    },
    {
      "epoch": 0.8466911693959628,
      "grad_norm": 1.75,
      "learning_rate": 3.063895304080062e-06,
      "loss": 1.0144,
      "num_input_tokens_seen": 208692509,
      "step": 5500
    },
    {
      "epoch": 0.8620855542940712,
      "grad_norm": 1.625,
      "learning_rate": 2.7559661277906087e-06,
      "loss": 1.0328,
      "num_input_tokens_seen": 212482079,
      "step": 5600
    },
    {
      "epoch": 0.8774799391921797,
      "grad_norm": 1.4453125,
      "learning_rate": 2.4480369515011547e-06,
      "loss": 1.0158,
      "num_input_tokens_seen": 216265581,
      "step": 5700
    },
    {
      "epoch": 0.8928743240902881,
      "grad_norm": 1.421875,
      "learning_rate": 2.1401077752117015e-06,
      "loss": 1.0376,
      "num_input_tokens_seen": 220058453,
      "step": 5800
    },
    {
      "epoch": 0.9082687089883965,
      "grad_norm": 1.875,
      "learning_rate": 1.8321785989222479e-06,
      "loss": 1.0116,
      "num_input_tokens_seen": 223849666,
      "step": 5900
    },
    {
      "epoch": 0.9236630938865049,
      "grad_norm": 1.671875,
      "learning_rate": 1.5242494226327945e-06,
      "loss": 1.0319,
      "num_input_tokens_seen": 227586204,
      "step": 6000
    },
    {
      "epoch": 0.9390574787846133,
      "grad_norm": 1.578125,
      "learning_rate": 1.216320246343341e-06,
      "loss": 1.0248,
      "num_input_tokens_seen": 231383644,
      "step": 6100
    },
    {
      "epoch": 0.9544518636827217,
      "grad_norm": 1.71875,
      "learning_rate": 9.083910700538877e-07,
      "loss": 1.0252,
      "num_input_tokens_seen": 235171998,
      "step": 6200
    },
    {
      "epoch": 0.9698462485808301,
      "grad_norm": 1.59375,
      "learning_rate": 6.004618937644343e-07,
      "loss": 1.033,
      "num_input_tokens_seen": 238903492,
      "step": 6300
    },
    {
      "epoch": 0.9852406334789385,
      "grad_norm": 1.671875,
      "learning_rate": 2.9253271747498076e-07,
      "loss": 1.0206,
      "num_input_tokens_seen": 242682149,
      "step": 6400
    },
    {
      "epoch": 0.9998652991321415,
      "num_input_tokens_seen": 246267812,
      "step": 6495,
      "total_flos": 1.1089328326722978e+19,
      "train_loss": 1.0435700872478528,
      "train_runtime": 38249.836,
      "train_samples_per_second": 5.434,
      "train_steps_per_second": 0.17,
      "train_tokens_per_second": 1606.468
    }
  ],
  "logging_steps": 100,
  "max_steps": 6495,
  "num_input_tokens_seen": 246267812,
  "num_train_epochs": 1,
  "save_steps": 800,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1089328326722978e+19,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}