|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 29370, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.17024174327545114, |
|
"grad_norm": 0.6650531888008118, |
|
"learning_rate": 1.96595165134491e-05, |
|
"loss": 0.2305, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.34048348655090227, |
|
"grad_norm": 2.3951523303985596, |
|
"learning_rate": 1.9319033026898198e-05, |
|
"loss": 0.1633, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5107252298263534, |
|
"grad_norm": 0.9463224411010742, |
|
"learning_rate": 1.8978549540347296e-05, |
|
"loss": 0.1548, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6809669731018045, |
|
"grad_norm": 1.2913334369659424, |
|
"learning_rate": 1.8638066053796395e-05, |
|
"loss": 0.1487, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8512087163772557, |
|
"grad_norm": 1.0490585565567017, |
|
"learning_rate": 1.829758256724549e-05, |
|
"loss": 0.1435, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.0214504596527068, |
|
"grad_norm": 1.5652408599853516, |
|
"learning_rate": 1.7957099080694588e-05, |
|
"loss": 0.1397, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.191692202928158, |
|
"grad_norm": 1.1205641031265259, |
|
"learning_rate": 1.7616615594143686e-05, |
|
"loss": 0.1283, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.361933946203609, |
|
"grad_norm": 1.0744216442108154, |
|
"learning_rate": 1.727613210759278e-05, |
|
"loss": 0.1292, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.5321756894790601, |
|
"grad_norm": 1.089113712310791, |
|
"learning_rate": 1.693564862104188e-05, |
|
"loss": 0.1273, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.7024174327545114, |
|
"grad_norm": 2.334705114364624, |
|
"learning_rate": 1.6595165134490977e-05, |
|
"loss": 0.1275, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.8726591760299627, |
|
"grad_norm": 1.1323754787445068, |
|
"learning_rate": 1.6254681647940076e-05, |
|
"loss": 0.1251, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.0429009193054135, |
|
"grad_norm": 0.8757261633872986, |
|
"learning_rate": 1.5914198161389174e-05, |
|
"loss": 0.1213, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.213142662580865, |
|
"grad_norm": 1.1232839822769165, |
|
"learning_rate": 1.5573714674838272e-05, |
|
"loss": 0.1104, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.383384405856316, |
|
"grad_norm": 0.8715490698814392, |
|
"learning_rate": 1.5233231188287369e-05, |
|
"loss": 0.1099, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.553626149131767, |
|
"grad_norm": 1.2656769752502441, |
|
"learning_rate": 1.4892747701736467e-05, |
|
"loss": 0.1102, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.723867892407218, |
|
"grad_norm": 1.1669204235076904, |
|
"learning_rate": 1.4552264215185565e-05, |
|
"loss": 0.1101, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.8941096356826694, |
|
"grad_norm": 1.0073705911636353, |
|
"learning_rate": 1.4211780728634664e-05, |
|
"loss": 0.1085, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.0643513789581207, |
|
"grad_norm": 1.1393821239471436, |
|
"learning_rate": 1.3871297242083762e-05, |
|
"loss": 0.1027, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.2345931222335715, |
|
"grad_norm": 1.4679887294769287, |
|
"learning_rate": 1.3530813755532857e-05, |
|
"loss": 0.0926, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.404834865509023, |
|
"grad_norm": 0.8374710083007812, |
|
"learning_rate": 1.3190330268981955e-05, |
|
"loss": 0.0925, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.575076608784474, |
|
"grad_norm": 1.2514032125473022, |
|
"learning_rate": 1.2849846782431053e-05, |
|
"loss": 0.0927, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.7453183520599254, |
|
"grad_norm": 1.5251351594924927, |
|
"learning_rate": 1.250936329588015e-05, |
|
"loss": 0.0929, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.915560095335376, |
|
"grad_norm": 1.0668872594833374, |
|
"learning_rate": 1.2168879809329248e-05, |
|
"loss": 0.0923, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 4.085801838610827, |
|
"grad_norm": 1.0528796911239624, |
|
"learning_rate": 1.1828396322778346e-05, |
|
"loss": 0.0848, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 4.256043581886279, |
|
"grad_norm": 1.316041111946106, |
|
"learning_rate": 1.1487912836227445e-05, |
|
"loss": 0.0767, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 4.42628532516173, |
|
"grad_norm": 1.6180927753448486, |
|
"learning_rate": 1.1147429349676541e-05, |
|
"loss": 0.077, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 4.596527068437181, |
|
"grad_norm": 1.2156362533569336, |
|
"learning_rate": 1.080694586312564e-05, |
|
"loss": 0.0773, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 4.766768811712632, |
|
"grad_norm": 1.621887445449829, |
|
"learning_rate": 1.0466462376574738e-05, |
|
"loss": 0.0773, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 4.937010554988083, |
|
"grad_norm": 1.5306437015533447, |
|
"learning_rate": 1.0125978890023836e-05, |
|
"loss": 0.0774, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 5.107252298263534, |
|
"grad_norm": 22.37914276123047, |
|
"learning_rate": 9.785495403472932e-06, |
|
"loss": 0.0678, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 5.2774940415389855, |
|
"grad_norm": 1.3330860137939453, |
|
"learning_rate": 9.44501191692203e-06, |
|
"loss": 0.0634, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 5.447735784814436, |
|
"grad_norm": 1.9692567586898804, |
|
"learning_rate": 9.104528430371127e-06, |
|
"loss": 0.0634, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 5.617977528089888, |
|
"grad_norm": 1.3089221715927124, |
|
"learning_rate": 8.764044943820226e-06, |
|
"loss": 0.0635, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 5.788219271365339, |
|
"grad_norm": 1.5806821584701538, |
|
"learning_rate": 8.423561457269324e-06, |
|
"loss": 0.0637, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 5.95846101464079, |
|
"grad_norm": 1.579941987991333, |
|
"learning_rate": 8.08307797071842e-06, |
|
"loss": 0.0633, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 6.128702757916241, |
|
"grad_norm": 1.5726784467697144, |
|
"learning_rate": 7.742594484167519e-06, |
|
"loss": 0.054, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 6.298944501191692, |
|
"grad_norm": 1.140791654586792, |
|
"learning_rate": 7.402110997616616e-06, |
|
"loss": 0.052, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 6.469186244467143, |
|
"grad_norm": 1.6548409461975098, |
|
"learning_rate": 7.061627511065714e-06, |
|
"loss": 0.0516, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 6.639427987742595, |
|
"grad_norm": 1.3514069318771362, |
|
"learning_rate": 6.721144024514812e-06, |
|
"loss": 0.0522, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 6.809669731018046, |
|
"grad_norm": 1.5590009689331055, |
|
"learning_rate": 6.38066053796391e-06, |
|
"loss": 0.0518, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 6.9799114742934965, |
|
"grad_norm": 1.2986799478530884, |
|
"learning_rate": 6.0401770514130066e-06, |
|
"loss": 0.0524, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 7.150153217568948, |
|
"grad_norm": 1.5317639112472534, |
|
"learning_rate": 5.699693564862104e-06, |
|
"loss": 0.044, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 7.320394960844399, |
|
"grad_norm": 2.344708204269409, |
|
"learning_rate": 5.359210078311202e-06, |
|
"loss": 0.0415, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 7.49063670411985, |
|
"grad_norm": 3.3057548999786377, |
|
"learning_rate": 5.0187265917603005e-06, |
|
"loss": 0.0418, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 7.6608784473953015, |
|
"grad_norm": 1.3382242918014526, |
|
"learning_rate": 4.678243105209398e-06, |
|
"loss": 0.0419, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 7.831120190670752, |
|
"grad_norm": 1.7018738985061646, |
|
"learning_rate": 4.337759618658495e-06, |
|
"loss": 0.0421, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 8.001361933946203, |
|
"grad_norm": 0.9316732883453369, |
|
"learning_rate": 3.997276132107593e-06, |
|
"loss": 0.0414, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 8.171603677221654, |
|
"grad_norm": 1.4249956607818604, |
|
"learning_rate": 3.656792645556691e-06, |
|
"loss": 0.0346, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 8.341845420497107, |
|
"grad_norm": 1.263279914855957, |
|
"learning_rate": 3.3163091590057884e-06, |
|
"loss": 0.0345, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 8.512087163772557, |
|
"grad_norm": 2.6162939071655273, |
|
"learning_rate": 2.9758256724548862e-06, |
|
"loss": 0.0342, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 8.682328907048008, |
|
"grad_norm": 1.2574002742767334, |
|
"learning_rate": 2.635342185903984e-06, |
|
"loss": 0.0345, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 8.85257065032346, |
|
"grad_norm": 5.4230732917785645, |
|
"learning_rate": 2.2948586993530815e-06, |
|
"loss": 0.0344, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 9.02281239359891, |
|
"grad_norm": 0.885810136795044, |
|
"learning_rate": 1.9543752128021793e-06, |
|
"loss": 0.0333, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 9.19305413687436, |
|
"grad_norm": 1.7516717910766602, |
|
"learning_rate": 1.6138917262512767e-06, |
|
"loss": 0.0291, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 9.363295880149813, |
|
"grad_norm": 1.1372159719467163, |
|
"learning_rate": 1.2734082397003748e-06, |
|
"loss": 0.0293, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 9.533537623425264, |
|
"grad_norm": 0.9269993305206299, |
|
"learning_rate": 9.329247531494723e-07, |
|
"loss": 0.0294, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 9.703779366700715, |
|
"grad_norm": 1.229074239730835, |
|
"learning_rate": 5.9244126659857e-07, |
|
"loss": 0.0291, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 9.874021109976166, |
|
"grad_norm": 2.4099299907684326, |
|
"learning_rate": 2.519577800476677e-07, |
|
"loss": 0.0289, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 29370, |
|
"total_flos": 8.758967154215731e+17, |
|
"train_loss": 0.07915824020562384, |
|
"train_runtime": 29556.1772, |
|
"train_samples_per_second": 31.797, |
|
"train_steps_per_second": 0.994 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 29370, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.758967154215731e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|