|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9807398349128706, |
|
"eval_steps": 500, |
|
"global_step": 19500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07642922653622745, |
|
"grad_norm": 4.5165534019470215, |
|
"learning_rate": 4.8738917762152245e-05, |
|
"loss": 0.89, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.07642922653622745, |
|
"eval_loss": 0.8767776489257812, |
|
"eval_runtime": 159.519, |
|
"eval_samples_per_second": 36.46, |
|
"eval_steps_per_second": 4.557, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1528584530724549, |
|
"grad_norm": 3.430340528488159, |
|
"learning_rate": 4.746509731988179e-05, |
|
"loss": 0.8656, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1528584530724549, |
|
"eval_loss": 0.8811877965927124, |
|
"eval_runtime": 157.6806, |
|
"eval_samples_per_second": 36.885, |
|
"eval_steps_per_second": 4.611, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.22928767960868235, |
|
"grad_norm": 3.6226563453674316, |
|
"learning_rate": 4.619127687761133e-05, |
|
"loss": 0.8897, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.22928767960868235, |
|
"eval_loss": 0.8792645931243896, |
|
"eval_runtime": 157.5757, |
|
"eval_samples_per_second": 36.909, |
|
"eval_steps_per_second": 4.614, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3057169061449098, |
|
"grad_norm": 2.8311519622802734, |
|
"learning_rate": 4.4917456435340875e-05, |
|
"loss": 0.8921, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3057169061449098, |
|
"eval_loss": 0.8744860291481018, |
|
"eval_runtime": 157.645, |
|
"eval_samples_per_second": 36.893, |
|
"eval_steps_per_second": 4.612, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3821461326811373, |
|
"grad_norm": 2.6471974849700928, |
|
"learning_rate": 4.364363599307042e-05, |
|
"loss": 0.8826, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3821461326811373, |
|
"eval_loss": 0.8666115403175354, |
|
"eval_runtime": 157.6483, |
|
"eval_samples_per_second": 36.892, |
|
"eval_steps_per_second": 4.612, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4585753592173647, |
|
"grad_norm": 3.0229456424713135, |
|
"learning_rate": 4.236981555079996e-05, |
|
"loss": 0.8613, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4585753592173647, |
|
"eval_loss": 0.8605388402938843, |
|
"eval_runtime": 157.698, |
|
"eval_samples_per_second": 36.881, |
|
"eval_steps_per_second": 4.61, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5350045857535922, |
|
"grad_norm": 3.2055766582489014, |
|
"learning_rate": 4.1095995108529505e-05, |
|
"loss": 0.8648, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.5350045857535922, |
|
"eval_loss": 0.8576663732528687, |
|
"eval_runtime": 157.7251, |
|
"eval_samples_per_second": 36.874, |
|
"eval_steps_per_second": 4.609, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6114338122898196, |
|
"grad_norm": 2.2706174850463867, |
|
"learning_rate": 3.982217466625904e-05, |
|
"loss": 0.8607, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6114338122898196, |
|
"eval_loss": 0.8507756590843201, |
|
"eval_runtime": 157.6341, |
|
"eval_samples_per_second": 36.896, |
|
"eval_steps_per_second": 4.612, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.687863038826047, |
|
"grad_norm": 3.0524044036865234, |
|
"learning_rate": 3.8548354223988585e-05, |
|
"loss": 0.863, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.687863038826047, |
|
"eval_loss": 0.8432514667510986, |
|
"eval_runtime": 157.6289, |
|
"eval_samples_per_second": 36.897, |
|
"eval_steps_per_second": 4.612, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.7642922653622746, |
|
"grad_norm": 2.707669258117676, |
|
"learning_rate": 3.727453378171813e-05, |
|
"loss": 0.8444, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7642922653622746, |
|
"eval_loss": 0.8389096856117249, |
|
"eval_runtime": 157.6926, |
|
"eval_samples_per_second": 36.882, |
|
"eval_steps_per_second": 4.61, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.840721491898502, |
|
"grad_norm": 3.0052075386047363, |
|
"learning_rate": 3.600071333944767e-05, |
|
"loss": 0.871, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.840721491898502, |
|
"eval_loss": 0.8305906057357788, |
|
"eval_runtime": 157.7772, |
|
"eval_samples_per_second": 36.862, |
|
"eval_steps_per_second": 4.608, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.9171507184347294, |
|
"grad_norm": 1.7623426914215088, |
|
"learning_rate": 3.4726892897177216e-05, |
|
"loss": 0.8328, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9171507184347294, |
|
"eval_loss": 0.8280592560768127, |
|
"eval_runtime": 157.7986, |
|
"eval_samples_per_second": 36.857, |
|
"eval_steps_per_second": 4.607, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9935799449709569, |
|
"grad_norm": 2.850409746170044, |
|
"learning_rate": 3.345307245490676e-05, |
|
"loss": 0.835, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.9935799449709569, |
|
"eval_loss": 0.8225808143615723, |
|
"eval_runtime": 157.7059, |
|
"eval_samples_per_second": 36.879, |
|
"eval_steps_per_second": 4.61, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.0700091715071844, |
|
"grad_norm": 2.08107590675354, |
|
"learning_rate": 3.21792520126363e-05, |
|
"loss": 0.5759, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.0700091715071844, |
|
"eval_loss": 0.8543522357940674, |
|
"eval_runtime": 157.8069, |
|
"eval_samples_per_second": 36.855, |
|
"eval_steps_per_second": 4.607, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.1464383980434119, |
|
"grad_norm": 2.4801783561706543, |
|
"learning_rate": 3.0905431570365846e-05, |
|
"loss": 0.5493, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.1464383980434119, |
|
"eval_loss": 0.8509367108345032, |
|
"eval_runtime": 157.6691, |
|
"eval_samples_per_second": 36.887, |
|
"eval_steps_per_second": 4.611, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.2228676245796393, |
|
"grad_norm": 2.688427686691284, |
|
"learning_rate": 2.963161112809539e-05, |
|
"loss": 0.5516, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.2228676245796393, |
|
"eval_loss": 0.8434808254241943, |
|
"eval_runtime": 157.6951, |
|
"eval_samples_per_second": 36.881, |
|
"eval_steps_per_second": 4.61, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.2992968511158667, |
|
"grad_norm": 2.8583438396453857, |
|
"learning_rate": 2.8357790685824926e-05, |
|
"loss": 0.5608, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.2992968511158667, |
|
"eval_loss": 0.8415189981460571, |
|
"eval_runtime": 157.7043, |
|
"eval_samples_per_second": 36.879, |
|
"eval_steps_per_second": 4.61, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.375726077652094, |
|
"grad_norm": 2.8310320377349854, |
|
"learning_rate": 2.708397024355447e-05, |
|
"loss": 0.5468, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.375726077652094, |
|
"eval_loss": 0.8396986126899719, |
|
"eval_runtime": 157.7062, |
|
"eval_samples_per_second": 36.879, |
|
"eval_steps_per_second": 4.61, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.4521553041883215, |
|
"grad_norm": 3.0906243324279785, |
|
"learning_rate": 2.5810149801284013e-05, |
|
"loss": 0.5499, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.4521553041883215, |
|
"eval_loss": 0.8367328643798828, |
|
"eval_runtime": 157.7916, |
|
"eval_samples_per_second": 36.859, |
|
"eval_steps_per_second": 4.607, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.5285845307245491, |
|
"grad_norm": 3.706326723098755, |
|
"learning_rate": 2.4536329359013556e-05, |
|
"loss": 0.5503, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.5285845307245491, |
|
"eval_loss": 0.8307807445526123, |
|
"eval_runtime": 157.6389, |
|
"eval_samples_per_second": 36.894, |
|
"eval_steps_per_second": 4.612, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.6050137572607766, |
|
"grad_norm": 2.6108150482177734, |
|
"learning_rate": 2.3262508916743096e-05, |
|
"loss": 0.5388, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.6050137572607766, |
|
"eval_loss": 0.8295947313308716, |
|
"eval_runtime": 157.6638, |
|
"eval_samples_per_second": 36.889, |
|
"eval_steps_per_second": 4.611, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.681442983797004, |
|
"grad_norm": 1.6078243255615234, |
|
"learning_rate": 2.1991236115357182e-05, |
|
"loss": 0.5473, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.681442983797004, |
|
"eval_loss": 0.8229663372039795, |
|
"eval_runtime": 157.6846, |
|
"eval_samples_per_second": 36.884, |
|
"eval_steps_per_second": 4.61, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.7578722103332314, |
|
"grad_norm": 3.049797773361206, |
|
"learning_rate": 2.0717415673086722e-05, |
|
"loss": 0.5496, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.7578722103332314, |
|
"eval_loss": 0.8267400860786438, |
|
"eval_runtime": 157.7336, |
|
"eval_samples_per_second": 36.872, |
|
"eval_steps_per_second": 4.609, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.8343014368694588, |
|
"grad_norm": 2.292538642883301, |
|
"learning_rate": 1.9443595230816262e-05, |
|
"loss": 0.5448, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.8343014368694588, |
|
"eval_loss": 0.8191345930099487, |
|
"eval_runtime": 158.4293, |
|
"eval_samples_per_second": 36.71, |
|
"eval_steps_per_second": 4.589, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.9107306634056864, |
|
"grad_norm": 2.1699585914611816, |
|
"learning_rate": 1.8169774788545806e-05, |
|
"loss": 0.5419, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.9107306634056864, |
|
"eval_loss": 0.8131210803985596, |
|
"eval_runtime": 157.6629, |
|
"eval_samples_per_second": 36.889, |
|
"eval_steps_per_second": 4.611, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.9871598899419138, |
|
"grad_norm": 3.1323328018188477, |
|
"learning_rate": 1.689595434627535e-05, |
|
"loss": 0.5369, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.9871598899419138, |
|
"eval_loss": 0.8066145777702332, |
|
"eval_runtime": 157.7959, |
|
"eval_samples_per_second": 36.858, |
|
"eval_steps_per_second": 4.607, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.0635891164781412, |
|
"grad_norm": 3.656402826309204, |
|
"learning_rate": 1.562468154488943e-05, |
|
"loss": 0.3304, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.0635891164781412, |
|
"eval_loss": 0.9408266544342041, |
|
"eval_runtime": 157.7674, |
|
"eval_samples_per_second": 36.864, |
|
"eval_steps_per_second": 4.608, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.140018343014369, |
|
"grad_norm": 2.4427192211151123, |
|
"learning_rate": 1.4350861102618977e-05, |
|
"loss": 0.2759, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.140018343014369, |
|
"eval_loss": 0.942986011505127, |
|
"eval_runtime": 157.7087, |
|
"eval_samples_per_second": 36.878, |
|
"eval_steps_per_second": 4.61, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.216447569550596, |
|
"grad_norm": 1.6041910648345947, |
|
"learning_rate": 1.3077040660348518e-05, |
|
"loss": 0.2873, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.216447569550596, |
|
"eval_loss": 0.9449612498283386, |
|
"eval_runtime": 157.767, |
|
"eval_samples_per_second": 36.865, |
|
"eval_steps_per_second": 4.608, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.2928767960868237, |
|
"grad_norm": 3.233290433883667, |
|
"learning_rate": 1.180322021807806e-05, |
|
"loss": 0.2818, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.2928767960868237, |
|
"eval_loss": 0.9387638568878174, |
|
"eval_runtime": 157.7297, |
|
"eval_samples_per_second": 36.873, |
|
"eval_steps_per_second": 4.609, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.369306022623051, |
|
"grad_norm": 3.9653565883636475, |
|
"learning_rate": 1.0529399775807602e-05, |
|
"loss": 0.2795, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.369306022623051, |
|
"eval_loss": 0.9435889720916748, |
|
"eval_runtime": 157.7236, |
|
"eval_samples_per_second": 36.875, |
|
"eval_steps_per_second": 4.609, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.4457352491592785, |
|
"grad_norm": 2.453057289123535, |
|
"learning_rate": 9.255579333537145e-06, |
|
"loss": 0.2801, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.4457352491592785, |
|
"eval_loss": 0.9410313963890076, |
|
"eval_runtime": 157.7031, |
|
"eval_samples_per_second": 36.879, |
|
"eval_steps_per_second": 4.61, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.522164475695506, |
|
"grad_norm": 2.9924492835998535, |
|
"learning_rate": 7.981758891266687e-06, |
|
"loss": 0.2788, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.522164475695506, |
|
"eval_loss": 0.9427609443664551, |
|
"eval_runtime": 157.6635, |
|
"eval_samples_per_second": 36.889, |
|
"eval_steps_per_second": 4.611, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.5985937022317334, |
|
"grad_norm": 2.626593828201294, |
|
"learning_rate": 6.7079384489962305e-06, |
|
"loss": 0.2752, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.5985937022317334, |
|
"eval_loss": 0.9421259164810181, |
|
"eval_runtime": 157.689, |
|
"eval_samples_per_second": 36.883, |
|
"eval_steps_per_second": 4.61, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.675022928767961, |
|
"grad_norm": 2.623121500015259, |
|
"learning_rate": 5.436665647610313e-06, |
|
"loss": 0.2695, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.675022928767961, |
|
"eval_loss": 0.9395164251327515, |
|
"eval_runtime": 157.7665, |
|
"eval_samples_per_second": 36.865, |
|
"eval_steps_per_second": 4.608, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.751452155304188, |
|
"grad_norm": 2.4113175868988037, |
|
"learning_rate": 4.1653928462243965e-06, |
|
"loss": 0.2697, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.751452155304188, |
|
"eval_loss": 0.9405816197395325, |
|
"eval_runtime": 157.6703, |
|
"eval_samples_per_second": 36.887, |
|
"eval_steps_per_second": 4.611, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.827881381840416, |
|
"grad_norm": 3.3730709552764893, |
|
"learning_rate": 2.8915724039539386e-06, |
|
"loss": 0.2769, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.827881381840416, |
|
"eval_loss": 0.9389672875404358, |
|
"eval_runtime": 157.689, |
|
"eval_samples_per_second": 36.883, |
|
"eval_steps_per_second": 4.61, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.904310608376643, |
|
"grad_norm": 3.464594841003418, |
|
"learning_rate": 1.6177519616834812e-06, |
|
"loss": 0.271, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.904310608376643, |
|
"eval_loss": 0.9393123984336853, |
|
"eval_runtime": 157.8608, |
|
"eval_samples_per_second": 36.843, |
|
"eval_steps_per_second": 4.605, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.9807398349128706, |
|
"grad_norm": 2.9936461448669434, |
|
"learning_rate": 3.439315194130236e-07, |
|
"loss": 0.2584, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.9807398349128706, |
|
"eval_loss": 0.9363918900489807, |
|
"eval_runtime": 157.724, |
|
"eval_samples_per_second": 36.875, |
|
"eval_steps_per_second": 4.609, |
|
"step": 19500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 19626, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.11086099873792e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|