MNLP_M2_rag_model / checkpoint-19500 /trainer_state.json
qchapp's picture
Upload folder using huggingface_hub
f46b8d2 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9807398349128706,
"eval_steps": 500,
"global_step": 19500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07642922653622745,
"grad_norm": 4.5165534019470215,
"learning_rate": 4.8738917762152245e-05,
"loss": 0.89,
"step": 500
},
{
"epoch": 0.07642922653622745,
"eval_loss": 0.8767776489257812,
"eval_runtime": 159.519,
"eval_samples_per_second": 36.46,
"eval_steps_per_second": 4.557,
"step": 500
},
{
"epoch": 0.1528584530724549,
"grad_norm": 3.430340528488159,
"learning_rate": 4.746509731988179e-05,
"loss": 0.8656,
"step": 1000
},
{
"epoch": 0.1528584530724549,
"eval_loss": 0.8811877965927124,
"eval_runtime": 157.6806,
"eval_samples_per_second": 36.885,
"eval_steps_per_second": 4.611,
"step": 1000
},
{
"epoch": 0.22928767960868235,
"grad_norm": 3.6226563453674316,
"learning_rate": 4.619127687761133e-05,
"loss": 0.8897,
"step": 1500
},
{
"epoch": 0.22928767960868235,
"eval_loss": 0.8792645931243896,
"eval_runtime": 157.5757,
"eval_samples_per_second": 36.909,
"eval_steps_per_second": 4.614,
"step": 1500
},
{
"epoch": 0.3057169061449098,
"grad_norm": 2.8311519622802734,
"learning_rate": 4.4917456435340875e-05,
"loss": 0.8921,
"step": 2000
},
{
"epoch": 0.3057169061449098,
"eval_loss": 0.8744860291481018,
"eval_runtime": 157.645,
"eval_samples_per_second": 36.893,
"eval_steps_per_second": 4.612,
"step": 2000
},
{
"epoch": 0.3821461326811373,
"grad_norm": 2.6471974849700928,
"learning_rate": 4.364363599307042e-05,
"loss": 0.8826,
"step": 2500
},
{
"epoch": 0.3821461326811373,
"eval_loss": 0.8666115403175354,
"eval_runtime": 157.6483,
"eval_samples_per_second": 36.892,
"eval_steps_per_second": 4.612,
"step": 2500
},
{
"epoch": 0.4585753592173647,
"grad_norm": 3.0229456424713135,
"learning_rate": 4.236981555079996e-05,
"loss": 0.8613,
"step": 3000
},
{
"epoch": 0.4585753592173647,
"eval_loss": 0.8605388402938843,
"eval_runtime": 157.698,
"eval_samples_per_second": 36.881,
"eval_steps_per_second": 4.61,
"step": 3000
},
{
"epoch": 0.5350045857535922,
"grad_norm": 3.2055766582489014,
"learning_rate": 4.1095995108529505e-05,
"loss": 0.8648,
"step": 3500
},
{
"epoch": 0.5350045857535922,
"eval_loss": 0.8576663732528687,
"eval_runtime": 157.7251,
"eval_samples_per_second": 36.874,
"eval_steps_per_second": 4.609,
"step": 3500
},
{
"epoch": 0.6114338122898196,
"grad_norm": 2.2706174850463867,
"learning_rate": 3.982217466625904e-05,
"loss": 0.8607,
"step": 4000
},
{
"epoch": 0.6114338122898196,
"eval_loss": 0.8507756590843201,
"eval_runtime": 157.6341,
"eval_samples_per_second": 36.896,
"eval_steps_per_second": 4.612,
"step": 4000
},
{
"epoch": 0.687863038826047,
"grad_norm": 3.0524044036865234,
"learning_rate": 3.8548354223988585e-05,
"loss": 0.863,
"step": 4500
},
{
"epoch": 0.687863038826047,
"eval_loss": 0.8432514667510986,
"eval_runtime": 157.6289,
"eval_samples_per_second": 36.897,
"eval_steps_per_second": 4.612,
"step": 4500
},
{
"epoch": 0.7642922653622746,
"grad_norm": 2.707669258117676,
"learning_rate": 3.727453378171813e-05,
"loss": 0.8444,
"step": 5000
},
{
"epoch": 0.7642922653622746,
"eval_loss": 0.8389096856117249,
"eval_runtime": 157.6926,
"eval_samples_per_second": 36.882,
"eval_steps_per_second": 4.61,
"step": 5000
},
{
"epoch": 0.840721491898502,
"grad_norm": 3.0052075386047363,
"learning_rate": 3.600071333944767e-05,
"loss": 0.871,
"step": 5500
},
{
"epoch": 0.840721491898502,
"eval_loss": 0.8305906057357788,
"eval_runtime": 157.7772,
"eval_samples_per_second": 36.862,
"eval_steps_per_second": 4.608,
"step": 5500
},
{
"epoch": 0.9171507184347294,
"grad_norm": 1.7623426914215088,
"learning_rate": 3.4726892897177216e-05,
"loss": 0.8328,
"step": 6000
},
{
"epoch": 0.9171507184347294,
"eval_loss": 0.8280592560768127,
"eval_runtime": 157.7986,
"eval_samples_per_second": 36.857,
"eval_steps_per_second": 4.607,
"step": 6000
},
{
"epoch": 0.9935799449709569,
"grad_norm": 2.850409746170044,
"learning_rate": 3.345307245490676e-05,
"loss": 0.835,
"step": 6500
},
{
"epoch": 0.9935799449709569,
"eval_loss": 0.8225808143615723,
"eval_runtime": 157.7059,
"eval_samples_per_second": 36.879,
"eval_steps_per_second": 4.61,
"step": 6500
},
{
"epoch": 1.0700091715071844,
"grad_norm": 2.08107590675354,
"learning_rate": 3.21792520126363e-05,
"loss": 0.5759,
"step": 7000
},
{
"epoch": 1.0700091715071844,
"eval_loss": 0.8543522357940674,
"eval_runtime": 157.8069,
"eval_samples_per_second": 36.855,
"eval_steps_per_second": 4.607,
"step": 7000
},
{
"epoch": 1.1464383980434119,
"grad_norm": 2.4801783561706543,
"learning_rate": 3.0905431570365846e-05,
"loss": 0.5493,
"step": 7500
},
{
"epoch": 1.1464383980434119,
"eval_loss": 0.8509367108345032,
"eval_runtime": 157.6691,
"eval_samples_per_second": 36.887,
"eval_steps_per_second": 4.611,
"step": 7500
},
{
"epoch": 1.2228676245796393,
"grad_norm": 2.688427686691284,
"learning_rate": 2.963161112809539e-05,
"loss": 0.5516,
"step": 8000
},
{
"epoch": 1.2228676245796393,
"eval_loss": 0.8434808254241943,
"eval_runtime": 157.6951,
"eval_samples_per_second": 36.881,
"eval_steps_per_second": 4.61,
"step": 8000
},
{
"epoch": 1.2992968511158667,
"grad_norm": 2.8583438396453857,
"learning_rate": 2.8357790685824926e-05,
"loss": 0.5608,
"step": 8500
},
{
"epoch": 1.2992968511158667,
"eval_loss": 0.8415189981460571,
"eval_runtime": 157.7043,
"eval_samples_per_second": 36.879,
"eval_steps_per_second": 4.61,
"step": 8500
},
{
"epoch": 1.375726077652094,
"grad_norm": 2.8310320377349854,
"learning_rate": 2.708397024355447e-05,
"loss": 0.5468,
"step": 9000
},
{
"epoch": 1.375726077652094,
"eval_loss": 0.8396986126899719,
"eval_runtime": 157.7062,
"eval_samples_per_second": 36.879,
"eval_steps_per_second": 4.61,
"step": 9000
},
{
"epoch": 1.4521553041883215,
"grad_norm": 3.0906243324279785,
"learning_rate": 2.5810149801284013e-05,
"loss": 0.5499,
"step": 9500
},
{
"epoch": 1.4521553041883215,
"eval_loss": 0.8367328643798828,
"eval_runtime": 157.7916,
"eval_samples_per_second": 36.859,
"eval_steps_per_second": 4.607,
"step": 9500
},
{
"epoch": 1.5285845307245491,
"grad_norm": 3.706326723098755,
"learning_rate": 2.4536329359013556e-05,
"loss": 0.5503,
"step": 10000
},
{
"epoch": 1.5285845307245491,
"eval_loss": 0.8307807445526123,
"eval_runtime": 157.6389,
"eval_samples_per_second": 36.894,
"eval_steps_per_second": 4.612,
"step": 10000
},
{
"epoch": 1.6050137572607766,
"grad_norm": 2.6108150482177734,
"learning_rate": 2.3262508916743096e-05,
"loss": 0.5388,
"step": 10500
},
{
"epoch": 1.6050137572607766,
"eval_loss": 0.8295947313308716,
"eval_runtime": 157.6638,
"eval_samples_per_second": 36.889,
"eval_steps_per_second": 4.611,
"step": 10500
},
{
"epoch": 1.681442983797004,
"grad_norm": 1.6078243255615234,
"learning_rate": 2.1991236115357182e-05,
"loss": 0.5473,
"step": 11000
},
{
"epoch": 1.681442983797004,
"eval_loss": 0.8229663372039795,
"eval_runtime": 157.6846,
"eval_samples_per_second": 36.884,
"eval_steps_per_second": 4.61,
"step": 11000
},
{
"epoch": 1.7578722103332314,
"grad_norm": 3.049797773361206,
"learning_rate": 2.0717415673086722e-05,
"loss": 0.5496,
"step": 11500
},
{
"epoch": 1.7578722103332314,
"eval_loss": 0.8267400860786438,
"eval_runtime": 157.7336,
"eval_samples_per_second": 36.872,
"eval_steps_per_second": 4.609,
"step": 11500
},
{
"epoch": 1.8343014368694588,
"grad_norm": 2.292538642883301,
"learning_rate": 1.9443595230816262e-05,
"loss": 0.5448,
"step": 12000
},
{
"epoch": 1.8343014368694588,
"eval_loss": 0.8191345930099487,
"eval_runtime": 158.4293,
"eval_samples_per_second": 36.71,
"eval_steps_per_second": 4.589,
"step": 12000
},
{
"epoch": 1.9107306634056864,
"grad_norm": 2.1699585914611816,
"learning_rate": 1.8169774788545806e-05,
"loss": 0.5419,
"step": 12500
},
{
"epoch": 1.9107306634056864,
"eval_loss": 0.8131210803985596,
"eval_runtime": 157.6629,
"eval_samples_per_second": 36.889,
"eval_steps_per_second": 4.611,
"step": 12500
},
{
"epoch": 1.9871598899419138,
"grad_norm": 3.1323328018188477,
"learning_rate": 1.689595434627535e-05,
"loss": 0.5369,
"step": 13000
},
{
"epoch": 1.9871598899419138,
"eval_loss": 0.8066145777702332,
"eval_runtime": 157.7959,
"eval_samples_per_second": 36.858,
"eval_steps_per_second": 4.607,
"step": 13000
},
{
"epoch": 2.0635891164781412,
"grad_norm": 3.656402826309204,
"learning_rate": 1.562468154488943e-05,
"loss": 0.3304,
"step": 13500
},
{
"epoch": 2.0635891164781412,
"eval_loss": 0.9408266544342041,
"eval_runtime": 157.7674,
"eval_samples_per_second": 36.864,
"eval_steps_per_second": 4.608,
"step": 13500
},
{
"epoch": 2.140018343014369,
"grad_norm": 2.4427192211151123,
"learning_rate": 1.4350861102618977e-05,
"loss": 0.2759,
"step": 14000
},
{
"epoch": 2.140018343014369,
"eval_loss": 0.942986011505127,
"eval_runtime": 157.7087,
"eval_samples_per_second": 36.878,
"eval_steps_per_second": 4.61,
"step": 14000
},
{
"epoch": 2.216447569550596,
"grad_norm": 1.6041910648345947,
"learning_rate": 1.3077040660348518e-05,
"loss": 0.2873,
"step": 14500
},
{
"epoch": 2.216447569550596,
"eval_loss": 0.9449612498283386,
"eval_runtime": 157.767,
"eval_samples_per_second": 36.865,
"eval_steps_per_second": 4.608,
"step": 14500
},
{
"epoch": 2.2928767960868237,
"grad_norm": 3.233290433883667,
"learning_rate": 1.180322021807806e-05,
"loss": 0.2818,
"step": 15000
},
{
"epoch": 2.2928767960868237,
"eval_loss": 0.9387638568878174,
"eval_runtime": 157.7297,
"eval_samples_per_second": 36.873,
"eval_steps_per_second": 4.609,
"step": 15000
},
{
"epoch": 2.369306022623051,
"grad_norm": 3.9653565883636475,
"learning_rate": 1.0529399775807602e-05,
"loss": 0.2795,
"step": 15500
},
{
"epoch": 2.369306022623051,
"eval_loss": 0.9435889720916748,
"eval_runtime": 157.7236,
"eval_samples_per_second": 36.875,
"eval_steps_per_second": 4.609,
"step": 15500
},
{
"epoch": 2.4457352491592785,
"grad_norm": 2.453057289123535,
"learning_rate": 9.255579333537145e-06,
"loss": 0.2801,
"step": 16000
},
{
"epoch": 2.4457352491592785,
"eval_loss": 0.9410313963890076,
"eval_runtime": 157.7031,
"eval_samples_per_second": 36.879,
"eval_steps_per_second": 4.61,
"step": 16000
},
{
"epoch": 2.522164475695506,
"grad_norm": 2.9924492835998535,
"learning_rate": 7.981758891266687e-06,
"loss": 0.2788,
"step": 16500
},
{
"epoch": 2.522164475695506,
"eval_loss": 0.9427609443664551,
"eval_runtime": 157.6635,
"eval_samples_per_second": 36.889,
"eval_steps_per_second": 4.611,
"step": 16500
},
{
"epoch": 2.5985937022317334,
"grad_norm": 2.626593828201294,
"learning_rate": 6.7079384489962305e-06,
"loss": 0.2752,
"step": 17000
},
{
"epoch": 2.5985937022317334,
"eval_loss": 0.9421259164810181,
"eval_runtime": 157.689,
"eval_samples_per_second": 36.883,
"eval_steps_per_second": 4.61,
"step": 17000
},
{
"epoch": 2.675022928767961,
"grad_norm": 2.623121500015259,
"learning_rate": 5.436665647610313e-06,
"loss": 0.2695,
"step": 17500
},
{
"epoch": 2.675022928767961,
"eval_loss": 0.9395164251327515,
"eval_runtime": 157.7665,
"eval_samples_per_second": 36.865,
"eval_steps_per_second": 4.608,
"step": 17500
},
{
"epoch": 2.751452155304188,
"grad_norm": 2.4113175868988037,
"learning_rate": 4.1653928462243965e-06,
"loss": 0.2697,
"step": 18000
},
{
"epoch": 2.751452155304188,
"eval_loss": 0.9405816197395325,
"eval_runtime": 157.6703,
"eval_samples_per_second": 36.887,
"eval_steps_per_second": 4.611,
"step": 18000
},
{
"epoch": 2.827881381840416,
"grad_norm": 3.3730709552764893,
"learning_rate": 2.8915724039539386e-06,
"loss": 0.2769,
"step": 18500
},
{
"epoch": 2.827881381840416,
"eval_loss": 0.9389672875404358,
"eval_runtime": 157.689,
"eval_samples_per_second": 36.883,
"eval_steps_per_second": 4.61,
"step": 18500
},
{
"epoch": 2.904310608376643,
"grad_norm": 3.464594841003418,
"learning_rate": 1.6177519616834812e-06,
"loss": 0.271,
"step": 19000
},
{
"epoch": 2.904310608376643,
"eval_loss": 0.9393123984336853,
"eval_runtime": 157.8608,
"eval_samples_per_second": 36.843,
"eval_steps_per_second": 4.605,
"step": 19000
},
{
"epoch": 2.9807398349128706,
"grad_norm": 2.9936461448669434,
"learning_rate": 3.439315194130236e-07,
"loss": 0.2584,
"step": 19500
},
{
"epoch": 2.9807398349128706,
"eval_loss": 0.9363918900489807,
"eval_runtime": 157.724,
"eval_samples_per_second": 36.875,
"eval_steps_per_second": 4.609,
"step": 19500
}
],
"logging_steps": 500,
"max_steps": 19626,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.11086099873792e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}