{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9994666666666667,
  "eval_steps": 500,
  "global_step": 8436,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0035555555555555557,
      "grad_norm": 172.0995169680502,
      "learning_rate": 1.1848341232227489e-07,
      "loss": 2.2225,
      "step": 10
    },
    {
      "epoch": 0.0071111111111111115,
      "grad_norm": 104.28584294834684,
      "learning_rate": 2.3696682464454978e-07,
      "loss": 2.0266,
      "step": 20
    },
    {
      "epoch": 0.010666666666666666,
      "grad_norm": 23.951684052753993,
      "learning_rate": 3.5545023696682467e-07,
      "loss": 1.7378,
      "step": 30
    },
    {
      "epoch": 0.014222222222222223,
      "grad_norm": 28.63768523364588,
      "learning_rate": 4.7393364928909956e-07,
      "loss": 1.4898,
      "step": 40
    },
    {
      "epoch": 0.017777777777777778,
      "grad_norm": 13.504890360680221,
      "learning_rate": 5.924170616113745e-07,
      "loss": 1.1851,
      "step": 50
    },
    {
      "epoch": 0.021333333333333333,
      "grad_norm": 6.1972205960321585,
      "learning_rate": 7.109004739336493e-07,
      "loss": 0.988,
      "step": 60
    },
    {
      "epoch": 0.024888888888888887,
      "grad_norm": 5.065573416801065,
      "learning_rate": 8.293838862559242e-07,
      "loss": 0.862,
      "step": 70
    },
    {
      "epoch": 0.028444444444444446,
      "grad_norm": 4.1523834288926516,
      "learning_rate": 9.478672985781991e-07,
      "loss": 0.7813,
      "step": 80
    },
    {
      "epoch": 0.032,
      "grad_norm": 4.2769222488911405,
      "learning_rate": 1.0663507109004742e-06,
      "loss": 0.6638,
      "step": 90
    },
    {
      "epoch": 0.035555555555555556,
      "grad_norm": 3.7926520111038613,
      "learning_rate": 1.184834123222749e-06,
      "loss": 0.6073,
      "step": 100
    },
    {
      "epoch": 0.03911111111111111,
      "grad_norm": 4.5354775101779605,
      "learning_rate": 1.303317535545024e-06,
      "loss": 0.5392,
      "step": 110
    },
    {
      "epoch": 0.042666666666666665,
      "grad_norm": 9.972389478302686,
      "learning_rate": 1.4218009478672987e-06,
      "loss": 0.5798,
      "step": 120
    },
    {
      "epoch": 0.04622222222222222,
      "grad_norm": 3.3720288764197903,
      "learning_rate": 1.5402843601895737e-06,
      "loss": 0.5119,
      "step": 130
    },
    {
      "epoch": 0.049777777777777775,
      "grad_norm": 2.9830238004238674,
      "learning_rate": 1.6587677725118483e-06,
      "loss": 0.4432,
      "step": 140
    },
    {
      "epoch": 0.05333333333333334,
      "grad_norm": 3.0823358079048395,
      "learning_rate": 1.7772511848341234e-06,
      "loss": 0.4637,
      "step": 150
    },
    {
      "epoch": 0.05688888888888889,
      "grad_norm": 2.7399052383817493,
      "learning_rate": 1.8957345971563982e-06,
      "loss": 0.4623,
      "step": 160
    },
    {
      "epoch": 0.060444444444444446,
      "grad_norm": 2.5949470941499886,
      "learning_rate": 2.0142180094786733e-06,
      "loss": 0.4909,
      "step": 170
    },
    {
      "epoch": 0.064,
      "grad_norm": 2.860434581778304,
      "learning_rate": 2.1327014218009483e-06,
      "loss": 0.4522,
      "step": 180
    },
    {
      "epoch": 0.06755555555555555,
      "grad_norm": 2.4794062920348514,
      "learning_rate": 2.251184834123223e-06,
      "loss": 0.4683,
      "step": 190
    },
    {
      "epoch": 0.07111111111111111,
      "grad_norm": 3.5898381841290385,
      "learning_rate": 2.369668246445498e-06,
      "loss": 0.4544,
      "step": 200
    },
    {
      "epoch": 0.07466666666666667,
      "grad_norm": 2.8271249937433893,
      "learning_rate": 2.4881516587677726e-06,
      "loss": 0.452,
      "step": 210
    },
    {
      "epoch": 0.07822222222222222,
      "grad_norm": 2.820485688519842,
      "learning_rate": 2.606635071090048e-06,
      "loss": 0.4594,
      "step": 220
    },
    {
      "epoch": 0.08177777777777778,
      "grad_norm": 2.879680482909577,
      "learning_rate": 2.7251184834123223e-06,
      "loss": 0.4079,
      "step": 230
    },
    {
      "epoch": 0.08533333333333333,
      "grad_norm": 2.2760447333960547,
      "learning_rate": 2.8436018957345973e-06,
      "loss": 0.4586,
      "step": 240
    },
    {
      "epoch": 0.08888888888888889,
      "grad_norm": 2.831009166917502,
      "learning_rate": 2.9620853080568724e-06,
      "loss": 0.4143,
      "step": 250
    },
    {
      "epoch": 0.09244444444444444,
      "grad_norm": 3.2359232461275895,
      "learning_rate": 3.0805687203791474e-06,
      "loss": 0.454,
      "step": 260
    },
    {
      "epoch": 0.096,
      "grad_norm": 2.7067735723833932,
      "learning_rate": 3.1990521327014216e-06,
      "loss": 0.3998,
      "step": 270
    },
    {
      "epoch": 0.09955555555555555,
      "grad_norm": 2.550645136169034,
      "learning_rate": 3.3175355450236967e-06,
      "loss": 0.397,
      "step": 280
    },
    {
      "epoch": 0.10311111111111111,
      "grad_norm": 2.6477271129566162,
      "learning_rate": 3.4360189573459717e-06,
      "loss": 0.416,
      "step": 290
    },
    {
      "epoch": 0.10666666666666667,
      "grad_norm": 2.744124645461815,
      "learning_rate": 3.5545023696682468e-06,
      "loss": 0.4521,
      "step": 300
    },
    {
      "epoch": 0.11022222222222222,
      "grad_norm": 2.499585309198425,
      "learning_rate": 3.672985781990522e-06,
      "loss": 0.4023,
      "step": 310
    },
    {
      "epoch": 0.11377777777777778,
      "grad_norm": 2.6278096303414467,
      "learning_rate": 3.7914691943127964e-06,
      "loss": 0.4191,
      "step": 320
    },
    {
      "epoch": 0.11733333333333333,
      "grad_norm": 2.4188835712940326,
      "learning_rate": 3.9099526066350715e-06,
      "loss": 0.4122,
      "step": 330
    },
    {
      "epoch": 0.12088888888888889,
      "grad_norm": 2.553975268194503,
      "learning_rate": 4.0284360189573465e-06,
      "loss": 0.3498,
      "step": 340
    },
    {
      "epoch": 0.12444444444444444,
      "grad_norm": 2.834535859400579,
      "learning_rate": 4.146919431279622e-06,
      "loss": 0.4094,
      "step": 350
    },
    {
      "epoch": 0.128,
      "grad_norm": 2.533973817990368,
      "learning_rate": 4.265402843601897e-06,
      "loss": 0.4298,
      "step": 360
    },
    {
      "epoch": 0.13155555555555556,
      "grad_norm": 2.813906241826433,
      "learning_rate": 4.383886255924171e-06,
      "loss": 0.4216,
      "step": 370
    },
    {
      "epoch": 0.1351111111111111,
      "grad_norm": 2.102931563969342,
      "learning_rate": 4.502369668246446e-06,
      "loss": 0.3808,
      "step": 380
    },
    {
      "epoch": 0.13866666666666666,
      "grad_norm": 2.4379289560773896,
      "learning_rate": 4.620853080568721e-06,
      "loss": 0.3618,
      "step": 390
    },
    {
      "epoch": 0.14222222222222222,
      "grad_norm": 2.3557567609798777,
      "learning_rate": 4.739336492890996e-06,
      "loss": 0.4044,
      "step": 400
    },
    {
      "epoch": 0.14577777777777778,
      "grad_norm": 2.2820973068522514,
      "learning_rate": 4.857819905213271e-06,
      "loss": 0.4071,
      "step": 410
    },
    {
      "epoch": 0.14933333333333335,
      "grad_norm": 2.6709678530509993,
      "learning_rate": 4.976303317535545e-06,
      "loss": 0.4272,
      "step": 420
    },
    {
      "epoch": 0.15288888888888888,
      "grad_norm": 2.332134363712532,
      "learning_rate": 5.09478672985782e-06,
      "loss": 0.434,
      "step": 430
    },
    {
      "epoch": 0.15644444444444444,
      "grad_norm": 2.9162979668749047,
      "learning_rate": 5.213270142180096e-06,
      "loss": 0.3695,
      "step": 440
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.2427213677361655,
      "learning_rate": 5.33175355450237e-06,
      "loss": 0.3723,
      "step": 450
    },
    {
      "epoch": 0.16355555555555557,
      "grad_norm": 2.5901865124993,
      "learning_rate": 5.4502369668246446e-06,
      "loss": 0.4147,
      "step": 460
    },
    {
      "epoch": 0.1671111111111111,
      "grad_norm": 2.56419802107506,
      "learning_rate": 5.5687203791469205e-06,
      "loss": 0.4083,
      "step": 470
    },
    {
      "epoch": 0.17066666666666666,
      "grad_norm": 2.145912482611642,
      "learning_rate": 5.687203791469195e-06,
      "loss": 0.3631,
      "step": 480
    },
    {
      "epoch": 0.17422222222222222,
      "grad_norm": 2.1572804538302983,
      "learning_rate": 5.8056872037914706e-06,
      "loss": 0.3838,
      "step": 490
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 2.7221208940412955,
      "learning_rate": 5.924170616113745e-06,
      "loss": 0.3703,
      "step": 500
    },
    {
      "epoch": 0.17777777777777778,
      "eval_loss": 0.27164188027381897,
      "eval_runtime": 561.686,
      "eval_samples_per_second": 17.804,
      "eval_steps_per_second": 4.451,
      "step": 500
    },
    {
      "epoch": 0.18133333333333335,
      "grad_norm": 2.128760220417613,
      "learning_rate": 6.042654028436019e-06,
      "loss": 0.3936,
      "step": 510
    },
    {
      "epoch": 0.18488888888888888,
      "grad_norm": 2.37349559892131,
      "learning_rate": 6.161137440758295e-06,
      "loss": 0.4097,
      "step": 520
    },
    {
      "epoch": 0.18844444444444444,
      "grad_norm": 2.1546814583393954,
      "learning_rate": 6.279620853080569e-06,
      "loss": 0.3487,
      "step": 530
    },
    {
      "epoch": 0.192,
      "grad_norm": 2.5691866709112174,
      "learning_rate": 6.398104265402843e-06,
      "loss": 0.3795,
      "step": 540
    },
    {
      "epoch": 0.19555555555555557,
      "grad_norm": 2.511088780500042,
      "learning_rate": 6.516587677725119e-06,
      "loss": 0.3592,
      "step": 550
    },
    {
      "epoch": 0.1991111111111111,
      "grad_norm": 2.1980105108863306,
      "learning_rate": 6.635071090047393e-06,
      "loss": 0.3759,
      "step": 560
    },
    {
      "epoch": 0.20266666666666666,
      "grad_norm": 2.0372925079256508,
      "learning_rate": 6.753554502369669e-06,
      "loss": 0.3372,
      "step": 570
    },
    {
      "epoch": 0.20622222222222222,
      "grad_norm": 2.4474157501007188,
      "learning_rate": 6.8720379146919435e-06,
      "loss": 0.3821,
      "step": 580
    },
    {
      "epoch": 0.20977777777777779,
      "grad_norm": 2.6150488990545813,
      "learning_rate": 6.990521327014218e-06,
      "loss": 0.4033,
      "step": 590
    },
    {
      "epoch": 0.21333333333333335,
      "grad_norm": 2.218675478875157,
      "learning_rate": 7.1090047393364935e-06,
      "loss": 0.3498,
      "step": 600
    },
    {
      "epoch": 0.21688888888888888,
      "grad_norm": 2.60194848198847,
      "learning_rate": 7.227488151658768e-06,
      "loss": 0.3974,
      "step": 610
    },
    {
      "epoch": 0.22044444444444444,
      "grad_norm": 2.4008012422084883,
      "learning_rate": 7.345971563981044e-06,
      "loss": 0.3522,
      "step": 620
    },
    {
      "epoch": 0.224,
      "grad_norm": 2.370019125222766,
      "learning_rate": 7.464454976303318e-06,
      "loss": 0.3843,
      "step": 630
    },
    {
      "epoch": 0.22755555555555557,
      "grad_norm": 2.319127909040294,
      "learning_rate": 7.582938388625593e-06,
      "loss": 0.3852,
      "step": 640
    },
    {
      "epoch": 0.2311111111111111,
      "grad_norm": 2.0344327356388963,
      "learning_rate": 7.701421800947868e-06,
      "loss": 0.3753,
      "step": 650
    },
    {
      "epoch": 0.23466666666666666,
      "grad_norm": 2.0974945886124274,
      "learning_rate": 7.819905213270143e-06,
      "loss": 0.3622,
      "step": 660
    },
    {
      "epoch": 0.23822222222222222,
      "grad_norm": 2.3710225236326656,
      "learning_rate": 7.938388625592418e-06,
      "loss": 0.3776,
      "step": 670
    },
    {
      "epoch": 0.24177777777777779,
      "grad_norm": 2.1972590118602353,
      "learning_rate": 8.056872037914693e-06,
      "loss": 0.4131,
      "step": 680
    },
    {
      "epoch": 0.24533333333333332,
      "grad_norm": 2.124563531807995,
      "learning_rate": 8.175355450236966e-06,
      "loss": 0.4041,
      "step": 690
    },
    {
      "epoch": 0.24888888888888888,
      "grad_norm": 2.186519973081525,
      "learning_rate": 8.293838862559243e-06,
      "loss": 0.4342,
      "step": 700
    },
    {
      "epoch": 0.25244444444444447,
      "grad_norm": 2.2098045409685785,
      "learning_rate": 8.412322274881517e-06,
      "loss": 0.3753,
      "step": 710
    },
    {
      "epoch": 0.256,
      "grad_norm": 2.364680759422569,
      "learning_rate": 8.530805687203793e-06,
      "loss": 0.3499,
      "step": 720
    },
    {
      "epoch": 0.25955555555555554,
      "grad_norm": 2.0592638534598975,
      "learning_rate": 8.649289099526067e-06,
      "loss": 0.3676,
      "step": 730
    },
    {
      "epoch": 0.26311111111111113,
      "grad_norm": 2.076874300192435,
      "learning_rate": 8.767772511848342e-06,
      "loss": 0.3882,
      "step": 740
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 2.256989717343507,
      "learning_rate": 8.886255924170617e-06,
      "loss": 0.3906,
      "step": 750
    },
    {
      "epoch": 0.2702222222222222,
      "grad_norm": 2.2777259263170753,
      "learning_rate": 9.004739336492892e-06,
      "loss": 0.3881,
      "step": 760
    },
    {
      "epoch": 0.2737777777777778,
      "grad_norm": 2.0191108991103452,
      "learning_rate": 9.123222748815167e-06,
      "loss": 0.3598,
      "step": 770
    },
    {
      "epoch": 0.2773333333333333,
      "grad_norm": 2.1955719220241114,
      "learning_rate": 9.241706161137442e-06,
      "loss": 0.3411,
      "step": 780
    },
    {
      "epoch": 0.2808888888888889,
      "grad_norm": 1.8450512554264078,
      "learning_rate": 9.360189573459715e-06,
      "loss": 0.3989,
      "step": 790
    },
    {
      "epoch": 0.28444444444444444,
      "grad_norm": 2.011115441632504,
      "learning_rate": 9.478672985781992e-06,
      "loss": 0.3982,
      "step": 800
    },
    {
      "epoch": 0.288,
      "grad_norm": 1.8704472001913133,
      "learning_rate": 9.597156398104265e-06,
      "loss": 0.414,
      "step": 810
    },
    {
      "epoch": 0.29155555555555557,
      "grad_norm": 1.9254101904021153,
      "learning_rate": 9.715639810426542e-06,
      "loss": 0.3767,
      "step": 820
    },
    {
      "epoch": 0.2951111111111111,
      "grad_norm": 1.9015728855115495,
      "learning_rate": 9.834123222748815e-06,
      "loss": 0.3775,
      "step": 830
    },
    {
      "epoch": 0.2986666666666667,
      "grad_norm": 1.928562219171237,
      "learning_rate": 9.95260663507109e-06,
      "loss": 0.3955,
      "step": 840
    },
    {
      "epoch": 0.3022222222222222,
      "grad_norm": 1.5585912642130104,
      "learning_rate": 9.999984589042141e-06,
      "loss": 0.3897,
      "step": 850
    },
    {
      "epoch": 0.30577777777777776,
      "grad_norm": 2.088285655295682,
      "learning_rate": 9.999890411310363e-06,
      "loss": 0.3657,
      "step": 860
    },
    {
      "epoch": 0.30933333333333335,
      "grad_norm": 1.7831321620409892,
      "learning_rate": 9.999710619100732e-06,
      "loss": 0.3699,
      "step": 870
    },
    {
      "epoch": 0.3128888888888889,
      "grad_norm": 1.8859386777237288,
      "learning_rate": 9.999445215491888e-06,
      "loss": 0.3675,
      "step": 880
    },
    {
      "epoch": 0.3164444444444444,
      "grad_norm": 1.793847189739239,
      "learning_rate": 9.999094205028403e-06,
      "loss": 0.3804,
      "step": 890
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.8588345423039347,
      "learning_rate": 9.998657593720726e-06,
      "loss": 0.3628,
      "step": 900
    },
    {
      "epoch": 0.32355555555555554,
      "grad_norm": 1.904522383364726,
      "learning_rate": 9.998135389045071e-06,
      "loss": 0.3832,
      "step": 910
    },
    {
      "epoch": 0.32711111111111113,
      "grad_norm": 1.7658830671737389,
      "learning_rate": 9.997527599943288e-06,
      "loss": 0.3931,
      "step": 920
    },
    {
      "epoch": 0.33066666666666666,
      "grad_norm": 1.8645179401650172,
      "learning_rate": 9.996834236822718e-06,
      "loss": 0.3587,
      "step": 930
    },
    {
      "epoch": 0.3342222222222222,
      "grad_norm": 1.8432627384605438,
      "learning_rate": 9.996055311556002e-06,
      "loss": 0.4065,
      "step": 940
    },
    {
      "epoch": 0.3377777777777778,
      "grad_norm": 1.8436289309250031,
      "learning_rate": 9.99519083748089e-06,
      "loss": 0.3861,
      "step": 950
    },
    {
      "epoch": 0.3413333333333333,
      "grad_norm": 1.9136369003670703,
      "learning_rate": 9.994240829400006e-06,
      "loss": 0.3794,
      "step": 960
    },
    {
      "epoch": 0.3448888888888889,
      "grad_norm": 1.559884705708754,
      "learning_rate": 9.993205303580596e-06,
      "loss": 0.3675,
      "step": 970
    },
    {
      "epoch": 0.34844444444444445,
      "grad_norm": 2.0844042937670317,
      "learning_rate": 9.992084277754246e-06,
      "loss": 0.3725,
      "step": 980
    },
    {
      "epoch": 0.352,
      "grad_norm": 1.4001469351101974,
      "learning_rate": 9.990877771116588e-06,
      "loss": 0.3526,
      "step": 990
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 1.7302242639985734,
      "learning_rate": 9.989585804326963e-06,
      "loss": 0.3451,
      "step": 1000
    },
    {
      "epoch": 0.35555555555555557,
      "eval_loss": 0.2586575448513031,
      "eval_runtime": 561.7755,
      "eval_samples_per_second": 17.801,
      "eval_steps_per_second": 4.45,
      "step": 1000
    },
    {
      "epoch": 0.3591111111111111,
      "grad_norm": 2.1060593962865832,
      "learning_rate": 9.988208399508064e-06,
      "loss": 0.3923,
      "step": 1010
    },
    {
      "epoch": 0.3626666666666667,
      "grad_norm": 1.6475744003826194,
      "learning_rate": 9.986745580245569e-06,
      "loss": 0.3077,
      "step": 1020
    },
    {
      "epoch": 0.3662222222222222,
      "grad_norm": 1.9521091866638012,
      "learning_rate": 9.985197371587732e-06,
      "loss": 0.389,
      "step": 1030
    },
    {
      "epoch": 0.36977777777777776,
      "grad_norm": 1.7609515675334448,
      "learning_rate": 9.983563800044942e-06,
      "loss": 0.3424,
      "step": 1040
    },
    {
      "epoch": 0.37333333333333335,
      "grad_norm": 1.7210920690658038,
      "learning_rate": 9.981844893589294e-06,
      "loss": 0.3558,
      "step": 1050
    },
    {
      "epoch": 0.3768888888888889,
      "grad_norm": 1.823734659697161,
      "learning_rate": 9.980040681654085e-06,
      "loss": 0.3693,
      "step": 1060
    },
    {
      "epoch": 0.3804444444444444,
      "grad_norm": 2.102269417162816,
      "learning_rate": 9.978151195133326e-06,
      "loss": 0.3638,
      "step": 1070
    },
    {
      "epoch": 0.384,
      "grad_norm": 1.8033749091845297,
      "learning_rate": 9.976176466381205e-06,
      "loss": 0.3484,
      "step": 1080
    },
    {
      "epoch": 0.38755555555555554,
      "grad_norm": 1.8854677696591007,
      "learning_rate": 9.974116529211539e-06,
      "loss": 0.3967,
      "step": 1090
    },
    {
      "epoch": 0.39111111111111113,
      "grad_norm": 2.0272157520267218,
      "learning_rate": 9.971971418897189e-06,
      "loss": 0.3741,
      "step": 1100
    },
    {
      "epoch": 0.39466666666666667,
      "grad_norm": 2.0179018140684555,
      "learning_rate": 9.969741172169461e-06,
      "loss": 0.3904,
      "step": 1110
    },
    {
      "epoch": 0.3982222222222222,
      "grad_norm": 1.6226992565101939,
      "learning_rate": 9.967425827217473e-06,
      "loss": 0.3485,
      "step": 1120
    },
    {
      "epoch": 0.4017777777777778,
      "grad_norm": 1.9028497690136488,
      "learning_rate": 9.965025423687505e-06,
      "loss": 0.346,
      "step": 1130
    },
    {
      "epoch": 0.4053333333333333,
      "grad_norm": 1.694320712579824,
      "learning_rate": 9.962540002682314e-06,
      "loss": 0.3635,
      "step": 1140
    },
    {
      "epoch": 0.4088888888888889,
      "grad_norm": 1.6440393469215313,
      "learning_rate": 9.95996960676044e-06,
      "loss": 0.3794,
      "step": 1150
    },
    {
      "epoch": 0.41244444444444445,
      "grad_norm": 1.9859711063744807,
      "learning_rate": 9.957314279935467e-06,
      "loss": 0.3727,
      "step": 1160
    },
    {
      "epoch": 0.416,
      "grad_norm": 1.5764827911729749,
      "learning_rate": 9.954574067675276e-06,
      "loss": 0.3472,
      "step": 1170
    },
    {
      "epoch": 0.41955555555555557,
      "grad_norm": 2.0270228575955938,
      "learning_rate": 9.951749016901266e-06,
      "loss": 0.3651,
      "step": 1180
    },
    {
      "epoch": 0.4231111111111111,
      "grad_norm": 1.4711992564971241,
      "learning_rate": 9.948839175987543e-06,
      "loss": 0.4007,
      "step": 1190
    },
    {
      "epoch": 0.4266666666666667,
      "grad_norm": 1.6555299578050973,
      "learning_rate": 9.945844594760104e-06,
      "loss": 0.3662,
      "step": 1200
    },
    {
      "epoch": 0.43022222222222223,
      "grad_norm": 1.6087449112246428,
      "learning_rate": 9.94276532449597e-06,
      "loss": 0.3266,
      "step": 1210
    },
    {
      "epoch": 0.43377777777777776,
      "grad_norm": 1.7938918985177508,
      "learning_rate": 9.939601417922326e-06,
      "loss": 0.367,
      "step": 1220
    },
    {
      "epoch": 0.43733333333333335,
      "grad_norm": 1.9419042479062267,
      "learning_rate": 9.936352929215598e-06,
      "loss": 0.3479,
      "step": 1230
    },
    {
      "epoch": 0.4408888888888889,
      "grad_norm": 1.7389871732986788,
      "learning_rate": 9.933019914000537e-06,
      "loss": 0.3991,
      "step": 1240
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 1.954697966163684,
      "learning_rate": 9.929602429349267e-06,
      "loss": 0.387,
      "step": 1250
    },
    {
      "epoch": 0.448,
      "grad_norm": 1.9390505686602657,
      "learning_rate": 9.926100533780304e-06,
      "loss": 0.3623,
      "step": 1260
    },
    {
      "epoch": 0.45155555555555554,
      "grad_norm": 1.6639481540933314,
      "learning_rate": 9.922514287257553e-06,
      "loss": 0.3758,
      "step": 1270
    },
    {
      "epoch": 0.45511111111111113,
      "grad_norm": 1.722757928957694,
      "learning_rate": 9.918843751189285e-06,
      "loss": 0.3355,
      "step": 1280
    },
    {
      "epoch": 0.45866666666666667,
      "grad_norm": 1.845850757530145,
      "learning_rate": 9.915088988427085e-06,
      "loss": 0.3698,
      "step": 1290
    },
    {
      "epoch": 0.4622222222222222,
      "grad_norm": 1.44128404254532,
      "learning_rate": 9.911250063264768e-06,
      "loss": 0.4047,
      "step": 1300
    },
    {
      "epoch": 0.4657777777777778,
      "grad_norm": 1.7671518160334596,
      "learning_rate": 9.907327041437295e-06,
      "loss": 0.3692,
      "step": 1310
    },
    {
      "epoch": 0.4693333333333333,
      "grad_norm": 1.8380352484481248,
      "learning_rate": 9.903319990119629e-06,
      "loss": 0.36,
      "step": 1320
    },
    {
      "epoch": 0.4728888888888889,
      "grad_norm": 1.76427459962676,
      "learning_rate": 9.899228977925594e-06,
      "loss": 0.3741,
      "step": 1330
    },
    {
      "epoch": 0.47644444444444445,
      "grad_norm": 1.4897822709650264,
      "learning_rate": 9.895054074906703e-06,
      "loss": 0.3407,
      "step": 1340
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.8107592753421746,
      "learning_rate": 9.890795352550949e-06,
      "loss": 0.3737,
      "step": 1350
    },
    {
      "epoch": 0.48355555555555557,
      "grad_norm": 1.7814141617442254,
      "learning_rate": 9.886452883781588e-06,
      "loss": 0.3706,
      "step": 1360
    },
    {
      "epoch": 0.4871111111111111,
      "grad_norm": 1.6423771491979522,
      "learning_rate": 9.882026742955892e-06,
      "loss": 0.3593,
      "step": 1370
    },
    {
      "epoch": 0.49066666666666664,
      "grad_norm": 1.9926182163486512,
      "learning_rate": 9.877517005863865e-06,
      "loss": 0.388,
      "step": 1380
    },
    {
      "epoch": 0.49422222222222223,
      "grad_norm": 1.6527200649892368,
      "learning_rate": 9.872923749726959e-06,
      "loss": 0.3825,
      "step": 1390
    },
    {
      "epoch": 0.49777777777777776,
      "grad_norm": 1.800321612826116,
      "learning_rate": 9.868247053196744e-06,
      "loss": 0.3406,
      "step": 1400
    },
    {
      "epoch": 0.5013333333333333,
      "grad_norm": 1.8998896812539383,
      "learning_rate": 9.86348699635356e-06,
      "loss": 0.3718,
      "step": 1410
    },
    {
      "epoch": 0.5048888888888889,
      "grad_norm": 1.8642598101048677,
      "learning_rate": 9.85864366070515e-06,
      "loss": 0.3728,
      "step": 1420
    },
    {
      "epoch": 0.5084444444444445,
      "grad_norm": 2.04147924521036,
      "learning_rate": 9.853717129185262e-06,
      "loss": 0.3371,
      "step": 1430
    },
    {
      "epoch": 0.512,
      "grad_norm": 1.765175754873959,
      "learning_rate": 9.848707486152231e-06,
      "loss": 0.3468,
      "step": 1440
    },
    {
      "epoch": 0.5155555555555555,
      "grad_norm": 1.7955950262413882,
      "learning_rate": 9.843614817387531e-06,
      "loss": 0.3456,
      "step": 1450
    },
    {
      "epoch": 0.5191111111111111,
      "grad_norm": 1.4037783734962412,
      "learning_rate": 9.838439210094309e-06,
      "loss": 0.3244,
      "step": 1460
    },
    {
      "epoch": 0.5226666666666666,
      "grad_norm": 1.8006249556531597,
      "learning_rate": 9.833180752895887e-06,
      "loss": 0.3391,
      "step": 1470
    },
    {
      "epoch": 0.5262222222222223,
      "grad_norm": 1.7020622735675546,
      "learning_rate": 9.827839535834258e-06,
      "loss": 0.3922,
      "step": 1480
    },
    {
      "epoch": 0.5297777777777778,
      "grad_norm": 1.6034083398484584,
      "learning_rate": 9.822415650368525e-06,
      "loss": 0.304,
      "step": 1490
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 1.7309514997235147,
      "learning_rate": 9.816909189373347e-06,
      "loss": 0.3531,
      "step": 1500
    },
    {
      "epoch": 0.5333333333333333,
      "eval_loss": 0.24488620460033417,
      "eval_runtime": 562.1833,
      "eval_samples_per_second": 17.788,
      "eval_steps_per_second": 4.447,
      "step": 1500
    },
    {
      "epoch": 0.5368888888888889,
      "grad_norm": 1.4581125274966544,
      "learning_rate": 9.81132024713735e-06,
      "loss": 0.3771,
      "step": 1510
    },
    {
      "epoch": 0.5404444444444444,
      "grad_norm": 1.6490332212552936,
      "learning_rate": 9.805648919361505e-06,
      "loss": 0.3848,
      "step": 1520
    },
    {
      "epoch": 0.544,
      "grad_norm": 1.7512970600212527,
      "learning_rate": 9.799895303157492e-06,
      "loss": 0.3694,
      "step": 1530
    },
    {
      "epoch": 0.5475555555555556,
      "grad_norm": 1.7421405313188358,
      "learning_rate": 9.794059497046043e-06,
      "loss": 0.3553,
      "step": 1540
    },
    {
      "epoch": 0.5511111111111111,
      "grad_norm": 1.7340918047507783,
      "learning_rate": 9.788141600955244e-06,
      "loss": 0.3357,
      "step": 1550
    },
    {
      "epoch": 0.5546666666666666,
      "grad_norm": 1.657973523226739,
      "learning_rate": 9.782141716218832e-06,
      "loss": 0.3448,
      "step": 1560
    },
    {
      "epoch": 0.5582222222222222,
      "grad_norm": 1.7266109549753084,
      "learning_rate": 9.77605994557446e-06,
      "loss": 0.3336,
      "step": 1570
    },
    {
      "epoch": 0.5617777777777778,
      "grad_norm": 1.7634795513841868,
      "learning_rate": 9.769896393161937e-06,
      "loss": 0.336,
      "step": 1580
    },
    {
      "epoch": 0.5653333333333334,
      "grad_norm": 1.7328448062964845,
      "learning_rate": 9.763651164521436e-06,
      "loss": 0.3505,
      "step": 1590
    },
    {
      "epoch": 0.5688888888888889,
      "grad_norm": 1.7601349288429824,
      "learning_rate": 9.7573243665917e-06,
      "loss": 0.3816,
      "step": 1600
    },
    {
      "epoch": 0.5724444444444444,
      "grad_norm": 1.887857912509665,
      "learning_rate": 9.750916107708205e-06,
      "loss": 0.358,
      "step": 1610
    },
    {
      "epoch": 0.576,
      "grad_norm": 1.8940080571652895,
      "learning_rate": 9.744426497601305e-06,
      "loss": 0.363,
      "step": 1620
    },
    {
      "epoch": 0.5795555555555556,
      "grad_norm": 1.5744873206102685,
      "learning_rate": 9.737855647394346e-06,
      "loss": 0.3544,
      "step": 1630
    },
    {
      "epoch": 0.5831111111111111,
      "grad_norm": 1.5744080074196256,
      "learning_rate": 9.73120366960178e-06,
      "loss": 0.375,
      "step": 1640
    },
    {
      "epoch": 0.5866666666666667,
      "grad_norm": 1.6398095171132219,
      "learning_rate": 9.724470678127226e-06,
      "loss": 0.3649,
      "step": 1650
    },
    {
      "epoch": 0.5902222222222222,
      "grad_norm": 1.4310246627875627,
      "learning_rate": 9.717656788261519e-06,
      "loss": 0.3716,
      "step": 1660
    },
    {
      "epoch": 0.5937777777777777,
      "grad_norm": 1.490999227794774,
      "learning_rate": 9.71076211668074e-06,
      "loss": 0.352,
      "step": 1670
    },
    {
      "epoch": 0.5973333333333334,
      "grad_norm": 1.6484132205325386,
      "learning_rate": 9.703786781444218e-06,
      "loss": 0.3555,
      "step": 1680
    },
    {
      "epoch": 0.6008888888888889,
      "grad_norm": 1.3854857319423775,
      "learning_rate": 9.69673090199251e-06,
      "loss": 0.3348,
      "step": 1690
    },
    {
      "epoch": 0.6044444444444445,
      "grad_norm": 1.6107410705301848,
      "learning_rate": 9.689594599145348e-06,
      "loss": 0.3499,
      "step": 1700
    },
    {
      "epoch": 0.608,
      "grad_norm": 1.520886748403311,
      "learning_rate": 9.682377995099581e-06,
      "loss": 0.3389,
      "step": 1710
    },
    {
      "epoch": 0.6115555555555555,
      "grad_norm": 1.4556730210725268,
      "learning_rate": 9.675081213427076e-06,
      "loss": 0.3412,
      "step": 1720
    },
    {
      "epoch": 0.6151111111111112,
      "grad_norm": 1.476388303700134,
      "learning_rate": 9.667704379072597e-06,
      "loss": 0.3363,
      "step": 1730
    },
    {
      "epoch": 0.6186666666666667,
      "grad_norm": 1.2168509424846436,
      "learning_rate": 9.660247618351683e-06,
      "loss": 0.3328,
      "step": 1740
    },
    {
      "epoch": 0.6222222222222222,
      "grad_norm": 1.395468629739029,
      "learning_rate": 9.652711058948463e-06,
      "loss": 0.3509,
      "step": 1750
    },
    {
      "epoch": 0.6257777777777778,
      "grad_norm": 1.586845461880222,
      "learning_rate": 9.645094829913487e-06,
      "loss": 0.3471,
      "step": 1760
    },
    {
      "epoch": 0.6293333333333333,
      "grad_norm": 1.5411518795473231,
      "learning_rate": 9.637399061661507e-06,
      "loss": 0.3246,
      "step": 1770
    },
    {
      "epoch": 0.6328888888888888,
      "grad_norm": 1.658660033117339,
      "learning_rate": 9.62962388596925e-06,
      "loss": 0.3399,
      "step": 1780
    },
    {
      "epoch": 0.6364444444444445,
      "grad_norm": 1.313159566501215,
      "learning_rate": 9.621769435973152e-06,
      "loss": 0.3478,
      "step": 1790
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.8380402091451324,
      "learning_rate": 9.61383584616709e-06,
      "loss": 0.3251,
      "step": 1800
    },
    {
      "epoch": 0.6435555555555555,
      "grad_norm": 1.6180991422896933,
      "learning_rate": 9.60582325240007e-06,
      "loss": 0.3553,
      "step": 1810
    },
    {
      "epoch": 0.6471111111111111,
      "grad_norm": 1.8283857342608776,
      "learning_rate": 9.597731791873907e-06,
      "loss": 0.3594,
      "step": 1820
    },
    {
      "epoch": 0.6506666666666666,
      "grad_norm": 1.4175489521300049,
      "learning_rate": 9.58956160314087e-06,
      "loss": 0.3549,
      "step": 1830
    },
    {
      "epoch": 0.6542222222222223,
      "grad_norm": 1.6783488504498176,
      "learning_rate": 9.581312826101315e-06,
      "loss": 0.3813,
      "step": 1840
    },
    {
      "epoch": 0.6577777777777778,
      "grad_norm": 1.6351873747299641,
      "learning_rate": 9.572985602001283e-06,
      "loss": 0.3518,
      "step": 1850
    },
    {
      "epoch": 0.6613333333333333,
      "grad_norm": 1.3790848679324303,
      "learning_rate": 9.56458007343009e-06,
      "loss": 0.3303,
      "step": 1860
    },
    {
      "epoch": 0.6648888888888889,
      "grad_norm": 1.6322052333334587,
      "learning_rate": 9.556096384317878e-06,
      "loss": 0.3403,
      "step": 1870
    },
    {
      "epoch": 0.6684444444444444,
      "grad_norm": 1.788030342136729,
      "learning_rate": 9.547534679933155e-06,
      "loss": 0.3717,
      "step": 1880
    },
    {
      "epoch": 0.672,
      "grad_norm": 1.4934586402235337,
      "learning_rate": 9.538895106880302e-06,
      "loss": 0.3468,
      "step": 1890
    },
    {
      "epoch": 0.6755555555555556,
      "grad_norm": 1.9556398213487334,
      "learning_rate": 9.53017781309707e-06,
      "loss": 0.3495,
      "step": 1900
    },
    {
      "epoch": 0.6791111111111111,
      "grad_norm": 1.4201698189636593,
      "learning_rate": 9.521382947852042e-06,
      "loss": 0.3631,
      "step": 1910
    },
    {
      "epoch": 0.6826666666666666,
      "grad_norm": 1.8176078337580701,
      "learning_rate": 9.512510661742078e-06,
      "loss": 0.366,
      "step": 1920
    },
    {
      "epoch": 0.6862222222222222,
      "grad_norm": 1.5895629439283847,
      "learning_rate": 9.503561106689736e-06,
      "loss": 0.3165,
      "step": 1930
    },
    {
      "epoch": 0.6897777777777778,
      "grad_norm": 1.7257922798447645,
      "learning_rate": 9.494534435940668e-06,
      "loss": 0.3199,
      "step": 1940
    },
    {
      "epoch": 0.6933333333333334,
      "grad_norm": 1.3859470273389864,
      "learning_rate": 9.485430804061009e-06,
      "loss": 0.3244,
      "step": 1950
    },
    {
      "epoch": 0.6968888888888889,
      "grad_norm": 1.3389192102707597,
      "learning_rate": 9.476250366934708e-06,
      "loss": 0.3557,
      "step": 1960
    },
    {
      "epoch": 0.7004444444444444,
      "grad_norm": 1.761133913330945,
      "learning_rate": 9.466993281760879e-06,
      "loss": 0.3367,
      "step": 1970
    },
    {
      "epoch": 0.704,
      "grad_norm": 1.5576575807000288,
      "learning_rate": 9.457659707051099e-06,
      "loss": 0.335,
      "step": 1980
    },
    {
      "epoch": 0.7075555555555556,
      "grad_norm": 1.5125566207561287,
      "learning_rate": 9.448249802626696e-06,
      "loss": 0.3286,
      "step": 1990
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 1.7236714219097393,
      "learning_rate": 9.43876372961601e-06,
      "loss": 0.3544,
      "step": 2000
    },
    {
      "epoch": 0.7111111111111111,
      "eval_loss": 0.23682241141796112,
      "eval_runtime": 560.8939,
      "eval_samples_per_second": 17.829,
      "eval_steps_per_second": 4.457,
      "step": 2000
    },
    {
      "epoch": 0.7146666666666667,
      "grad_norm": 1.7803508157706263,
      "learning_rate": 9.429201650451642e-06,
      "loss": 0.3218,
      "step": 2010
    },
    {
      "epoch": 0.7182222222222222,
      "grad_norm": 1.6971031315045289,
      "learning_rate": 9.419563728867663e-06,
      "loss": 0.3417,
      "step": 2020
    },
    {
      "epoch": 0.7217777777777777,
      "grad_norm": 1.9366329088516083,
      "learning_rate": 9.409850129896812e-06,
      "loss": 0.3104,
      "step": 2030
    },
    {
      "epoch": 0.7253333333333334,
      "grad_norm": 1.85452483851228,
      "learning_rate": 9.40006101986768e-06,
      "loss": 0.3371,
      "step": 2040
    },
    {
      "epoch": 0.7288888888888889,
      "grad_norm": 1.4768370143060883,
      "learning_rate": 9.390196566401844e-06,
      "loss": 0.3324,
      "step": 2050
    },
    {
      "epoch": 0.7324444444444445,
      "grad_norm": 1.3195137184227357,
      "learning_rate": 9.38025693841102e-06,
      "loss": 0.3384,
      "step": 2060
    },
    {
      "epoch": 0.736,
      "grad_norm": 1.7121308917693614,
      "learning_rate": 9.370242306094141e-06,
      "loss": 0.3339,
      "step": 2070
    },
    {
      "epoch": 0.7395555555555555,
      "grad_norm": 1.3801023810052373,
      "learning_rate": 9.360152840934477e-06,
      "loss": 0.3449,
      "step": 2080
    },
    {
      "epoch": 0.7431111111111111,
      "grad_norm": 1.4391167681264767,
      "learning_rate": 9.349988715696671e-06,
      "loss": 0.3444,
      "step": 2090
    },
    {
      "epoch": 0.7466666666666667,
      "grad_norm": 1.840759552395967,
      "learning_rate": 9.33975010442379e-06,
      "loss": 0.3496,
      "step": 2100
    },
    {
      "epoch": 0.7502222222222222,
      "grad_norm": 1.348141880287597,
      "learning_rate": 9.329437182434351e-06,
      "loss": 0.3202,
      "step": 2110
    },
    {
      "epoch": 0.7537777777777778,
      "grad_norm": 1.528620379748828,
      "learning_rate": 9.31905012631931e-06,
      "loss": 0.3545,
      "step": 2120
    },
    {
      "epoch": 0.7573333333333333,
      "grad_norm": 1.502678851982848,
      "learning_rate": 9.30858911393904e-06,
      "loss": 0.3457,
      "step": 2130
    },
    {
      "epoch": 0.7608888888888888,
      "grad_norm": 1.591416150002211,
      "learning_rate": 9.298054324420294e-06,
      "loss": 0.3125,
      "step": 2140
    },
    {
      "epoch": 0.7644444444444445,
      "grad_norm": 1.5254470204546493,
      "learning_rate": 9.287445938153121e-06,
      "loss": 0.3596,
      "step": 2150
    },
    {
      "epoch": 0.768,
      "grad_norm": 1.230432920766134,
      "learning_rate": 9.276764136787798e-06,
      "loss": 0.3352,
      "step": 2160
    },
    {
      "epoch": 0.7715555555555556,
      "grad_norm": 1.8112353212418606,
      "learning_rate": 9.266009103231702e-06,
      "loss": 0.3504,
      "step": 2170
    },
    {
      "epoch": 0.7751111111111111,
      "grad_norm": 1.6435932354458154,
      "learning_rate": 9.255181021646182e-06,
      "loss": 0.3289,
      "step": 2180
    },
    {
      "epoch": 0.7786666666666666,
      "grad_norm": 1.3388409038180085,
      "learning_rate": 9.244280077443417e-06,
      "loss": 0.3542,
      "step": 2190
    },
    {
      "epoch": 0.7822222222222223,
      "grad_norm": 1.5875341933538416,
      "learning_rate": 9.233306457283223e-06,
      "loss": 0.3516,
      "step": 2200
    },
    {
      "epoch": 0.7857777777777778,
      "grad_norm": 1.5094881761609635,
      "learning_rate": 9.222260349069874e-06,
      "loss": 0.3489,
      "step": 2210
    },
    {
      "epoch": 0.7893333333333333,
      "grad_norm": 1.477094884348464,
      "learning_rate": 9.211141941948872e-06,
      "loss": 0.3581,
      "step": 2220
    },
    {
      "epoch": 0.7928888888888889,
      "grad_norm": 1.4717030162478277,
      "learning_rate": 9.199951426303711e-06,
      "loss": 0.3415,
      "step": 2230
    },
    {
      "epoch": 0.7964444444444444,
      "grad_norm": 1.5752422305129774,
      "learning_rate": 9.188688993752626e-06,
      "loss": 0.3355,
      "step": 2240
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.5354049474859641,
      "learning_rate": 9.177354837145298e-06,
      "loss": 0.3394,
      "step": 2250
    },
    {
      "epoch": 0.8035555555555556,
      "grad_norm": 1.8308300488763203,
      "learning_rate": 9.165949150559561e-06,
      "loss": 0.3545,
      "step": 2260
    },
    {
      "epoch": 0.8071111111111111,
      "grad_norm": 1.7274391712847685,
      "learning_rate": 9.154472129298075e-06,
      "loss": 0.363,
      "step": 2270
    },
    {
      "epoch": 0.8106666666666666,
      "grad_norm": 1.663966013940676,
      "learning_rate": 9.142923969884984e-06,
      "loss": 0.3395,
      "step": 2280
    },
    {
      "epoch": 0.8142222222222222,
      "grad_norm": 1.631283026660004,
      "learning_rate": 9.131304870062554e-06,
      "loss": 0.3486,
      "step": 2290
    },
    {
      "epoch": 0.8177777777777778,
      "grad_norm": 1.6552982308578106,
      "learning_rate": 9.119615028787771e-06,
      "loss": 0.3509,
      "step": 2300
    },
    {
      "epoch": 0.8213333333333334,
      "grad_norm": 1.7276297897533288,
      "learning_rate": 9.107854646228961e-06,
      "loss": 0.325,
      "step": 2310
    },
    {
      "epoch": 0.8248888888888889,
      "grad_norm": 1.445647497408194,
      "learning_rate": 9.096023923762333e-06,
      "loss": 0.3149,
      "step": 2320
    },
    {
      "epoch": 0.8284444444444444,
      "grad_norm": 1.531947731156783,
      "learning_rate": 9.08412306396856e-06,
      "loss": 0.348,
      "step": 2330
    },
    {
      "epoch": 0.832,
      "grad_norm": 1.3576987022774867,
      "learning_rate": 9.072152270629281e-06,
      "loss": 0.3096,
      "step": 2340
    },
    {
      "epoch": 0.8355555555555556,
      "grad_norm": 1.4298680216684836,
      "learning_rate": 9.060111748723639e-06,
      "loss": 0.3609,
      "step": 2350
    },
    {
      "epoch": 0.8391111111111111,
      "grad_norm": 1.5782942370819155,
      "learning_rate": 9.048001704424747e-06,
      "loss": 0.3307,
      "step": 2360
    },
    {
      "epoch": 0.8426666666666667,
      "grad_norm": 1.6461644102732529,
      "learning_rate": 9.035822345096177e-06,
      "loss": 0.3327,
      "step": 2370
    },
    {
      "epoch": 0.8462222222222222,
      "grad_norm": 1.5843145785651733,
      "learning_rate": 9.023573879288394e-06,
      "loss": 0.3312,
      "step": 2380
    },
    {
      "epoch": 0.8497777777777777,
      "grad_norm": 1.5152546857205669,
      "learning_rate": 9.0112565167352e-06,
      "loss": 0.3298,
      "step": 2390
    },
    {
      "epoch": 0.8533333333333334,
      "grad_norm": 1.7304070586423994,
      "learning_rate": 8.99887046835013e-06,
      "loss": 0.3404,
      "step": 2400
    },
    {
      "epoch": 0.8568888888888889,
      "grad_norm": 1.461299493248939,
      "learning_rate": 8.986415946222843e-06,
      "loss": 0.3351,
      "step": 2410
    },
    {
      "epoch": 0.8604444444444445,
      "grad_norm": 1.6967152528749099,
      "learning_rate": 8.973893163615498e-06,
      "loss": 0.3257,
      "step": 2420
    },
    {
      "epoch": 0.864,
      "grad_norm": 1.4154067723973784,
      "learning_rate": 8.96130233495909e-06,
      "loss": 0.3199,
      "step": 2430
    },
    {
      "epoch": 0.8675555555555555,
      "grad_norm": 1.3361597312618834,
      "learning_rate": 8.948643675849793e-06,
      "loss": 0.3442,
      "step": 2440
    },
    {
      "epoch": 0.8711111111111111,
      "grad_norm": 1.4032866224408458,
      "learning_rate": 8.935917403045251e-06,
      "loss": 0.2947,
      "step": 2450
    },
    {
      "epoch": 0.8746666666666667,
      "grad_norm": 1.234939739680067,
      "learning_rate": 8.923123734460885e-06,
      "loss": 0.3577,
      "step": 2460
    },
    {
      "epoch": 0.8782222222222222,
      "grad_norm": 1.5765934665163166,
      "learning_rate": 8.910262889166144e-06,
      "loss": 0.3326,
      "step": 2470
    },
    {
      "epoch": 0.8817777777777778,
      "grad_norm": 1.5046341548865376,
      "learning_rate": 8.897335087380769e-06,
      "loss": 0.3212,
      "step": 2480
    },
    {
      "epoch": 0.8853333333333333,
      "grad_norm": 1.3276870900100486,
      "learning_rate": 8.884340550471008e-06,
      "loss": 0.3143,
      "step": 2490
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 1.719735619655969,
      "learning_rate": 8.87127950094584e-06,
      "loss": 0.3747,
      "step": 2500
    },
    {
      "epoch": 0.8888888888888888,
      "eval_loss": 0.23135392367839813,
      "eval_runtime": 562.1868,
      "eval_samples_per_second": 17.788,
      "eval_steps_per_second": 4.447,
      "step": 2500
    },
    {
      "epoch": 0.8924444444444445,
      "grad_norm": 1.584313301872745,
      "learning_rate": 8.85815216245315e-06,
      "loss": 0.3251,
      "step": 2510
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.2854406639721594,
      "learning_rate": 8.844958759775917e-06,
      "loss": 0.3242,
      "step": 2520
    },
    {
      "epoch": 0.8995555555555556,
      "grad_norm": 1.3421636352208044,
      "learning_rate": 8.83169951882834e-06,
      "loss": 0.3069,
      "step": 2530
    },
    {
      "epoch": 0.9031111111111111,
      "grad_norm": 1.6982202912735271,
      "learning_rate": 8.818374666652001e-06,
      "loss": 0.3303,
      "step": 2540
    },
    {
      "epoch": 0.9066666666666666,
      "grad_norm": 1.3802398833209684,
      "learning_rate": 8.804984431411951e-06,
      "loss": 0.3558,
      "step": 2550
    },
    {
      "epoch": 0.9102222222222223,
      "grad_norm": 1.8913239549685246,
      "learning_rate": 8.791529042392813e-06,
      "loss": 0.3947,
      "step": 2560
    },
    {
      "epoch": 0.9137777777777778,
      "grad_norm": 1.4494060942613418,
      "learning_rate": 8.77800872999486e-06,
      "loss": 0.3362,
      "step": 2570
    },
    {
      "epoch": 0.9173333333333333,
      "grad_norm": 1.7204036116920214,
      "learning_rate": 8.764423725730062e-06,
      "loss": 0.3298,
      "step": 2580
    },
    {
      "epoch": 0.9208888888888889,
      "grad_norm": 1.6130463149964605,
      "learning_rate": 8.750774262218129e-06,
      "loss": 0.3218,
      "step": 2590
    },
    {
      "epoch": 0.9244444444444444,
      "grad_norm": 1.4272505738840544,
      "learning_rate": 8.737060573182518e-06,
      "loss": 0.3325,
      "step": 2600
    },
    {
      "epoch": 0.928,
      "grad_norm": 1.5909460584884059,
      "learning_rate": 8.723282893446447e-06,
      "loss": 0.3496,
      "step": 2610
    },
    {
      "epoch": 0.9315555555555556,
      "grad_norm": 2.0360938733984963,
      "learning_rate": 8.709441458928853e-06,
      "loss": 0.3197,
      "step": 2620
    },
    {
      "epoch": 0.9351111111111111,
      "grad_norm": 1.6918095124182533,
      "learning_rate": 8.695536506640369e-06,
      "loss": 0.3349,
      "step": 2630
    },
    {
      "epoch": 0.9386666666666666,
      "grad_norm": 1.561883507817091,
      "learning_rate": 8.681568274679264e-06,
      "loss": 0.3357,
      "step": 2640
    },
    {
      "epoch": 0.9422222222222222,
      "grad_norm": 1.635386123467993,
      "learning_rate": 8.66753700222735e-06,
      "loss": 0.3023,
      "step": 2650
    },
    {
      "epoch": 0.9457777777777778,
      "grad_norm": 1.6460980849436542,
      "learning_rate": 8.653442929545914e-06,
      "loss": 0.3482,
      "step": 2660
    },
    {
      "epoch": 0.9493333333333334,
      "grad_norm": 1.8476260091970051,
      "learning_rate": 8.639286297971575e-06,
      "loss": 0.3111,
      "step": 2670
    },
    {
      "epoch": 0.9528888888888889,
      "grad_norm": 1.5625524365842092,
      "learning_rate": 8.625067349912171e-06,
      "loss": 0.3333,
      "step": 2680
    },
    {
      "epoch": 0.9564444444444444,
      "grad_norm": 1.679549783886682,
      "learning_rate": 8.610786328842602e-06,
      "loss": 0.3012,
      "step": 2690
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.7334271987057313,
      "learning_rate": 8.59644347930066e-06,
      "loss": 0.3158,
      "step": 2700
    },
    {
      "epoch": 0.9635555555555556,
      "grad_norm": 1.7183702234532738,
      "learning_rate": 8.582039046882842e-06,
      "loss": 0.3045,
      "step": 2710
    },
    {
      "epoch": 0.9671111111111111,
      "grad_norm": 1.677327314139312,
      "learning_rate": 8.567573278240147e-06,
      "loss": 0.3379,
      "step": 2720
    },
    {
      "epoch": 0.9706666666666667,
      "grad_norm": 1.4197759922345252,
      "learning_rate": 8.55304642107385e-06,
      "loss": 0.3376,
      "step": 2730
    },
    {
      "epoch": 0.9742222222222222,
      "grad_norm": 1.7365860935410007,
      "learning_rate": 8.538458724131258e-06,
      "loss": 0.3395,
      "step": 2740
    },
    {
      "epoch": 0.9777777777777777,
      "grad_norm": 1.5642529718868006,
      "learning_rate": 8.523810437201463e-06,
      "loss": 0.3105,
      "step": 2750
    },
    {
      "epoch": 0.9813333333333333,
      "grad_norm": 1.6285786801359268,
      "learning_rate": 8.509101811111045e-06,
      "loss": 0.314,
      "step": 2760
    },
    {
      "epoch": 0.9848888888888889,
      "grad_norm": 1.7932095997349375,
      "learning_rate": 8.494333097719795e-06,
      "loss": 0.3183,
      "step": 2770
    },
    {
      "epoch": 0.9884444444444445,
      "grad_norm": 1.7636055661476138,
      "learning_rate": 8.479504549916393e-06,
      "loss": 0.3459,
      "step": 2780
    },
    {
      "epoch": 0.992,
      "grad_norm": 1.7893218283734698,
      "learning_rate": 8.464616421614077e-06,
      "loss": 0.3655,
      "step": 2790
    },
    {
      "epoch": 0.9955555555555555,
      "grad_norm": 1.56040627840869,
      "learning_rate": 8.449668967746303e-06,
      "loss": 0.3145,
      "step": 2800
    },
    {
      "epoch": 0.9991111111111111,
      "grad_norm": 1.7372692555117912,
      "learning_rate": 8.434662444262374e-06,
      "loss": 0.3152,
      "step": 2810
    },
    {
      "epoch": 1.0026666666666666,
      "grad_norm": 1.3178611516659062,
      "learning_rate": 8.419597108123054e-06,
      "loss": 0.256,
      "step": 2820
    },
    {
      "epoch": 1.0062222222222221,
      "grad_norm": 1.7641513434209246,
      "learning_rate": 8.404473217296174e-06,
      "loss": 0.2304,
      "step": 2830
    },
    {
      "epoch": 1.0097777777777779,
      "grad_norm": 1.702777106397184,
      "learning_rate": 8.389291030752215e-06,
      "loss": 0.2451,
      "step": 2840
    },
    {
      "epoch": 1.0133333333333334,
      "grad_norm": 1.516656565976496,
      "learning_rate": 8.37405080845987e-06,
      "loss": 0.2463,
      "step": 2850
    },
    {
      "epoch": 1.016888888888889,
      "grad_norm": 1.2615996283177406,
      "learning_rate": 8.358752811381592e-06,
      "loss": 0.2439,
      "step": 2860
    },
    {
      "epoch": 1.0204444444444445,
      "grad_norm": 1.2426761993789008,
      "learning_rate": 8.343397301469127e-06,
      "loss": 0.2301,
      "step": 2870
    },
    {
      "epoch": 1.024,
      "grad_norm": 1.7414567869166766,
      "learning_rate": 8.327984541659035e-06,
      "loss": 0.26,
      "step": 2880
    },
    {
      "epoch": 1.0275555555555556,
      "grad_norm": 1.778546754169589,
      "learning_rate": 8.312514795868177e-06,
      "loss": 0.2537,
      "step": 2890
    },
    {
      "epoch": 1.031111111111111,
      "grad_norm": 1.693194016869835,
      "learning_rate": 8.296988328989195e-06,
      "loss": 0.2474,
      "step": 2900
    },
    {
      "epoch": 1.0346666666666666,
      "grad_norm": 1.4905129718116352,
      "learning_rate": 8.281405406885992e-06,
      "loss": 0.2259,
      "step": 2910
    },
    {
      "epoch": 1.0382222222222222,
      "grad_norm": 1.6844431624217413,
      "learning_rate": 8.265766296389164e-06,
      "loss": 0.2206,
      "step": 2920
    },
    {
      "epoch": 1.0417777777777777,
      "grad_norm": 1.4064579919162583,
      "learning_rate": 8.250071265291432e-06,
      "loss": 0.2498,
      "step": 2930
    },
    {
      "epoch": 1.0453333333333332,
      "grad_norm": 1.4383166925160618,
      "learning_rate": 8.23432058234307e-06,
      "loss": 0.2316,
      "step": 2940
    },
    {
      "epoch": 1.048888888888889,
      "grad_norm": 1.7880359369165812,
      "learning_rate": 8.218514517247287e-06,
      "loss": 0.2421,
      "step": 2950
    },
    {
      "epoch": 1.0524444444444445,
      "grad_norm": 1.49095155848045,
      "learning_rate": 8.202653340655614e-06,
      "loss": 0.2547,
      "step": 2960
    },
    {
      "epoch": 1.056,
      "grad_norm": 1.802867297616481,
      "learning_rate": 8.18673732416328e-06,
      "loss": 0.2609,
      "step": 2970
    },
    {
      "epoch": 1.0595555555555556,
      "grad_norm": 1.799375023246126,
      "learning_rate": 8.170766740304541e-06,
      "loss": 0.2369,
      "step": 2980
    },
    {
      "epoch": 1.0631111111111111,
      "grad_norm": 1.645090115101595,
      "learning_rate": 8.154741862548035e-06,
      "loss": 0.2519,
      "step": 2990
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 1.8315765038402207,
      "learning_rate": 8.13866296529208e-06,
      "loss": 0.2248,
      "step": 3000
    },
    {
      "epoch": 1.0666666666666667,
      "eval_loss": 0.23144060373306274,
      "eval_runtime": 562.045,
      "eval_samples_per_second": 17.792,
      "eval_steps_per_second": 4.448,
      "step": 3000
    },
    {
      "epoch": 1.0702222222222222,
      "grad_norm": 1.3604786834079945,
      "learning_rate": 8.122530323859992e-06,
      "loss": 0.2494,
      "step": 3010
    },
    {
      "epoch": 1.0737777777777777,
      "grad_norm": 1.472974815302568,
      "learning_rate": 8.106344214495359e-06,
      "loss": 0.2168,
      "step": 3020
    },
    {
      "epoch": 1.0773333333333333,
      "grad_norm": 1.9232740710019078,
      "learning_rate": 8.090104914357316e-06,
      "loss": 0.2544,
      "step": 3030
    },
    {
      "epoch": 1.0808888888888888,
      "grad_norm": 1.6517745707358162,
      "learning_rate": 8.073812701515799e-06,
      "loss": 0.2362,
      "step": 3040
    },
    {
      "epoch": 1.0844444444444445,
      "grad_norm": 1.5375717590050721,
      "learning_rate": 8.057467854946783e-06,
      "loss": 0.238,
      "step": 3050
    },
    {
      "epoch": 1.088,
      "grad_norm": 1.736104134714019,
      "learning_rate": 8.041070654527498e-06,
      "loss": 0.2329,
      "step": 3060
    },
    {
      "epoch": 1.0915555555555556,
      "grad_norm": 1.578126670290498,
      "learning_rate": 8.024621381031654e-06,
      "loss": 0.2525,
      "step": 3070
    },
    {
      "epoch": 1.0951111111111111,
      "grad_norm": 1.2995445031583646,
      "learning_rate": 8.008120316124612e-06,
      "loss": 0.2378,
      "step": 3080
    },
    {
      "epoch": 1.0986666666666667,
      "grad_norm": 1.9084352174123695,
      "learning_rate": 7.991567742358582e-06,
      "loss": 0.2469,
      "step": 3090
    },
    {
      "epoch": 1.1022222222222222,
      "grad_norm": 1.6004292294784017,
      "learning_rate": 7.974963943167761e-06,
      "loss": 0.2721,
      "step": 3100
    },
    {
      "epoch": 1.1057777777777777,
      "grad_norm": 1.4738079995177567,
      "learning_rate": 7.958309202863506e-06,
      "loss": 0.2457,
      "step": 3110
    },
    {
      "epoch": 1.1093333333333333,
      "grad_norm": 1.5493675656690653,
      "learning_rate": 7.941603806629444e-06,
      "loss": 0.2274,
      "step": 3120
    },
    {
      "epoch": 1.1128888888888888,
      "grad_norm": 1.6554292154622638,
      "learning_rate": 7.9248480405166e-06,
      "loss": 0.2595,
      "step": 3130
    },
    {
      "epoch": 1.1164444444444444,
      "grad_norm": 1.6112904935857704,
      "learning_rate": 7.908042191438497e-06,
      "loss": 0.2374,
      "step": 3140
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.4663251499352947,
      "learning_rate": 7.891186547166238e-06,
      "loss": 0.2128,
      "step": 3150
    },
    {
      "epoch": 1.1235555555555556,
      "grad_norm": 1.8636139047215206,
      "learning_rate": 7.874281396323589e-06,
      "loss": 0.2263,
      "step": 3160
    },
    {
      "epoch": 1.1271111111111112,
      "grad_norm": 1.6257921444204015,
      "learning_rate": 7.857327028382025e-06,
      "loss": 0.2392,
      "step": 3170
    },
    {
      "epoch": 1.1306666666666667,
      "grad_norm": 1.4066061759358834,
      "learning_rate": 7.84032373365578e-06,
      "loss": 0.2342,
      "step": 3180
    },
    {
      "epoch": 1.1342222222222222,
      "grad_norm": 1.5852680151393,
      "learning_rate": 7.823271803296876e-06,
      "loss": 0.2271,
      "step": 3190
    },
    {
      "epoch": 1.1377777777777778,
      "grad_norm": 1.7721860252109063,
      "learning_rate": 7.80617152929014e-06,
      "loss": 0.2376,
      "step": 3200
    },
    {
      "epoch": 1.1413333333333333,
      "grad_norm": 1.8867413038702499,
      "learning_rate": 7.789023204448189e-06,
      "loss": 0.2516,
      "step": 3210
    },
    {
      "epoch": 1.1448888888888888,
      "grad_norm": 1.4279840133381525,
      "learning_rate": 7.771827122406437e-06,
      "loss": 0.2265,
      "step": 3220
    },
    {
      "epoch": 1.1484444444444444,
      "grad_norm": 1.676800279171029,
      "learning_rate": 7.754583577618057e-06,
      "loss": 0.2554,
      "step": 3230
    },
    {
      "epoch": 1.152,
      "grad_norm": 1.6723494127405627,
      "learning_rate": 7.737292865348933e-06,
      "loss": 0.2408,
      "step": 3240
    },
    {
      "epoch": 1.1555555555555554,
      "grad_norm": 1.6148606083372026,
      "learning_rate": 7.719955281672618e-06,
      "loss": 0.2287,
      "step": 3250
    },
    {
      "epoch": 1.1591111111111112,
      "grad_norm": 1.6092526546730486,
      "learning_rate": 7.702571123465252e-06,
      "loss": 0.237,
      "step": 3260
    },
    {
      "epoch": 1.1626666666666667,
      "grad_norm": 1.3380193435685535,
      "learning_rate": 7.685140688400484e-06,
      "loss": 0.2393,
      "step": 3270
    },
    {
      "epoch": 1.1662222222222223,
      "grad_norm": 1.3406231671146336,
      "learning_rate": 7.66766427494438e-06,
      "loss": 0.2158,
      "step": 3280
    },
    {
      "epoch": 1.1697777777777778,
      "grad_norm": 1.5365708586926026,
      "learning_rate": 7.650142182350294e-06,
      "loss": 0.201,
      "step": 3290
    },
    {
      "epoch": 1.1733333333333333,
      "grad_norm": 1.7847958889549216,
      "learning_rate": 7.632574710653773e-06,
      "loss": 0.2627,
      "step": 3300
    },
    {
      "epoch": 1.1768888888888889,
      "grad_norm": 1.4770511975662048,
      "learning_rate": 7.614962160667384e-06,
      "loss": 0.221,
      "step": 3310
    },
    {
      "epoch": 1.1804444444444444,
      "grad_norm": 1.8043230337610534,
      "learning_rate": 7.597304833975596e-06,
      "loss": 0.2419,
      "step": 3320
    },
    {
      "epoch": 1.184,
      "grad_norm": 1.9363141324764201,
      "learning_rate": 7.579603032929597e-06,
      "loss": 0.2572,
      "step": 3330
    },
    {
      "epoch": 1.1875555555555555,
      "grad_norm": 1.600071864532325,
      "learning_rate": 7.56185706064212e-06,
      "loss": 0.2462,
      "step": 3340
    },
    {
      "epoch": 1.1911111111111112,
      "grad_norm": 1.5785414115422856,
      "learning_rate": 7.544067220982254e-06,
      "loss": 0.2312,
      "step": 3350
    },
    {
      "epoch": 1.1946666666666665,
      "grad_norm": 1.5789285671514135,
      "learning_rate": 7.526233818570245e-06,
      "loss": 0.2067,
      "step": 3360
    },
    {
      "epoch": 1.1982222222222223,
      "grad_norm": 1.7448328186975814,
      "learning_rate": 7.508357158772273e-06,
      "loss": 0.2448,
      "step": 3370
    },
    {
      "epoch": 1.2017777777777778,
      "grad_norm": 1.4619128557517416,
      "learning_rate": 7.490437547695224e-06,
      "loss": 0.2194,
      "step": 3380
    },
    {
      "epoch": 1.2053333333333334,
      "grad_norm": 1.6063307731749306,
      "learning_rate": 7.472475292181454e-06,
      "loss": 0.2501,
      "step": 3390
    },
    {
      "epoch": 1.208888888888889,
      "grad_norm": 1.9510115721688825,
      "learning_rate": 7.45447069980353e-06,
      "loss": 0.2515,
      "step": 3400
    },
    {
      "epoch": 1.2124444444444444,
      "grad_norm": 1.5856572080139135,
      "learning_rate": 7.4364240788589625e-06,
      "loss": 0.2461,
      "step": 3410
    },
    {
      "epoch": 1.216,
      "grad_norm": 1.846941973796494,
      "learning_rate": 7.418335738364931e-06,
      "loss": 0.2241,
      "step": 3420
    },
    {
      "epoch": 1.2195555555555555,
      "grad_norm": 1.8886992728965029,
      "learning_rate": 7.400205988052991e-06,
      "loss": 0.2298,
      "step": 3430
    },
    {
      "epoch": 1.223111111111111,
      "grad_norm": 1.6140767527032074,
      "learning_rate": 7.382035138363764e-06,
      "loss": 0.2516,
      "step": 3440
    },
    {
      "epoch": 1.2266666666666666,
      "grad_norm": 1.637777869962237,
      "learning_rate": 7.363823500441636e-06,
      "loss": 0.2422,
      "step": 3450
    },
    {
      "epoch": 1.2302222222222223,
      "grad_norm": 1.3783132940885547,
      "learning_rate": 7.345571386129413e-06,
      "loss": 0.2368,
      "step": 3460
    },
    {
      "epoch": 1.2337777777777779,
      "grad_norm": 1.750318456803832,
      "learning_rate": 7.327279107962995e-06,
      "loss": 0.2488,
      "step": 3470
    },
    {
      "epoch": 1.2373333333333334,
      "grad_norm": 1.7745176716418858,
      "learning_rate": 7.308946979166012e-06,
      "loss": 0.2277,
      "step": 3480
    },
    {
      "epoch": 1.240888888888889,
      "grad_norm": 1.7469697925399752,
      "learning_rate": 7.290575313644476e-06,
      "loss": 0.2329,
      "step": 3490
    },
    {
      "epoch": 1.2444444444444445,
      "grad_norm": 1.4439208816879574,
      "learning_rate": 7.272164425981387e-06,
      "loss": 0.2575,
      "step": 3500
    },
    {
      "epoch": 1.2444444444444445,
      "eval_loss": 0.22694812715053558,
      "eval_runtime": 564.2235,
      "eval_samples_per_second": 17.723,
      "eval_steps_per_second": 4.431,
      "step": 3500
    },
    {
      "epoch": 1.248,
      "grad_norm": 1.5767155030054063,
      "learning_rate": 7.253714631431366e-06,
      "loss": 0.2492,
      "step": 3510
    },
    {
      "epoch": 1.2515555555555555,
      "grad_norm": 1.5655624730827595,
      "learning_rate": 7.235226245915239e-06,
      "loss": 0.2259,
      "step": 3520
    },
    {
      "epoch": 1.255111111111111,
      "grad_norm": 1.8883245133962092,
      "learning_rate": 7.216699586014642e-06,
      "loss": 0.2487,
      "step": 3530
    },
    {
      "epoch": 1.2586666666666666,
      "grad_norm": 1.2903228684726653,
      "learning_rate": 7.198134968966588e-06,
      "loss": 0.2341,
      "step": 3540
    },
    {
      "epoch": 1.2622222222222224,
      "grad_norm": 1.6585013961180077,
      "learning_rate": 7.179532712658047e-06,
      "loss": 0.2625,
      "step": 3550
    },
    {
      "epoch": 1.2657777777777777,
      "grad_norm": 1.4955952405740183,
      "learning_rate": 7.160893135620488e-06,
      "loss": 0.2602,
      "step": 3560
    },
    {
      "epoch": 1.2693333333333334,
      "grad_norm": 1.8286387441617464,
      "learning_rate": 7.142216557024443e-06,
      "loss": 0.2221,
      "step": 3570
    },
    {
      "epoch": 1.272888888888889,
      "grad_norm": 1.6146123865735058,
      "learning_rate": 7.123503296674021e-06,
      "loss": 0.247,
      "step": 3580
    },
    {
      "epoch": 1.2764444444444444,
      "grad_norm": 1.4700165794501387,
      "learning_rate": 7.104753675001453e-06,
      "loss": 0.2405,
      "step": 3590
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.4475828320209072,
      "learning_rate": 7.085968013061585e-06,
      "loss": 0.2452,
      "step": 3600
    },
    {
      "epoch": 1.2835555555555556,
      "grad_norm": 1.9854917772925798,
      "learning_rate": 7.067146632526398e-06,
      "loss": 0.2813,
      "step": 3610
    },
    {
      "epoch": 1.287111111111111,
      "grad_norm": 1.863775670718366,
      "learning_rate": 7.048289855679487e-06,
      "loss": 0.2272,
      "step": 3620
    },
    {
      "epoch": 1.2906666666666666,
      "grad_norm": 2.0238745081645693,
      "learning_rate": 7.029398005410551e-06,
|
"loss": 0.2588, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 1.2942222222222222, |
|
"grad_norm": 1.8729516419448864, |
|
"learning_rate": 7.01047140520986e-06, |
|
"loss": 0.2403, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 1.2977777777777777, |
|
"grad_norm": 1.721501900738319, |
|
"learning_rate": 6.9915103791627146e-06, |
|
"loss": 0.2477, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.3013333333333335, |
|
"grad_norm": 1.6626021007269847, |
|
"learning_rate": 6.972515251943901e-06, |
|
"loss": 0.2279, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.3048888888888888, |
|
"grad_norm": 1.6716430135185554, |
|
"learning_rate": 6.953486348812127e-06, |
|
"loss": 0.2414, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 1.3084444444444445, |
|
"grad_norm": 1.4291636119458788, |
|
"learning_rate": 6.934423995604455e-06, |
|
"loss": 0.248, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 1.4674689793023254, |
|
"learning_rate": 6.915328518730724e-06, |
|
"loss": 0.2459, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 1.3155555555555556, |
|
"grad_norm": 1.5215618690023482, |
|
"learning_rate": 6.896200245167956e-06, |
|
"loss": 0.2546, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.3191111111111111, |
|
"grad_norm": 1.67624683709797, |
|
"learning_rate": 6.877039502454758e-06, |
|
"loss": 0.2006, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 1.3226666666666667, |
|
"grad_norm": 1.552246698817707, |
|
"learning_rate": 6.857846618685724e-06, |
|
"loss": 0.2213, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.3262222222222222, |
|
"grad_norm": 2.021180154460745, |
|
"learning_rate": 6.8386219225057945e-06, |
|
"loss": 0.2315, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 1.3297777777777777, |
|
"grad_norm": 1.8378386656471875, |
|
"learning_rate": 6.819365743104655e-06, |
|
"loss": 0.2235, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 1.8383503621089257, |
|
"learning_rate": 6.8000784102110795e-06, |
|
"loss": 0.2348, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.3368888888888888, |
|
"grad_norm": 1.476660408503267, |
|
"learning_rate": 6.780760254087293e-06, |
|
"loss": 0.2433, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.3404444444444445, |
|
"grad_norm": 1.6056267413924534, |
|
"learning_rate": 6.7614116055233146e-06, |
|
"loss": 0.2511, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 1.5433968607865032, |
|
"learning_rate": 6.742032795831298e-06, |
|
"loss": 0.2218, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.3475555555555556, |
|
"grad_norm": 1.8752695620093498, |
|
"learning_rate": 6.722624156839847e-06, |
|
"loss": 0.2607, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 1.3511111111111112, |
|
"grad_norm": 1.7018274048947808, |
|
"learning_rate": 6.703186020888347e-06, |
|
"loss": 0.2434, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.3546666666666667, |
|
"grad_norm": 1.7419410223233012, |
|
"learning_rate": 6.683718720821264e-06, |
|
"loss": 0.2494, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.3582222222222222, |
|
"grad_norm": 1.5145074056393906, |
|
"learning_rate": 6.664222589982451e-06, |
|
"loss": 0.2215, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.3617777777777778, |
|
"grad_norm": 1.2846516741089247, |
|
"learning_rate": 6.644697962209434e-06, |
|
"loss": 0.2346, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.3653333333333333, |
|
"grad_norm": 1.4951097829345636, |
|
"learning_rate": 6.6251451718277095e-06, |
|
"loss": 0.2122, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.3688888888888888, |
|
"grad_norm": 1.837176746272441, |
|
"learning_rate": 6.605564553644998e-06, |
|
"loss": 0.2289, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.3724444444444446, |
|
"grad_norm": 1.7541861945923773, |
|
"learning_rate": 6.585956442945531e-06, |
|
"loss": 0.2304, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 1.456084798251464, |
|
"learning_rate": 6.566321175484298e-06, |
|
"loss": 0.2524, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.3795555555555556, |
|
"grad_norm": 1.4021880078388174, |
|
"learning_rate": 6.546659087481304e-06, |
|
"loss": 0.2344, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.3831111111111112, |
|
"grad_norm": 1.386759603833687, |
|
"learning_rate": 6.526970515615807e-06, |
|
"loss": 0.2278, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.3866666666666667, |
|
"grad_norm": 1.9340717544487618, |
|
"learning_rate": 6.507255797020555e-06, |
|
"loss": 0.2299, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.3902222222222222, |
|
"grad_norm": 1.4309730673942778, |
|
"learning_rate": 6.487515269276015e-06, |
|
"loss": 0.2518, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.3937777777777778, |
|
"grad_norm": 1.5432073955843775, |
|
"learning_rate": 6.467749270404593e-06, |
|
"loss": 0.2196, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.3973333333333333, |
|
"grad_norm": 1.5255820019311863, |
|
"learning_rate": 6.4479581388648404e-06, |
|
"loss": 0.2527, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.4008888888888889, |
|
"grad_norm": 1.9387048217346732, |
|
"learning_rate": 6.428142213545662e-06, |
|
"loss": 0.2663, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.4044444444444444, |
|
"grad_norm": 1.4687424654762213, |
|
"learning_rate": 6.408301833760517e-06, |
|
"loss": 0.2141, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 1.6790491256350315, |
|
"learning_rate": 6.388437339241601e-06, |
|
"loss": 0.2419, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.4115555555555557, |
|
"grad_norm": 1.4986463255132796, |
|
"learning_rate": 6.368549070134036e-06, |
|
"loss": 0.2205, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.415111111111111, |
|
"grad_norm": 1.8639041315873657, |
|
"learning_rate": 6.348637366990038e-06, |
|
"loss": 0.2403, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.4186666666666667, |
|
"grad_norm": 1.8313804556837663, |
|
"learning_rate": 6.328702570763098e-06, |
|
"loss": 0.243, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.4222222222222223, |
|
"grad_norm": 1.6288666479905434, |
|
"learning_rate": 6.308745022802128e-06, |
|
"loss": 0.2376, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.4222222222222223, |
|
"eval_loss": 0.22332721948623657, |
|
"eval_runtime": 562.4439, |
|
"eval_samples_per_second": 17.78, |
|
"eval_steps_per_second": 4.445, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.4257777777777778, |
|
"grad_norm": 1.28363469470016, |
|
"learning_rate": 6.288765064845629e-06, |
|
"loss": 0.2119, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.4293333333333333, |
|
"grad_norm": 1.5685400141436767, |
|
"learning_rate": 6.268763039015833e-06, |
|
"loss": 0.2372, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.4328888888888889, |
|
"grad_norm": 1.2419732210599121, |
|
"learning_rate": 6.248739287812846e-06, |
|
"loss": 0.2378, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.4364444444444444, |
|
"grad_norm": 1.450791049105233, |
|
"learning_rate": 6.228694154108783e-06, |
|
"loss": 0.236, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.3478041984965912, |
|
"learning_rate": 6.208627981141902e-06, |
|
"loss": 0.2165, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.4435555555555555, |
|
"grad_norm": 1.6880548918845273, |
|
"learning_rate": 6.188541112510713e-06, |
|
"loss": 0.2405, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.447111111111111, |
|
"grad_norm": 1.489941080547117, |
|
"learning_rate": 6.168433892168113e-06, |
|
"loss": 0.2288, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.4506666666666668, |
|
"grad_norm": 2.036909885440752, |
|
"learning_rate": 6.148306664415476e-06, |
|
"loss": 0.235, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.4542222222222223, |
|
"grad_norm": 1.60733518117776, |
|
"learning_rate": 6.128159773896783e-06, |
|
"loss": 0.2143, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.4577777777777778, |
|
"grad_norm": 1.6002205563066152, |
|
"learning_rate": 6.107993565592693e-06, |
|
"loss": 0.239, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.4613333333333334, |
|
"grad_norm": 1.59924513215813, |
|
"learning_rate": 6.087808384814652e-06, |
|
"loss": 0.2185, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.464888888888889, |
|
"grad_norm": 1.6651512334739322, |
|
"learning_rate": 6.067604577198981e-06, |
|
"loss": 0.238, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.4684444444444444, |
|
"grad_norm": 1.6551324049801701, |
|
"learning_rate": 6.04738248870095e-06, |
|
"loss": 0.2238, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 1.5301258421668906, |
|
"learning_rate": 6.027142465588855e-06, |
|
"loss": 0.2453, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.4755555555555555, |
|
"grad_norm": 1.8144546212524773, |
|
"learning_rate": 6.006884854438099e-06, |
|
"loss": 0.2375, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.479111111111111, |
|
"grad_norm": 1.5099593511650293, |
|
"learning_rate": 5.9866100021252415e-06, |
|
"loss": 0.2331, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.4826666666666668, |
|
"grad_norm": 1.502590510458408, |
|
"learning_rate": 5.966318255822072e-06, |
|
"loss": 0.2131, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.4862222222222221, |
|
"grad_norm": 1.7399671557461471, |
|
"learning_rate": 5.946009962989659e-06, |
|
"loss": 0.243, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.4897777777777779, |
|
"grad_norm": 1.959843593418678, |
|
"learning_rate": 5.9256854713724e-06, |
|
"loss": 0.2344, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 1.4933333333333334, |
|
"grad_norm": 1.5187384802338688, |
|
"learning_rate": 5.905345128992072e-06, |
|
"loss": 0.2372, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.496888888888889, |
|
"grad_norm": 1.713913961820143, |
|
"learning_rate": 5.884989284141866e-06, |
|
"loss": 0.2137, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 1.5004444444444445, |
|
"grad_norm": 1.5301932679943313, |
|
"learning_rate": 5.86461828538043e-06, |
|
"loss": 0.2264, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 1.6650108469792486, |
|
"learning_rate": 5.84423248152589e-06, |
|
"loss": 0.2167, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.5075555555555555, |
|
"grad_norm": 1.7377610919859674, |
|
"learning_rate": 5.82383222164989e-06, |
|
"loss": 0.2223, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.511111111111111, |
|
"grad_norm": 1.8280200619954592, |
|
"learning_rate": 5.803417855071603e-06, |
|
"loss": 0.2361, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.5146666666666668, |
|
"grad_norm": 1.7315368181217787, |
|
"learning_rate": 5.782989731351762e-06, |
|
"loss": 0.2665, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.5182222222222221, |
|
"grad_norm": 1.6917154736502973, |
|
"learning_rate": 5.762548200286659e-06, |
|
"loss": 0.212, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 1.521777777777778, |
|
"grad_norm": 1.5262051452408105, |
|
"learning_rate": 5.742093611902168e-06, |
|
"loss": 0.2142, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.5253333333333332, |
|
"grad_norm": 1.4955231464253305, |
|
"learning_rate": 5.721626316447748e-06, |
|
"loss": 0.2302, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 1.528888888888889, |
|
"grad_norm": 1.729596636954076, |
|
"learning_rate": 5.7011466643904434e-06, |
|
"loss": 0.2209, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.5324444444444445, |
|
"grad_norm": 1.470928828267314, |
|
"learning_rate": 5.680655006408882e-06, |
|
"loss": 0.2398, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 1.4046672488847465, |
|
"learning_rate": 5.660151693387273e-06, |
|
"loss": 0.2335, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.5395555555555556, |
|
"grad_norm": 1.6687999325358385, |
|
"learning_rate": 5.639637076409404e-06, |
|
"loss": 0.2207, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 1.543111111111111, |
|
"grad_norm": 1.60564618911301, |
|
"learning_rate": 5.6191115067526135e-06, |
|
"loss": 0.2411, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 1.5466666666666666, |
|
"grad_norm": 1.6047937970455775, |
|
"learning_rate": 5.598575335881792e-06, |
|
"loss": 0.2161, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.5502222222222222, |
|
"grad_norm": 1.3451412373708476, |
|
"learning_rate": 5.578028915443356e-06, |
|
"loss": 0.2104, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 1.553777777777778, |
|
"grad_norm": 1.827680836587444, |
|
"learning_rate": 5.55747259725923e-06, |
|
"loss": 0.2333, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 1.5573333333333332, |
|
"grad_norm": 1.8474659285597943, |
|
"learning_rate": 5.536906733320816e-06, |
|
"loss": 0.2447, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 1.560888888888889, |
|
"grad_norm": 1.5571932949328393, |
|
"learning_rate": 5.516331675782973e-06, |
|
"loss": 0.2445, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 1.5644444444444443, |
|
"grad_norm": 1.9294806844289611, |
|
"learning_rate": 5.495747776957987e-06, |
|
"loss": 0.2382, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 1.3637347529801744, |
|
"learning_rate": 5.475155389309531e-06, |
|
"loss": 0.2162, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 1.5715555555555556, |
|
"grad_norm": 1.552594376889073, |
|
"learning_rate": 5.4545548654466366e-06, |
|
"loss": 0.2351, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 1.5751111111111111, |
|
"grad_norm": 1.563596866564994, |
|
"learning_rate": 5.433946558117654e-06, |
|
"loss": 0.2259, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 1.5786666666666667, |
|
"grad_norm": 1.9424477147575314, |
|
"learning_rate": 5.413330820204214e-06, |
|
"loss": 0.2269, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 1.5822222222222222, |
|
"grad_norm": 1.7161442287459214, |
|
"learning_rate": 5.392708004715178e-06, |
|
"loss": 0.233, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.5857777777777777, |
|
"grad_norm": 1.4458518805717744, |
|
"learning_rate": 5.372078464780603e-06, |
|
"loss": 0.2428, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 1.5893333333333333, |
|
"grad_norm": 1.7197914268509118, |
|
"learning_rate": 5.351442553645691e-06, |
|
"loss": 0.2095, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 1.592888888888889, |
|
"grad_norm": 1.7871712697682276, |
|
"learning_rate": 5.330800624664736e-06, |
|
"loss": 0.2375, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 1.5964444444444443, |
|
"grad_norm": 1.6154295338481346, |
|
"learning_rate": 5.310153031295079e-06, |
|
"loss": 0.2365, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.8622833358204558, |
|
"learning_rate": 5.289500127091056e-06, |
|
"loss": 0.2521, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.22019484639167786, |
|
"eval_runtime": 562.6101, |
|
"eval_samples_per_second": 17.774, |
|
"eval_steps_per_second": 4.444, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.6035555555555554, |
|
"grad_norm": 1.4160865462023664, |
|
"learning_rate": 5.26884226569794e-06, |
|
"loss": 0.2445, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 1.6071111111111112, |
|
"grad_norm": 1.6982387533503471, |
|
"learning_rate": 5.248179800845884e-06, |
|
"loss": 0.2586, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 1.6106666666666667, |
|
"grad_norm": 1.8063057152671183, |
|
"learning_rate": 5.227513086343875e-06, |
|
"loss": 0.2342, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 1.6142222222222222, |
|
"grad_norm": 1.8369946808465265, |
|
"learning_rate": 5.20684247607366e-06, |
|
"loss": 0.2149, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 1.6177777777777778, |
|
"grad_norm": 1.4919743522204885, |
|
"learning_rate": 5.186168323983702e-06, |
|
"loss": 0.2361, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.6213333333333333, |
|
"grad_norm": 1.908909797085476, |
|
"learning_rate": 5.1654909840831e-06, |
|
"loss": 0.2422, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 1.624888888888889, |
|
"grad_norm": 1.6970594817568836, |
|
"learning_rate": 5.144810810435553e-06, |
|
"loss": 0.2702, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 1.6284444444444444, |
|
"grad_norm": 1.914631182858778, |
|
"learning_rate": 5.124128157153273e-06, |
|
"loss": 0.211, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 1.8308898752074714, |
|
"learning_rate": 5.103443378390935e-06, |
|
"loss": 0.213, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 1.6355555555555554, |
|
"grad_norm": 1.4716155031307734, |
|
"learning_rate": 5.08275682833961e-06, |
|
"loss": 0.2348, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.6391111111111112, |
|
"grad_norm": 1.3846959035420932, |
|
"learning_rate": 5.062068861220697e-06, |
|
"loss": 0.2323, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 1.6426666666666667, |
|
"grad_norm": 1.310528332429156, |
|
"learning_rate": 5.041379831279859e-06, |
|
"loss": 0.2274, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 1.6462222222222223, |
|
"grad_norm": 1.56294035415104, |
|
"learning_rate": 5.020690092780961e-06, |
|
"loss": 0.2382, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 1.6497777777777778, |
|
"grad_norm": 1.797053581769004, |
|
"learning_rate": 5e-06, |
|
"loss": 0.2263, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 1.6533333333333333, |
|
"grad_norm": 1.57684485333151, |
|
"learning_rate": 4.9793099072190406e-06, |
|
"loss": 0.2225, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.6568888888888889, |
|
"grad_norm": 2.0411280702141883, |
|
"learning_rate": 4.958620168720144e-06, |
|
"loss": 0.2225, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 1.6604444444444444, |
|
"grad_norm": 1.476641016823167, |
|
"learning_rate": 4.937931138779305e-06, |
|
"loss": 0.2438, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 1.4259185034698016, |
|
"learning_rate": 4.917243171660391e-06, |
|
"loss": 0.2127, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 1.6675555555555555, |
|
"grad_norm": 1.9925037267732388, |
|
"learning_rate": 4.896556621609066e-06, |
|
"loss": 0.223, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 1.6711111111111112, |
|
"grad_norm": 1.3845653896887404, |
|
"learning_rate": 4.8758718428467275e-06, |
|
"loss": 0.2332, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.6746666666666665, |
|
"grad_norm": 1.5936847174408162, |
|
"learning_rate": 4.8551891895644485e-06, |
|
"loss": 0.2381, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 1.6782222222222223, |
|
"grad_norm": 1.8741655887113169, |
|
"learning_rate": 4.8345090159169015e-06, |
|
"loss": 0.2182, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 1.6817777777777778, |
|
"grad_norm": 2.0577120951961057, |
|
"learning_rate": 4.813831676016301e-06, |
|
"loss": 0.2323, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 1.6853333333333333, |
|
"grad_norm": 1.6887655358314864, |
|
"learning_rate": 4.793157523926343e-06, |
|
"loss": 0.2236, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 1.6888888888888889, |
|
"grad_norm": 1.669624887759933, |
|
"learning_rate": 4.772486913656126e-06, |
|
"loss": 0.216, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.6924444444444444, |
|
"grad_norm": 1.3957590014036165, |
|
"learning_rate": 4.751820199154116e-06, |
|
"loss": 0.2104, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"grad_norm": 1.7601085948001791, |
|
"learning_rate": 4.731157734302063e-06, |
|
"loss": 0.2255, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 1.6995555555555555, |
|
"grad_norm": 1.4141936030167341, |
|
"learning_rate": 4.7104998729089456e-06, |
|
"loss": 0.2216, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 1.7031111111111112, |
|
"grad_norm": 1.5375991664201998, |
|
"learning_rate": 4.689846968704921e-06, |
|
"loss": 0.2316, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 1.7066666666666666, |
|
"grad_norm": 1.835379245628528, |
|
"learning_rate": 4.669199375335267e-06, |
|
"loss": 0.2211, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.7102222222222223, |
|
"grad_norm": 1.8813507703109071, |
|
"learning_rate": 4.64855744635431e-06, |
|
"loss": 0.2279, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 1.7137777777777776, |
|
"grad_norm": 1.6192801344534893, |
|
"learning_rate": 4.627921535219398e-06, |
|
"loss": 0.2076, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 1.7173333333333334, |
|
"grad_norm": 1.5047363033780152, |
|
"learning_rate": 4.607291995284824e-06, |
|
"loss": 0.2272, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 1.720888888888889, |
|
"grad_norm": 1.7489501841705488, |
|
"learning_rate": 4.586669179795789e-06, |
|
"loss": 0.2269, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 1.7244444444444444, |
|
"grad_norm": 1.5125229649844467, |
|
"learning_rate": 4.566053441882346e-06, |
|
"loss": 0.2187, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 1.456492370626904, |
|
"learning_rate": 4.545445134553365e-06, |
|
"loss": 0.2179, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 1.7315555555555555, |
|
"grad_norm": 1.620452560710039, |
|
"learning_rate": 4.52484461069047e-06, |
|
"loss": 0.2262, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 1.7351111111111113, |
|
"grad_norm": 2.0083784630353887, |
|
"learning_rate": 4.504252223042015e-06, |
|
"loss": 0.2363, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 1.7386666666666666, |
|
"grad_norm": 1.4284347298197593, |
|
"learning_rate": 4.4836683242170274e-06, |
|
"loss": 0.2297, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 1.7422222222222223, |
|
"grad_norm": 1.4968259463132965, |
|
"learning_rate": 4.463093266679185e-06, |
|
"loss": 0.2223, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.7457777777777777, |
|
"grad_norm": 1.625381108991568, |
|
"learning_rate": 4.442527402740773e-06, |
|
"loss": 0.2177, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 1.7493333333333334, |
|
"grad_norm": 1.7761034776967624, |
|
"learning_rate": 4.4219710845566445e-06, |
|
"loss": 0.2266, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 1.752888888888889, |
|
"grad_norm": 1.513194923019174, |
|
"learning_rate": 4.401424664118209e-06, |
|
"loss": 0.2385, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 1.7564444444444445, |
|
"grad_norm": 1.6662188116169265, |
|
"learning_rate": 4.380888493247389e-06, |
|
"loss": 0.2209, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.7192566216460916, |
|
"learning_rate": 4.360362923590599e-06, |
|
"loss": 0.2273, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.7635555555555555, |
|
"grad_norm": 1.6376141309754375, |
|
"learning_rate": 4.339848306612726e-06, |
|
"loss": 0.2263, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 1.767111111111111, |
|
"grad_norm": 1.5441961811580323, |
|
"learning_rate": 4.319344993591122e-06, |
|
"loss": 0.2317, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 1.7706666666666666, |
|
"grad_norm": 1.8214320335618939, |
|
"learning_rate": 4.298853335609558e-06, |
|
"loss": 0.2352, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 1.7742222222222224, |
|
"grad_norm": 1.56553607416482, |
|
"learning_rate": 4.278373683552252e-06, |
|
"loss": 0.2451, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 1.3995626238477137, |
|
"learning_rate": 4.257906388097833e-06, |
|
"loss": 0.2119, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"eval_loss": 0.2164340764284134, |
|
"eval_runtime": 560.6747, |
|
"eval_samples_per_second": 17.836, |
|
"eval_steps_per_second": 4.459, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.7813333333333334, |
|
"grad_norm": 2.040538040793932, |
|
"learning_rate": 4.237451799713343e-06, |
|
"loss": 0.2311, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 1.7848888888888887, |
|
"grad_norm": 1.718359867250397, |
|
"learning_rate": 4.2170102686482386e-06, |
|
"loss": 0.2308, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 1.7884444444444445, |
|
"grad_norm": 1.647498620915099, |
|
"learning_rate": 4.196582144928398e-06, |
|
"loss": 0.2343, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 1.529219174043635, |
|
"learning_rate": 4.176167778350111e-06, |
|
"loss": 0.2471, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 1.7955555555555556, |
|
"grad_norm": 1.8299602144032394, |
|
"learning_rate": 4.155767518474112e-06, |
|
"loss": 0.2334, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.799111111111111, |
|
"grad_norm": 1.6343462536475093, |
|
"learning_rate": 4.135381714619572e-06, |
|
"loss": 0.2352, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 1.8026666666666666, |
|
"grad_norm": 1.9294723624845498, |
|
"learning_rate": 4.115010715858135e-06, |
|
"loss": 0.2295, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 1.8062222222222222, |
|
"grad_norm": 1.8402038191366281, |
|
"learning_rate": 4.09465487100793e-06, |
|
"loss": 0.2227, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 1.8097777777777777, |
|
"grad_norm": 1.8931304584295443, |
|
"learning_rate": 4.074314528627602e-06, |
|
"loss": 0.2355, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 1.8133333333333335, |
|
"grad_norm": 1.8206151546804537, |
|
"learning_rate": 4.053990037010342e-06, |
|
"loss": 0.2323, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.8168888888888888, |
|
"grad_norm": 1.5473952396079231, |
|
"learning_rate": 4.033681744177929e-06, |
|
"loss": 0.2069, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 1.8204444444444445, |
|
"grad_norm": 1.2199743932660083, |
|
"learning_rate": 4.013389997874759e-06, |
|
"loss": 0.2076, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 1.7825722106285342, |
|
"learning_rate": 3.993115145561902e-06, |
|
"loss": 0.2425, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 1.8275555555555556, |
|
"grad_norm": 1.8303008392916014, |
|
"learning_rate": 3.9728575344111456e-06, |
|
"loss": 0.234, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 1.8311111111111111, |
|
"grad_norm": 1.2964915164879398, |
|
"learning_rate": 3.9526175112990515e-06, |
|
"loss": 0.1987, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.8346666666666667, |
|
"grad_norm": 1.5700753166440498, |
|
"learning_rate": 3.93239542280102e-06, |
|
"loss": 0.2137, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 1.8382222222222222, |
|
"grad_norm": 1.6406760092620998, |
|
"learning_rate": 3.912191615185349e-06, |
|
"loss": 0.2235, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 1.8417777777777777, |
|
"grad_norm": 1.5447905159493263, |
|
"learning_rate": 3.892006434407309e-06, |
|
"loss": 0.2218, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 1.8453333333333335, |
|
"grad_norm": 1.7383544264235498, |
|
"learning_rate": 3.871840226103219e-06, |
|
"loss": 0.2287, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 1.8488888888888888, |
|
"grad_norm": 1.9317016214891507, |
|
"learning_rate": 3.851693335584525e-06, |
|
"loss": 0.2228, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.8524444444444446, |
|
"grad_norm": 1.5692018080933492, |
|
"learning_rate": 3.831566107831889e-06, |
|
"loss": 0.2331, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 2.050378660719503, |
|
"learning_rate": 3.8114588874892893e-06, |
|
"loss": 0.2137, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 1.8595555555555556, |
|
"grad_norm": 1.5271617708228957, |
|
"learning_rate": 3.791372018858099e-06, |
|
"loss": 0.2135, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 1.8631111111111112, |
|
"grad_norm": 1.31763541419423, |
|
"learning_rate": 3.7713058458912164e-06, |
|
"loss": 0.2217, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 1.8666666666666667, |
|
"grad_norm": 1.6488724873659462, |
|
"learning_rate": 3.751260712187156e-06, |
|
"loss": 0.2539, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.8702222222222222, |
|
"grad_norm": 1.392136229173735, |
|
"learning_rate": 3.731236960984169e-06, |
|
"loss": 0.2179, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 1.8737777777777778, |
|
"grad_norm": 1.6189512718112575, |
|
"learning_rate": 3.711234935154372e-06, |
|
"loss": 0.2183, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 1.8773333333333333, |
|
"grad_norm": 1.5548818693905742, |
|
"learning_rate": 3.6912549771978747e-06, |
|
"loss": 0.2354, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 1.8808888888888888, |
|
"grad_norm": 1.4728328055912387, |
|
"learning_rate": 3.6712974292369035e-06, |
|
"loss": 0.2268, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 1.8844444444444446, |
|
"grad_norm": 1.5435161738551857, |
|
"learning_rate": 3.651362633009962e-06, |
|
"loss": 0.204, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"grad_norm": 1.5873129086509827, |
|
"learning_rate": 3.6314509298659663e-06, |
|
"loss": 0.208, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 1.8915555555555557, |
|
"grad_norm": 1.3391876728975607, |
|
"learning_rate": 3.6115626607584e-06, |
|
"loss": 0.2372, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 1.895111111111111, |
|
"grad_norm": 1.88178920211116, |
|
"learning_rate": 3.5916981662394856e-06, |
|
"loss": 0.2257, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 1.8986666666666667, |
|
"grad_norm": 1.764120901512499, |
|
"learning_rate": 3.5718577864543396e-06, |
|
"loss": 0.2103, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 1.9022222222222223, |
|
"grad_norm": 1.6698875487111986, |
|
"learning_rate": 3.552041861135161e-06, |
|
"loss": 0.211, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.9057777777777778, |
|
"grad_norm": 1.6957349016200651, |
|
"learning_rate": 3.532250729595408e-06, |
|
"loss": 0.2164, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 1.9093333333333333, |
|
"grad_norm": 1.5603565111247202, |
|
"learning_rate": 3.5124847307239863e-06, |
|
"loss": 0.2265, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 1.9128888888888889, |
|
"grad_norm": 1.5529468285695374, |
|
"learning_rate": 3.4927442029794467e-06, |
|
"loss": 0.2316, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 1.9164444444444444, |
|
"grad_norm": 1.7677530671686799, |
|
"learning_rate": 3.473029484384196e-06, |
|
"loss": 0.219, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.9782571884316444, |
|
"learning_rate": 3.4533409125186974e-06, |
|
"loss": 0.2252, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.9235555555555557, |
|
"grad_norm": 1.7371605678560165, |
|
"learning_rate": 3.4336788245157026e-06, |
|
"loss": 0.2222, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 1.927111111111111, |
|
"grad_norm": 1.7241089696999294, |
|
"learning_rate": 3.4140435570544708e-06, |
|
"loss": 0.2345, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 1.9306666666666668, |
|
"grad_norm": 1.7019802310043695, |
|
"learning_rate": 3.3944354463550035e-06, |
|
"loss": 0.214, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 1.934222222222222, |
|
"grad_norm": 1.8394276850187319, |
|
"learning_rate": 3.374854828172292e-06, |
|
"loss": 0.234, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 1.9377777777777778, |
|
"grad_norm": 1.7264682966489493, |
|
"learning_rate": 3.3553020377905663e-06, |
|
"loss": 0.2242, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.9413333333333334, |
|
"grad_norm": 1.6744044298365783, |
|
"learning_rate": 3.3357774100175513e-06, |
|
"loss": 0.2245, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 1.944888888888889, |
|
"grad_norm": 1.4991747809315612, |
|
"learning_rate": 3.316281279178737e-06, |
|
"loss": 0.2114, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 1.9484444444444444, |
|
"grad_norm": 1.5141154002091217, |
|
"learning_rate": 3.296813979111655e-06, |
|
"loss": 0.2182, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 1.7580533484108005, |
|
"learning_rate": 3.2773758431601543e-06, |
|
"loss": 0.2234, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 1.9555555555555557, |
|
"grad_norm": 1.6014365241780455, |
|
"learning_rate": 3.257967204168705e-06, |
|
"loss": 0.238, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.9555555555555557, |
|
"eval_loss": 0.21176277101039886, |
|
"eval_runtime": 560.9255, |
|
"eval_samples_per_second": 17.828, |
|
"eval_steps_per_second": 4.457, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.959111111111111, |
|
"grad_norm": 1.566927102750067, |
|
"learning_rate": 3.2385883944766867e-06, |
|
"loss": 0.1932, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 1.9626666666666668, |
|
"grad_norm": 1.7041733469332605, |
|
"learning_rate": 3.2192397459127077e-06, |
|
"loss": 0.2194, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 1.966222222222222, |
|
"grad_norm": 1.7846179835205314, |
|
"learning_rate": 3.199921589788923e-06, |
|
"loss": 0.2092, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 1.9697777777777778, |
|
"grad_norm": 1.482707355318634, |
|
"learning_rate": 3.180634256895345e-06, |
|
"loss": 0.2328, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 1.9733333333333334, |
|
"grad_norm": 1.6559180099205715, |
|
"learning_rate": 3.161378077494205e-06, |
|
"loss": 0.234, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 1.976888888888889, |
|
"grad_norm": 1.4931797613124567, |
|
"learning_rate": 3.142153381314278e-06, |
|
"loss": 0.2285, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 1.9804444444444445, |
|
"grad_norm": 1.6899228150340497, |
|
"learning_rate": 3.122960497545242e-06, |
|
"loss": 0.2347, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 1.984, |
|
"grad_norm": 1.6112817535514066, |
|
"learning_rate": 3.103799754832045e-06, |
|
"loss": 0.2017, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 1.9875555555555555, |
|
"grad_norm": 1.4492842053913877, |
|
"learning_rate": 3.0846714812692774e-06, |
|
"loss": 0.2282, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 1.991111111111111, |
|
"grad_norm": 1.6227303784789882, |
|
"learning_rate": 3.065576004395546e-06, |
|
"loss": 0.2193, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.9946666666666668, |
|
"grad_norm": 1.6532339878737676, |
|
"learning_rate": 3.046513651187874e-06, |
|
"loss": 0.205, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 1.9982222222222221, |
|
"grad_norm": 1.726150455488493, |
|
"learning_rate": 3.027484748056101e-06, |
|
"loss": 0.2052, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 2.001777777777778, |
|
"grad_norm": 1.2491575364238943, |
|
"learning_rate": 3.008489620837287e-06, |
|
"loss": 0.1793, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 2.005333333333333, |
|
"grad_norm": 1.539466703681713, |
|
"learning_rate": 2.989528594790142e-06, |
|
"loss": 0.133, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 2.008888888888889, |
|
"grad_norm": 1.5201921987042595, |
|
"learning_rate": 2.97060199458945e-06, |
|
"loss": 0.1364, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 2.0124444444444443, |
|
"grad_norm": 1.8387836805686166, |
|
"learning_rate": 2.9517101443205143e-06, |
|
"loss": 0.138, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 2.016, |
|
"grad_norm": 1.6624452979538558, |
|
"learning_rate": 2.9328533674736043e-06, |
|
"loss": 0.1372, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 2.0195555555555558, |
|
"grad_norm": 2.0375067274701464, |
|
"learning_rate": 2.914031986938417e-06, |
|
"loss": 0.1376, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 2.023111111111111, |
|
"grad_norm": 1.5020388133691598, |
|
"learning_rate": 2.895246324998549e-06, |
|
"loss": 0.132, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 2.026666666666667, |
|
"grad_norm": 1.5200304354769367, |
|
"learning_rate": 2.8764967033259793e-06, |
|
"loss": 0.1332, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.030222222222222, |
|
"grad_norm": 1.615938242121572, |
|
"learning_rate": 2.8577834429755586e-06, |
|
"loss": 0.137, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 2.033777777777778, |
|
"grad_norm": 1.7244206202588588, |
|
"learning_rate": 2.839106864379512e-06, |
|
"loss": 0.1311, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 2.037333333333333, |
|
"grad_norm": 1.4204204890159835, |
|
"learning_rate": 2.8204672873419565e-06, |
|
"loss": 0.1359, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 2.040888888888889, |
|
"grad_norm": 1.641810724006462, |
|
"learning_rate": 2.8018650310334118e-06, |
|
"loss": 0.1524, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 2.0444444444444443, |
|
"grad_norm": 1.6197231294728873, |
|
"learning_rate": 2.783300413985359e-06, |
|
"loss": 0.1216, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 2.048, |
|
"grad_norm": 1.7166152973793496, |
|
"learning_rate": 2.764773754084763e-06, |
|
"loss": 0.1393, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 2.0515555555555554, |
|
"grad_norm": 1.7305108784705923, |
|
"learning_rate": 2.7462853685686362e-06, |
|
"loss": 0.1429, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 2.055111111111111, |
|
"grad_norm": 1.2910967057789844, |
|
"learning_rate": 2.7278355740186123e-06, |
|
"loss": 0.1336, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 2.058666666666667, |
|
"grad_norm": 1.5080611405633613, |
|
"learning_rate": 2.7094246863555262e-06, |
|
"loss": 0.1359, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 2.062222222222222, |
|
"grad_norm": 1.8733744454525603, |
|
"learning_rate": 2.691053020833988e-06, |
|
"loss": 0.1388, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.065777777777778, |
|
"grad_norm": 1.7085324740063759, |
|
"learning_rate": 2.6727208920370063e-06, |
|
"loss": 0.1355, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 2.0693333333333332, |
|
"grad_norm": 1.5576784710780245, |
|
"learning_rate": 2.6544286138705867e-06, |
|
"loss": 0.1328, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 2.072888888888889, |
|
"grad_norm": 1.9703710936721526, |
|
"learning_rate": 2.636176499558364e-06, |
|
"loss": 0.1354, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 2.0764444444444443, |
|
"grad_norm": 1.5952203119705437, |
|
"learning_rate": 2.6179648616362374e-06, |
|
"loss": 0.1493, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.9073156525645674, |
|
"learning_rate": 2.599794011947012e-06, |
|
"loss": 0.1579, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 2.0835555555555554, |
|
"grad_norm": 1.7695748236621889, |
|
"learning_rate": 2.581664261635069e-06, |
|
"loss": 0.1446, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 2.087111111111111, |
|
"grad_norm": 1.8880183020861152, |
|
"learning_rate": 2.5635759211410396e-06, |
|
"loss": 0.1406, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 2.0906666666666665, |
|
"grad_norm": 1.5198269240530051, |
|
"learning_rate": 2.545529300196472e-06, |
|
"loss": 0.1244, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 2.094222222222222, |
|
"grad_norm": 1.9355343365767825, |
|
"learning_rate": 2.527524707818547e-06, |
|
"loss": 0.1289, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 2.097777777777778, |
|
"grad_norm": 1.546102626213903, |
|
"learning_rate": 2.5095624523047775e-06, |
|
"loss": 0.1151, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.1013333333333333, |
|
"grad_norm": 1.3237810299249595, |
|
"learning_rate": 2.491642841227729e-06, |
|
"loss": 0.1386, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 2.104888888888889, |
|
"grad_norm": 1.6354432410587478, |
|
"learning_rate": 2.4737661814297557e-06, |
|
"loss": 0.1152, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 2.1084444444444443, |
|
"grad_norm": 1.7641939157921844, |
|
"learning_rate": 2.455932779017747e-06, |
|
"loss": 0.1267, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 2.112, |
|
"grad_norm": 1.7717956617877848, |
|
"learning_rate": 2.438142939357882e-06, |
|
"loss": 0.1468, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 2.1155555555555554, |
|
"grad_norm": 1.9248857260031529, |
|
"learning_rate": 2.4203969670704065e-06, |
|
"loss": 0.1426, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 2.119111111111111, |
|
"grad_norm": 1.6693083011986807, |
|
"learning_rate": 2.4026951660244063e-06, |
|
"loss": 0.1519, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 2.1226666666666665, |
|
"grad_norm": 1.4577868069815147, |
|
"learning_rate": 2.385037839332616e-06, |
|
"loss": 0.1449, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 2.1262222222222222, |
|
"grad_norm": 1.5757247401728414, |
|
"learning_rate": 2.3674252893462304e-06, |
|
"loss": 0.1508, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 2.129777777777778, |
|
"grad_norm": 1.798414953668795, |
|
"learning_rate": 2.3498578176497055e-06, |
|
"loss": 0.1336, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 2.1333333333333333, |
|
"grad_norm": 1.3502333712237125, |
|
"learning_rate": 2.3323357250556213e-06, |
|
"loss": 0.1289, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.1333333333333333, |
|
"eval_loss": 0.24109843373298645, |
|
"eval_runtime": 561.0318, |
|
"eval_samples_per_second": 17.824, |
|
"eval_steps_per_second": 4.456, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.136888888888889, |
|
"grad_norm": 1.6807098639484461, |
|
"learning_rate": 2.3148593115995155e-06, |
|
"loss": 0.1232, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 2.1404444444444444, |
|
"grad_norm": 1.3750693562838343, |
|
"learning_rate": 2.2974288765347484e-06, |
|
"loss": 0.1406, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 2.144, |
|
"grad_norm": 1.7740210796916787, |
|
"learning_rate": 2.280044718327383e-06, |
|
"loss": 0.1366, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 2.1475555555555554, |
|
"grad_norm": 1.3613431283259703, |
|
"learning_rate": 2.262707134651069e-06, |
|
"loss": 0.1347, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 2.151111111111111, |
|
"grad_norm": 1.5001232721911446, |
|
"learning_rate": 2.2454164223819443e-06, |
|
"loss": 0.1435, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 2.1546666666666665, |
|
"grad_norm": 1.6096086307058128, |
|
"learning_rate": 2.228172877593563e-06, |
|
"loss": 0.1248, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 2.1582222222222223, |
|
"grad_norm": 1.4625689431665512, |
|
"learning_rate": 2.2109767955518135e-06, |
|
"loss": 0.129, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 2.1617777777777776, |
|
"grad_norm": 1.7396993983427422, |
|
"learning_rate": 2.193828470709863e-06, |
|
"loss": 0.1259, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 2.1653333333333333, |
|
"grad_norm": 1.4423513554123952, |
|
"learning_rate": 2.176728196703122e-06, |
|
"loss": 0.1308, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 2.168888888888889, |
|
"grad_norm": 1.9920936118384482, |
|
"learning_rate": 2.159676266344222e-06, |
|
"loss": 0.1496, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.1724444444444444, |
|
"grad_norm": 2.13727569719491, |
|
"learning_rate": 2.142672971617978e-06, |
|
"loss": 0.1359, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 2.176, |
|
"grad_norm": 1.5724700258419562, |
|
"learning_rate": 2.125718603676413e-06, |
|
"loss": 0.1412, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 2.1795555555555555, |
|
"grad_norm": 1.3817720285663424, |
|
"learning_rate": 2.1088134528337635e-06, |
|
"loss": 0.1357, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 2.1831111111111112, |
|
"grad_norm": 1.6852270201894561, |
|
"learning_rate": 2.091957808561505e-06, |
|
"loss": 0.1388, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 2.1866666666666665, |
|
"grad_norm": 1.5752301082061768, |
|
"learning_rate": 2.0751519594834025e-06, |
|
"loss": 0.1359, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 2.1902222222222223, |
|
"grad_norm": 1.9588237176858065, |
|
"learning_rate": 2.058396193370556e-06, |
|
"loss": 0.1364, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 2.1937777777777776, |
|
"grad_norm": 1.5906028620881005, |
|
"learning_rate": 2.0416907971364937e-06, |
|
"loss": 0.1286, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 2.1973333333333334, |
|
"grad_norm": 1.6040127033831966, |
|
"learning_rate": 2.0250360568322395e-06, |
|
"loss": 0.132, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 2.2008888888888887, |
|
"grad_norm": 1.903945940065679, |
|
"learning_rate": 2.0084322576414205e-06, |
|
"loss": 0.1311, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 2.2044444444444444, |
|
"grad_norm": 1.7327408494603853, |
|
"learning_rate": 1.991879683875386e-06, |
|
"loss": 0.1412, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.208, |
|
"grad_norm": 1.6938104353348038, |
|
"learning_rate": 1.975378618968348e-06, |
|
"loss": 0.1358, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 2.2115555555555555, |
|
"grad_norm": 1.498102728760879, |
|
"learning_rate": 1.958929345472503e-06, |
|
"loss": 0.1272, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 2.2151111111111113, |
|
"grad_norm": 1.5061713395545921, |
|
"learning_rate": 1.942532145053219e-06, |
|
"loss": 0.1335, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 2.2186666666666666, |
|
"grad_norm": 1.8881968807558394, |
|
"learning_rate": 1.926187298484201e-06, |
|
"loss": 0.13, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 1.7409457044279315, |
|
"learning_rate": 1.9098950856426845e-06, |
|
"loss": 0.1197, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 2.2257777777777776, |
|
"grad_norm": 1.7410736866607524, |
|
"learning_rate": 1.893655785504644e-06, |
|
"loss": 0.136, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 2.2293333333333334, |
|
"grad_norm": 1.4673795329307866, |
|
"learning_rate": 1.8774696761400107e-06, |
|
"loss": 0.1351, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 2.2328888888888887, |
|
"grad_norm": 1.4286935284704283, |
|
"learning_rate": 1.8613370347079207e-06, |
|
"loss": 0.1316, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 2.2364444444444445, |
|
"grad_norm": 1.6752679462634348, |
|
"learning_rate": 1.845258137451968e-06, |
|
"loss": 0.1343, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.5334658674891999, |
|
"learning_rate": 1.8292332596954605e-06, |
|
"loss": 0.1252, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.2435555555555555, |
|
"grad_norm": 1.7816021858972186, |
|
"learning_rate": 1.8132626758367217e-06, |
|
"loss": 0.1373, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 2.2471111111111113, |
|
"grad_norm": 1.4751058571451898, |
|
"learning_rate": 1.7973466593443861e-06, |
|
"loss": 0.1238, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 2.2506666666666666, |
|
"grad_norm": 1.5737118263350949, |
|
"learning_rate": 1.7814854827527144e-06, |
|
"loss": 0.1331, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 2.2542222222222223, |
|
"grad_norm": 1.6723085510766795, |
|
"learning_rate": 1.7656794176569302e-06, |
|
"loss": 0.1392, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 2.2577777777777777, |
|
"grad_norm": 1.6074614963797307, |
|
"learning_rate": 1.749928734708568e-06, |
|
"loss": 0.1482, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 2.2613333333333334, |
|
"grad_norm": 1.514935517928495, |
|
"learning_rate": 1.734233703610838e-06, |
|
"loss": 0.1318, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 2.2648888888888887, |
|
"grad_norm": 2.1990045539686767, |
|
"learning_rate": 1.7185945931140086e-06, |
|
"loss": 0.1389, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 2.2684444444444445, |
|
"grad_norm": 1.7900402567821287, |
|
"learning_rate": 1.7030116710108068e-06, |
|
"loss": 0.1402, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 2.2720000000000002, |
|
"grad_norm": 1.5936415333953513, |
|
"learning_rate": 1.6874852041318246e-06, |
|
"loss": 0.1383, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 2.2755555555555556, |
|
"grad_norm": 1.6874167667097502, |
|
"learning_rate": 1.6720154583409642e-06, |
|
"loss": 0.1297, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.279111111111111, |
|
"grad_norm": 1.7461565673164665, |
|
"learning_rate": 1.6566026985308737e-06, |
|
"loss": 0.1265, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 2.2826666666666666, |
|
"grad_norm": 1.9943666083505533, |
|
"learning_rate": 1.6412471886184106e-06, |
|
"loss": 0.1433, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 2.2862222222222224, |
|
"grad_norm": 1.889269033390485, |
|
"learning_rate": 1.6259491915401322e-06, |
|
"loss": 0.1295, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 2.2897777777777777, |
|
"grad_norm": 1.9954192603921324, |
|
"learning_rate": 1.6107089692477856e-06, |
|
"loss": 0.1506, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 2.2933333333333334, |
|
"grad_norm": 1.73943513110269, |
|
"learning_rate": 1.5955267827038267e-06, |
|
"loss": 0.1309, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 2.2968888888888888, |
|
"grad_norm": 1.5696215992092173, |
|
"learning_rate": 1.5804028918769488e-06, |
|
"loss": 0.1245, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 2.3004444444444445, |
|
"grad_norm": 1.4480211516999386, |
|
"learning_rate": 1.5653375557376266e-06, |
|
"loss": 0.1419, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 2.304, |
|
"grad_norm": 1.7769598112511977, |
|
"learning_rate": 1.5503310322536962e-06, |
|
"loss": 0.1357, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 2.3075555555555556, |
|
"grad_norm": 1.6914490635403432, |
|
"learning_rate": 1.5353835783859244e-06, |
|
"loss": 0.1344, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 2.311111111111111, |
|
"grad_norm": 1.2896364219654397, |
|
"learning_rate": 1.5204954500836095e-06, |
|
"loss": 0.1336, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.311111111111111, |
|
"eval_loss": 0.2400493621826172, |
|
"eval_runtime": 562.3512, |
|
"eval_samples_per_second": 17.782, |
|
"eval_steps_per_second": 4.446, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.3146666666666667, |
|
"grad_norm": 1.6249516275302234, |
|
"learning_rate": 1.5056669022802051e-06, |
|
"loss": 0.1578, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 2.3182222222222224, |
|
"grad_norm": 1.5534728727358678, |
|
"learning_rate": 1.4908981888889562e-06, |
|
"loss": 0.1236, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 2.3217777777777777, |
|
"grad_norm": 2.305594450780404, |
|
"learning_rate": 1.4761895627985384e-06, |
|
"loss": 0.1437, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 2.3253333333333335, |
|
"grad_norm": 1.7525804358624415, |
|
"learning_rate": 1.461541275868742e-06, |
|
"loss": 0.1244, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 2.328888888888889, |
|
"grad_norm": 1.5857723879215653, |
|
"learning_rate": 1.4469535789261518e-06, |
|
"loss": 0.138, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 2.3324444444444445, |
|
"grad_norm": 1.4470785666281207, |
|
"learning_rate": 1.4324267217598543e-06, |
|
"loss": 0.1311, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 2.336, |
|
"grad_norm": 1.5783013529079604, |
|
"learning_rate": 1.41796095311716e-06, |
|
"loss": 0.1476, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 2.3395555555555556, |
|
"grad_norm": 1.792387189040966, |
|
"learning_rate": 1.4035565206993407e-06, |
|
"loss": 0.1313, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 2.343111111111111, |
|
"grad_norm": 2.0097219507066986, |
|
"learning_rate": 1.3892136711573983e-06, |
|
"loss": 0.1481, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 2.3466666666666667, |
|
"grad_norm": 1.6038575587094324, |
|
"learning_rate": 1.3749326500878308e-06, |
|
"loss": 0.1329, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 2.3502222222222224, |
|
"grad_norm": 1.8038941533229218, |
|
"learning_rate": 1.3607137020284267e-06, |
|
"loss": 0.1296, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 2.3537777777777777, |
|
"grad_norm": 1.5967517903597408, |
|
"learning_rate": 1.3465570704540877e-06, |
|
"loss": 0.1323, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 2.3573333333333335, |
|
"grad_norm": 1.6630671725280828, |
|
"learning_rate": 1.33246299777265e-06, |
|
"loss": 0.1353, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 2.360888888888889, |
|
"grad_norm": 1.6910996186336409, |
|
"learning_rate": 1.3184317253207379e-06, |
|
"loss": 0.1198, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 2.3644444444444446, |
|
"grad_norm": 1.667550829249205, |
|
"learning_rate": 1.3044634933596311e-06, |
|
"loss": 0.1398, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 2.368, |
|
"grad_norm": 1.3604264834299673, |
|
"learning_rate": 1.290558541071148e-06, |
|
"loss": 0.123, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 2.3715555555555556, |
|
"grad_norm": 1.4966865021721736, |
|
"learning_rate": 1.2767171065535538e-06, |
|
"loss": 0.1221, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 2.375111111111111, |
|
"grad_norm": 1.3751769981745194, |
|
"learning_rate": 1.2629394268174811e-06, |
|
"loss": 0.1398, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 2.3786666666666667, |
|
"grad_norm": 1.7552964254373993, |
|
"learning_rate": 1.2492257377818734e-06, |
|
"loss": 0.122, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 2.3822222222222225, |
|
"grad_norm": 1.984424873865648, |
|
"learning_rate": 1.235576274269938e-06, |
|
"loss": 0.1366, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 2.3857777777777778, |
|
"grad_norm": 1.8024296643627178, |
|
"learning_rate": 1.2219912700051417e-06, |
|
"loss": 0.1304, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 2.389333333333333, |
|
"grad_norm": 1.6704237658027163, |
|
"learning_rate": 1.2084709576071885e-06, |
|
"loss": 0.1339, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 2.392888888888889, |
|
"grad_norm": 1.8905223292433262, |
|
"learning_rate": 1.1950155685880504e-06, |
|
"loss": 0.138, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 2.3964444444444446, |
|
"grad_norm": 1.8585326052998994, |
|
"learning_rate": 1.1816253333479994e-06, |
|
"loss": 0.1402, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.4117751565900303, |
|
"learning_rate": 1.1683004811716597e-06, |
|
"loss": 0.1219, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 2.4035555555555557, |
|
"grad_norm": 2.177441304004068, |
|
"learning_rate": 1.1550412402240852e-06, |
|
"loss": 0.1472, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 2.407111111111111, |
|
"grad_norm": 1.7312870442889088, |
|
"learning_rate": 1.1418478375468496e-06, |
|
"loss": 0.14, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 2.4106666666666667, |
|
"grad_norm": 1.4691171208612808, |
|
"learning_rate": 1.1287204990541612e-06, |
|
"loss": 0.1382, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 2.414222222222222, |
|
"grad_norm": 1.9102821919207582, |
|
"learning_rate": 1.1156594495289923e-06, |
|
"loss": 0.1508, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 2.417777777777778, |
|
"grad_norm": 1.5765296328104144, |
|
"learning_rate": 1.1026649126192334e-06, |
|
"loss": 0.1244, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.421333333333333, |
|
"grad_norm": 1.485558878346715, |
|
"learning_rate": 1.0897371108338572e-06, |
|
"loss": 0.1262, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 2.424888888888889, |
|
"grad_norm": 1.6805947418795415, |
|
"learning_rate": 1.076876265539115e-06, |
|
"loss": 0.1397, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 2.4284444444444446, |
|
"grad_norm": 1.8439671145791727, |
|
"learning_rate": 1.0640825969547498e-06, |
|
"loss": 0.1298, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"grad_norm": 1.8675356289498493, |
|
"learning_rate": 1.051356324150209e-06, |
|
"loss": 0.1334, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 2.4355555555555557, |
|
"grad_norm": 2.097329265797065, |
|
"learning_rate": 1.0386976650409102e-06, |
|
"loss": 0.1342, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 2.439111111111111, |
|
"grad_norm": 1.7733262424549074, |
|
"learning_rate": 1.0261068363845034e-06, |
|
"loss": 0.1297, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 2.4426666666666668, |
|
"grad_norm": 1.7698885455909084, |
|
"learning_rate": 1.0135840537771574e-06, |
|
"loss": 0.1355, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 2.446222222222222, |
|
"grad_norm": 1.699595680180769, |
|
"learning_rate": 1.001129531649872e-06, |
|
"loss": 0.1255, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 2.449777777777778, |
|
"grad_norm": 1.8061641909036275, |
|
"learning_rate": 9.887434832647997e-07, |
|
"loss": 0.1355, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 2.453333333333333, |
|
"grad_norm": 1.8282679409791762, |
|
"learning_rate": 9.764261207116061e-07, |
|
"loss": 0.1437, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.456888888888889, |
|
"grad_norm": 1.8691781223789907, |
|
"learning_rate": 9.641776549038257e-07, |
|
"loss": 0.1274, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 2.4604444444444447, |
|
"grad_norm": 1.8720204975109627, |
|
"learning_rate": 9.519982955752549e-07, |
|
"loss": 0.1321, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 1.714725769185188, |
|
"learning_rate": 9.398882512763618e-07, |
|
"loss": 0.1299, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 2.4675555555555557, |
|
"grad_norm": 1.5736356325676821, |
|
"learning_rate": 9.278477293707189e-07, |
|
"loss": 0.1454, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 2.471111111111111, |
|
"grad_norm": 1.7235279739808778, |
|
"learning_rate": 9.158769360314412e-07, |
|
"loss": 0.1301, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 2.474666666666667, |
|
"grad_norm": 1.7964601353844663, |
|
"learning_rate": 9.039760762376665e-07, |
|
"loss": 0.1329, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 2.478222222222222, |
|
"grad_norm": 1.7113961505997257, |
|
"learning_rate": 8.921453537710406e-07, |
|
"loss": 0.1301, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 2.481777777777778, |
|
"grad_norm": 3.7247151362742708, |
|
"learning_rate": 8.803849712122292e-07, |
|
"loss": 0.1366, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 2.485333333333333, |
|
"grad_norm": 1.6042128553101094, |
|
"learning_rate": 8.686951299374474e-07, |
|
"loss": 0.1248, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 2.488888888888889, |
|
"grad_norm": 1.7566315817690532, |
|
"learning_rate": 8.570760301150166e-07, |
|
"loss": 0.1397, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.488888888888889, |
|
"eval_loss": 0.239632710814476, |
|
"eval_runtime": 563.0915, |
|
"eval_samples_per_second": 17.759, |
|
"eval_steps_per_second": 4.44, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.4924444444444447, |
|
"grad_norm": 1.915869222287072, |
|
"learning_rate": 8.455278707019255e-07, |
|
"loss": 0.133, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 2.496, |
|
"grad_norm": 1.4611242467498158, |
|
"learning_rate": 8.340508494404415e-07, |
|
"loss": 0.128, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 2.4995555555555553, |
|
"grad_norm": 1.8274207116893812, |
|
"learning_rate": 8.226451628547039e-07, |
|
"loss": 0.1304, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 2.503111111111111, |
|
"grad_norm": 1.5195837090357422, |
|
"learning_rate": 8.113110062473756e-07, |
|
"loss": 0.1337, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 2.506666666666667, |
|
"grad_norm": 1.534284195780538, |
|
"learning_rate": 8.000485736962899e-07, |
|
"loss": 0.1365, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 2.510222222222222, |
|
"grad_norm": 1.3874360730778557, |
|
"learning_rate": 7.888580580511307e-07, |
|
"loss": 0.1157, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 2.513777777777778, |
|
"grad_norm": 1.347897014568791, |
|
"learning_rate": 7.777396509301278e-07, |
|
"loss": 0.1258, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 2.517333333333333, |
|
"grad_norm": 1.5444960857241712, |
|
"learning_rate": 7.666935427167777e-07, |
|
"loss": 0.1261, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 2.520888888888889, |
|
"grad_norm": 1.5787802499569878, |
|
"learning_rate": 7.557199225565848e-07, |
|
"loss": 0.1353, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 2.5244444444444447, |
|
"grad_norm": 1.6575537900928325, |
|
"learning_rate": 7.448189783538184e-07, |
|
"loss": 0.1223, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.528, |
|
"grad_norm": 1.58456318992188, |
|
"learning_rate": 7.339908967683007e-07, |
|
"loss": 0.1227, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 2.5315555555555553, |
|
"grad_norm": 1.916341417565209, |
|
"learning_rate": 7.232358632122022e-07, |
|
"loss": 0.1365, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 2.535111111111111, |
|
"grad_norm": 2.009648842498942, |
|
"learning_rate": 7.125540618468784e-07, |
|
"loss": 0.1435, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 2.538666666666667, |
|
"grad_norm": 1.2589650678388224, |
|
"learning_rate": 7.019456755797083e-07, |
|
"loss": 0.1333, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 2.542222222222222, |
|
"grad_norm": 1.534526581817288, |
|
"learning_rate": 6.914108860609608e-07, |
|
"loss": 0.1372, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 2.545777777777778, |
|
"grad_norm": 1.5742622053962463, |
|
"learning_rate": 6.809498736806919e-07, |
|
"loss": 0.135, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 2.5493333333333332, |
|
"grad_norm": 1.876907152948741, |
|
"learning_rate": 6.705628175656498e-07, |
|
"loss": 0.1304, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 2.552888888888889, |
|
"grad_norm": 1.7507039554831174, |
|
"learning_rate": 6.602498955762105e-07, |
|
"loss": 0.1361, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 2.5564444444444443, |
|
"grad_norm": 1.5168112309443524, |
|
"learning_rate": 6.500112843033313e-07, |
|
"loss": 0.1235, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.366857399391539, |
|
"learning_rate": 6.39847159065523e-07, |
|
"loss": 0.1268, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.5635555555555554, |
|
"grad_norm": 1.7472209117726187, |
|
"learning_rate": 6.297576939058586e-07, |
|
"loss": 0.1338, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 2.567111111111111, |
|
"grad_norm": 1.5771285823832333, |
|
"learning_rate": 6.197430615889838e-07, |
|
"loss": 0.1304, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 2.570666666666667, |
|
"grad_norm": 1.5122386895026887, |
|
"learning_rate": 6.098034335981573e-07, |
|
"loss": 0.1255, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 2.574222222222222, |
|
"grad_norm": 1.5101320862852827, |
|
"learning_rate": 5.999389801323219e-07, |
|
"loss": 0.128, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 2.5777777777777775, |
|
"grad_norm": 1.751375058176443, |
|
"learning_rate": 5.901498701031894e-07, |
|
"loss": 0.131, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 2.5813333333333333, |
|
"grad_norm": 1.5370110538793642, |
|
"learning_rate": 5.804362711323391e-07, |
|
"loss": 0.1273, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 2.584888888888889, |
|
"grad_norm": 1.5422190674222276, |
|
"learning_rate": 5.707983495483593e-07, |
|
"loss": 0.122, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 2.5884444444444443, |
|
"grad_norm": 1.8111593254497258, |
|
"learning_rate": 5.612362703839907e-07, |
|
"loss": 0.1308, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 2.592, |
|
"grad_norm": 1.7898287718649462, |
|
"learning_rate": 5.517501973733059e-07, |
|
"loss": 0.1239, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 2.5955555555555554, |
|
"grad_norm": 1.5741550714022359, |
|
"learning_rate": 5.423402929489019e-07, |
|
"loss": 0.1242, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.599111111111111, |
|
"grad_norm": 1.7431025808198797, |
|
"learning_rate": 5.330067182391219e-07, |
|
"loss": 0.1258, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 2.602666666666667, |
|
"grad_norm": 1.669472703725672, |
|
"learning_rate": 5.237496330652925e-07, |
|
"loss": 0.1318, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 2.606222222222222, |
|
"grad_norm": 1.7086096850592123, |
|
"learning_rate": 5.145691959389932e-07, |
|
"loss": 0.1292, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 2.6097777777777775, |
|
"grad_norm": 1.79780883791639, |
|
"learning_rate": 5.054655640593325e-07, |
|
"loss": 0.1446, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 2.6133333333333333, |
|
"grad_norm": 1.760230682240199, |
|
"learning_rate": 4.964388933102666e-07, |
|
"loss": 0.1418, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 2.616888888888889, |
|
"grad_norm": 1.540197801989686, |
|
"learning_rate": 4.874893382579232e-07, |
|
"loss": 0.1269, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 2.6204444444444444, |
|
"grad_norm": 1.7177370855999565, |
|
"learning_rate": 4.786170521479588e-07, |
|
"loss": 0.1223, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 2.624, |
|
"grad_norm": 1.881294576905093, |
|
"learning_rate": 4.698221869029307e-07, |
|
"loss": 0.1443, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 2.6275555555555554, |
|
"grad_norm": 1.74196972034532, |
|
"learning_rate": 4.6110489311969876e-07, |
|
"loss": 0.1429, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 2.631111111111111, |
|
"grad_norm": 1.5651241374342044, |
|
"learning_rate": 4.524653200668461e-07, |
|
"loss": 0.1264, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.634666666666667, |
|
"grad_norm": 1.8251309622054404, |
|
"learning_rate": 4.439036156821225e-07, |
|
"loss": 0.1213, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 2.6382222222222222, |
|
"grad_norm": 1.4351427368380598, |
|
"learning_rate": 4.3541992656991163e-07, |
|
"loss": 0.1182, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 2.6417777777777776, |
|
"grad_norm": 1.9769377027322241, |
|
"learning_rate": 4.2701439799871847e-07, |
|
"loss": 0.1453, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 2.6453333333333333, |
|
"grad_norm": 1.6755217149463195, |
|
"learning_rate": 4.1868717389868694e-07, |
|
"loss": 0.1284, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 2.648888888888889, |
|
"grad_norm": 1.4882784431490907, |
|
"learning_rate": 4.1043839685913135e-07, |
|
"loss": 0.1289, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 2.6524444444444444, |
|
"grad_norm": 1.2678152146637376, |
|
"learning_rate": 4.022682081260942e-07, |
|
"loss": 0.122, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 2.656, |
|
"grad_norm": 1.7036091433400906, |
|
"learning_rate": 3.941767475999297e-07, |
|
"loss": 0.1292, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 2.6595555555555555, |
|
"grad_norm": 2.0073020304210485, |
|
"learning_rate": 3.8616415383291083e-07, |
|
"loss": 0.1281, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 2.663111111111111, |
|
"grad_norm": 1.7003882572239488, |
|
"learning_rate": 3.7823056402684856e-07, |
|
"loss": 0.1205, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 1.8649824143158358, |
|
"learning_rate": 3.70376114030751e-07, |
|
"loss": 0.1405, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"eval_loss": 0.2399507761001587, |
|
"eval_runtime": 561.5965, |
|
"eval_samples_per_second": 17.806, |
|
"eval_steps_per_second": 4.452, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.6702222222222223, |
|
"grad_norm": 1.778861851144716, |
|
"learning_rate": 3.626009383384926e-07, |
|
"loss": 0.1424, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 2.6737777777777776, |
|
"grad_norm": 1.7506343466298935, |
|
"learning_rate": 3.549051700865136e-07, |
|
"loss": 0.1242, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 2.6773333333333333, |
|
"grad_norm": 1.5579333925843626, |
|
"learning_rate": 3.47288941051539e-07, |
|
"loss": 0.125, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 2.680888888888889, |
|
"grad_norm": 2.030096385748008, |
|
"learning_rate": 3.3975238164831893e-07, |
|
"loss": 0.1253, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 2.6844444444444444, |
|
"grad_norm": 1.635535994621638, |
|
"learning_rate": 3.322956209274031e-07, |
|
"loss": 0.1322, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 2.6879999999999997, |
|
"grad_norm": 1.7329277515156414, |
|
"learning_rate": 3.2491878657292643e-07, |
|
"loss": 0.1355, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 2.6915555555555555, |
|
"grad_norm": 1.7444157426686764, |
|
"learning_rate": 3.176220049004197e-07, |
|
"loss": 0.1179, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 2.6951111111111112, |
|
"grad_norm": 1.3483728954452034, |
|
"learning_rate": 3.104054008546525e-07, |
|
"loss": 0.1338, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 2.6986666666666665, |
|
"grad_norm": 1.3906620863471058, |
|
"learning_rate": 3.032690980074915e-07, |
|
"loss": 0.131, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 2.7022222222222223, |
|
"grad_norm": 1.8327466893042572, |
|
"learning_rate": 2.962132185557826e-07, |
|
"loss": 0.1223, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.7057777777777776, |
|
"grad_norm": 1.5547638545825841, |
|
"learning_rate": 2.892378833192611e-07, |
|
"loss": 0.1282, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 2.7093333333333334, |
|
"grad_norm": 1.804096897597165, |
|
"learning_rate": 2.823432117384822e-07, |
|
"loss": 0.1321, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 2.712888888888889, |
|
"grad_norm": 1.5920189474841397, |
|
"learning_rate": 2.755293218727739e-07, |
|
"loss": 0.1266, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 2.7164444444444444, |
|
"grad_norm": 1.95119518386987, |
|
"learning_rate": 2.6879633039821994e-07, |
|
"loss": 0.1356, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 1.8385230420520196, |
|
"learning_rate": 2.62144352605655e-07, |
|
"loss": 0.1262, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 2.7235555555555555, |
|
"grad_norm": 1.7885799872230752, |
|
"learning_rate": 2.555735023986966e-07, |
|
"loss": 0.1315, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 2.7271111111111113, |
|
"grad_norm": 1.8941729319880476, |
|
"learning_rate": 2.4908389229179484e-07, |
|
"loss": 0.1179, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 2.7306666666666666, |
|
"grad_norm": 1.5725333890356554, |
|
"learning_rate": 2.4267563340830026e-07, |
|
"loss": 0.1122, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 2.7342222222222223, |
|
"grad_norm": 1.9949059298619423, |
|
"learning_rate": 2.363488354785648e-07, |
|
"loss": 0.1372, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 2.7377777777777776, |
|
"grad_norm": 1.706241835042834, |
|
"learning_rate": 2.301036068380641e-07, |
|
"loss": 0.1303, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.7413333333333334, |
|
"grad_norm": 1.5015166048586166, |
|
"learning_rate": 2.239400544255399e-07, |
|
"loss": 0.121, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 2.744888888888889, |
|
"grad_norm": 1.69358016809196, |
|
"learning_rate": 2.178582837811688e-07, |
|
"loss": 0.1249, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 2.7484444444444445, |
|
"grad_norm": 1.9732967017351475, |
|
"learning_rate": 2.1185839904475869e-07, |
|
"loss": 0.133, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 2.752, |
|
"grad_norm": 1.5594363807881604, |
|
"learning_rate": 2.0594050295395852e-07, |
|
"loss": 0.1304, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 2.7555555555555555, |
|
"grad_norm": 2.026099043557669, |
|
"learning_rate": 2.0010469684250856e-07, |
|
"loss": 0.1385, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 2.7591111111111113, |
|
"grad_norm": 1.5917173969753626, |
|
"learning_rate": 1.9435108063849684e-07, |
|
"loss": 0.1365, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 2.7626666666666666, |
|
"grad_norm": 1.7387563784538043, |
|
"learning_rate": 1.8867975286265106e-07, |
|
"loss": 0.1278, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 2.7662222222222224, |
|
"grad_norm": 1.491992475001642, |
|
"learning_rate": 1.830908106266538e-07, |
|
"loss": 0.1169, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 2.7697777777777777, |
|
"grad_norm": 1.8209635910179756, |
|
"learning_rate": 1.7758434963147665e-07, |
|
"loss": 0.143, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 2.7733333333333334, |
|
"grad_norm": 1.6054626426110197, |
|
"learning_rate": 1.7216046416574316e-07, |
|
"loss": 0.1335, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.7768888888888887, |
|
"grad_norm": 1.6151516199907796, |
|
"learning_rate": 1.66819247104113e-07, |
|
"loss": 0.1338, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 2.7804444444444445, |
|
"grad_norm": 1.9698941742198866, |
|
"learning_rate": 1.6156078990569313e-07, |
|
"loss": 0.1203, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 2.784, |
|
"grad_norm": 1.6305672042666572, |
|
"learning_rate": 1.563851826124696e-07, |
|
"loss": 0.1216, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 2.7875555555555556, |
|
"grad_norm": 1.0194788026355706, |
|
"learning_rate": 1.5129251384776998e-07, |
|
"loss": 0.1181, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 2.7911111111111113, |
|
"grad_norm": 1.7073067625712353, |
|
"learning_rate": 1.462828708147379e-07, |
|
"loss": 0.139, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 2.7946666666666666, |
|
"grad_norm": 1.4957713592543374, |
|
"learning_rate": 1.4135633929485026e-07, |
|
"loss": 0.1373, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 2.7982222222222224, |
|
"grad_norm": 1.6268976958462047, |
|
"learning_rate": 1.3651300364644126e-07, |
|
"loss": 0.1294, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 2.8017777777777777, |
|
"grad_norm": 1.3636030825381604, |
|
"learning_rate": 1.317529468032569e-07, |
|
"loss": 0.1158, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 2.8053333333333335, |
|
"grad_norm": 1.5147346477252843, |
|
"learning_rate": 1.2707625027304104e-07, |
|
"loss": 0.124, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 2.8088888888888888, |
|
"grad_norm": 1.7193516342629052, |
|
"learning_rate": 1.2248299413613607e-07, |
|
"loss": 0.1332, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.8124444444444445, |
|
"grad_norm": 1.6484553937509365, |
|
"learning_rate": 1.1797325704411e-07, |
|
"loss": 0.1214, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 2.816, |
|
"grad_norm": 1.6919284405549642, |
|
"learning_rate": 1.1354711621841208e-07, |
|
"loss": 0.133, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 2.8195555555555556, |
|
"grad_norm": 1.223501357852658, |
|
"learning_rate": 1.0920464744905157e-07, |
|
"loss": 0.1205, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 2.8231111111111113, |
|
"grad_norm": 1.5481520280664143, |
|
"learning_rate": 1.0494592509329716e-07, |
|
"loss": 0.1469, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 2.8266666666666667, |
|
"grad_norm": 1.7879544199201751, |
|
"learning_rate": 1.007710220744057e-07, |
|
"loss": 0.1269, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 2.830222222222222, |
|
"grad_norm": 1.513993378655108, |
|
"learning_rate": 9.668000988037163e-08, |
|
"loss": 0.1322, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 2.8337777777777777, |
|
"grad_norm": 1.7964467427017516, |
|
"learning_rate": 9.267295856270509e-08, |
|
"loss": 0.1354, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 2.8373333333333335, |
|
"grad_norm": 1.787987364521523, |
|
"learning_rate": 8.874993673523236e-08, |
|
"loss": 0.1319, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 2.840888888888889, |
|
"grad_norm": 1.6897870372176325, |
|
"learning_rate": 8.491101157291737e-08, |
|
"loss": 0.1274, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 2.8444444444444446, |
|
"grad_norm": 1.6105609971746402, |
|
"learning_rate": 8.115624881071594e-08, |
|
"loss": 0.1318, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.8444444444444446, |
|
"eval_loss": 0.23905394971370697, |
|
"eval_runtime": 559.7682, |
|
"eval_samples_per_second": 17.865, |
|
"eval_steps_per_second": 4.466, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.848, |
|
"grad_norm": 1.3881391902801445, |
|
"learning_rate": 7.748571274244776e-08, |
|
"loss": 0.1199, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 2.8515555555555556, |
|
"grad_norm": 1.8275543306577795, |
|
"learning_rate": 7.389946621969679e-08, |
|
"loss": 0.1494, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 2.8551111111111114, |
|
"grad_norm": 1.8960525825598256, |
|
"learning_rate": 7.039757065073316e-08, |
|
"loss": 0.1354, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 2.8586666666666667, |
|
"grad_norm": 1.6485916403071794, |
|
"learning_rate": 6.698008599946404e-08, |
|
"loss": 0.1246, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 2.862222222222222, |
|
"grad_norm": 1.2435705558011503, |
|
"learning_rate": 6.364707078440335e-08, |
|
"loss": 0.1266, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 2.8657777777777778, |
|
"grad_norm": 1.5746164801301799, |
|
"learning_rate": 6.039858207767479e-08, |
|
"loss": 0.134, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 2.8693333333333335, |
|
"grad_norm": 1.5169697571883205, |
|
"learning_rate": 5.723467550403039e-08, |
|
"loss": 0.1326, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 2.872888888888889, |
|
"grad_norm": 1.5881237505008923, |
|
"learning_rate": 5.4155405239897926e-08, |
|
"loss": 0.1488, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 2.8764444444444446, |
|
"grad_norm": 1.690061086159581, |
|
"learning_rate": 5.1160824012458367e-08, |
|
"loss": 0.1232, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.6253293072576216, |
|
"learning_rate": 4.825098309873544e-08, |
|
"loss": 0.1264, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.8835555555555556, |
|
"grad_norm": 1.8528993602738453, |
|
"learning_rate": 4.542593232472414e-08, |
|
"loss": 0.1328, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 2.887111111111111, |
|
"grad_norm": 1.949296952991108, |
|
"learning_rate": 4.268572006453364e-08, |
|
"loss": 0.1264, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 2.8906666666666667, |
|
"grad_norm": 1.5505902041733666, |
|
"learning_rate": 4.003039323956126e-08, |
|
"loss": 0.1308, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 2.894222222222222, |
|
"grad_norm": 0.9023008663346067, |
|
"learning_rate": 3.7459997317687014e-08, |
|
"loss": 0.1101, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 2.897777777777778, |
|
"grad_norm": 1.8468547733058307, |
|
"learning_rate": 3.4974576312497564e-08, |
|
"loss": 0.1249, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 2.9013333333333335, |
|
"grad_norm": 1.7056102650658924, |
|
"learning_rate": 3.25741727825285e-08, |
|
"loss": 0.1193, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 2.904888888888889, |
|
"grad_norm": 1.3690587953613977, |
|
"learning_rate": 3.025882783054046e-08, |
|
"loss": 0.1199, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 2.9084444444444446, |
|
"grad_norm": 1.3946208158917515, |
|
"learning_rate": 2.8028581102811924e-08, |
|
"loss": 0.1365, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 2.912, |
|
"grad_norm": 1.9644328667604294, |
|
"learning_rate": 2.588347078846254e-08, |
|
"loss": 0.1323, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 2.9155555555555557, |
|
"grad_norm": 1.7619431494028974, |
|
"learning_rate": 2.382353361879586e-08, |
|
"loss": 0.1244, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.919111111111111, |
|
"grad_norm": 1.6739735252712569, |
|
"learning_rate": 2.18488048666754e-08, |
|
"loss": 0.1241, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 2.9226666666666667, |
|
"grad_norm": 1.7618267751958017, |
|
"learning_rate": 1.995931834591569e-08, |
|
"loss": 0.132, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 2.926222222222222, |
|
"grad_norm": 1.5149144065240054, |
|
"learning_rate": 1.8155106410706613e-08, |
|
"loss": 0.1359, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 2.929777777777778, |
|
"grad_norm": 1.7464428231188038, |
|
"learning_rate": 1.6436199955057742e-08, |
|
"loss": 0.1477, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 2.9333333333333336, |
|
"grad_norm": 1.7961519057796862, |
|
"learning_rate": 1.480262841226987e-08, |
|
"loss": 0.1482, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 2.936888888888889, |
|
"grad_norm": 1.668237688338044, |
|
"learning_rate": 1.3254419754430981e-08, |
|
"loss": 0.1369, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 2.940444444444444, |
|
"grad_norm": 1.5710565780518715, |
|
"learning_rate": 1.1791600491937172e-08, |
|
"loss": 0.1265, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 2.944, |
|
"grad_norm": 1.6190463651101816, |
|
"learning_rate": 1.041419567303914e-08, |
|
"loss": 0.1233, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 2.9475555555555557, |
|
"grad_norm": 1.3359272700606026, |
|
"learning_rate": 9.12222888341252e-09, |
|
"loss": 0.1308, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 2.951111111111111, |
|
"grad_norm": 1.7965214936961842, |
|
"learning_rate": 7.915722245754876e-09, |
|
"loss": 0.141, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.9546666666666668, |
|
"grad_norm": 1.7433994283889143, |
|
"learning_rate": 6.7946964194059994e-09, |
|
"loss": 0.1493, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 2.958222222222222, |
|
"grad_norm": 1.6666804006077884, |
|
"learning_rate": 5.759170599994868e-09, |
|
"loss": 0.1284, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 2.961777777777778, |
|
"grad_norm": 1.4232443691197452, |
|
"learning_rate": 4.809162519110455e-09, |
|
"loss": 0.1231, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 2.9653333333333336, |
|
"grad_norm": 1.8464380977109713, |
|
"learning_rate": 3.944688443998646e-09, |
|
"loss": 0.1466, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 2.968888888888889, |
|
"grad_norm": 1.8474020149086245, |
|
"learning_rate": 3.16576317728301e-09, |
|
"loss": 0.126, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 2.9724444444444442, |
|
"grad_norm": 1.731427281949659, |
|
"learning_rate": 2.4724000567116768e-09, |
|
"loss": 0.1361, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 2.976, |
|
"grad_norm": 1.8993388895043506, |
|
"learning_rate": 1.86461095492918e-09, |
|
"loss": 0.1258, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 2.9795555555555557, |
|
"grad_norm": 1.676714063923629, |
|
"learning_rate": 1.3424062792738445e-09, |
|
"loss": 0.1311, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 2.983111111111111, |
|
"grad_norm": 1.714542756833673, |
|
"learning_rate": 9.057949715968183e-10, |
|
"loss": 0.1236, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 2.986666666666667, |
|
"grad_norm": 1.6829258625832335, |
|
"learning_rate": 5.547845081121939e-10, |
|
"loss": 0.1171, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.990222222222222, |
|
"grad_norm": 1.5917279579386703, |
|
"learning_rate": 2.89380899267111e-10, |
|
"loss": 0.1309, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 2.993777777777778, |
|
"grad_norm": 1.72982950263424, |
|
"learning_rate": 1.0958868963906188e-10, |
|
"loss": 0.1314, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 2.997333333333333, |
|
"grad_norm": 1.6121134095652765, |
|
"learning_rate": 1.541095785984048e-11, |
|
"loss": 0.1267, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 2.9994666666666667, |
|
"step": 8436, |
|
"total_flos": 621656373067776.0, |
|
"train_loss": 0.25353994178455436, |
|
"train_runtime": 39823.1973, |
|
"train_samples_per_second": 6.78, |
|
"train_steps_per_second": 0.212 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 8436, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 621656373067776.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|