|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.994262174488745, |
|
"eval_steps": 500, |
|
"global_step": 1272, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02353979696925114, |
|
"grad_norm": 8.782275747399924, |
|
"learning_rate": 3.846153846153846e-06, |
|
"loss": 1.5306, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04707959393850228, |
|
"grad_norm": 4.3210560861300085, |
|
"learning_rate": 7.692307692307692e-06, |
|
"loss": 0.8302, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07061939090775342, |
|
"grad_norm": 3.023277309045724, |
|
"learning_rate": 1.153846153846154e-05, |
|
"loss": 0.6217, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09415918787700456, |
|
"grad_norm": 2.654024929474284, |
|
"learning_rate": 1.4999975655288908e-05, |
|
"loss": 0.5967, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1176989848462557, |
|
"grad_norm": 2.8603296276880723, |
|
"learning_rate": 1.4997054481186253e-05, |
|
"loss": 0.5732, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14123878181550684, |
|
"grad_norm": 2.4252498205382347, |
|
"learning_rate": 1.4989266537738068e-05, |
|
"loss": 0.5573, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.164778578784758, |
|
"grad_norm": 2.0008212035639787, |
|
"learning_rate": 1.4976616880546462e-05, |
|
"loss": 0.5488, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.18831837575400912, |
|
"grad_norm": 2.1468051355877065, |
|
"learning_rate": 1.4959113721231613e-05, |
|
"loss": 0.5378, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.21185817272326027, |
|
"grad_norm": 2.09758107733118, |
|
"learning_rate": 1.4936768422101134e-05, |
|
"loss": 0.524, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2353979696925114, |
|
"grad_norm": 1.9891368777701572, |
|
"learning_rate": 1.4909595488774145e-05, |
|
"loss": 0.5238, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25893776666176255, |
|
"grad_norm": 1.8856649935655327, |
|
"learning_rate": 1.487761256076484e-05, |
|
"loss": 0.51, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2824775636310137, |
|
"grad_norm": 1.9946450768120385, |
|
"learning_rate": 1.4840840400031667e-05, |
|
"loss": 0.5045, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3060173606002648, |
|
"grad_norm": 2.02887277527658, |
|
"learning_rate": 1.4799302877499543e-05, |
|
"loss": 0.4993, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.329557157569516, |
|
"grad_norm": 1.7846741494593932, |
|
"learning_rate": 1.475302695756387e-05, |
|
"loss": 0.4959, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3530969545387671, |
|
"grad_norm": 1.7910316946805966, |
|
"learning_rate": 1.4702042680586378e-05, |
|
"loss": 0.4903, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.37663675150801823, |
|
"grad_norm": 1.583559495202325, |
|
"learning_rate": 1.4646383143394222e-05, |
|
"loss": 0.4852, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.40017654847726936, |
|
"grad_norm": 1.8258988651876884, |
|
"learning_rate": 1.458608447779491e-05, |
|
"loss": 0.4835, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.42371634544652054, |
|
"grad_norm": 1.7923596937902033, |
|
"learning_rate": 1.4521185827121071e-05, |
|
"loss": 0.474, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.44725614241577166, |
|
"grad_norm": 1.8160198247756183, |
|
"learning_rate": 1.4451729320820273e-05, |
|
"loss": 0.4721, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4707959393850228, |
|
"grad_norm": 1.7150789518558318, |
|
"learning_rate": 1.437776004710637e-05, |
|
"loss": 0.4729, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.49433573635427397, |
|
"grad_norm": 1.7400033525862848, |
|
"learning_rate": 1.4299326023690146e-05, |
|
"loss": 0.4607, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5178755333235251, |
|
"grad_norm": 1.7002289663882648, |
|
"learning_rate": 1.4216478166608262e-05, |
|
"loss": 0.4613, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5414153302927762, |
|
"grad_norm": 1.534972777417761, |
|
"learning_rate": 1.4129270257170722e-05, |
|
"loss": 0.4567, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5649551272620273, |
|
"grad_norm": 1.6489960881148158, |
|
"learning_rate": 1.403775890704834e-05, |
|
"loss": 0.4559, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5884949242312785, |
|
"grad_norm": 1.7846842786992185, |
|
"learning_rate": 1.3942003521522837e-05, |
|
"loss": 0.4624, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6120347212005296, |
|
"grad_norm": 1.4351473554704604, |
|
"learning_rate": 1.3842066260923462e-05, |
|
"loss": 0.4496, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6355745181697808, |
|
"grad_norm": 1.541683257911224, |
|
"learning_rate": 1.3738012000275142e-05, |
|
"loss": 0.4516, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.659114315139032, |
|
"grad_norm": 1.5879015034427175, |
|
"learning_rate": 1.3629908287184381e-05, |
|
"loss": 0.4475, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6826541121082831, |
|
"grad_norm": 1.728309878285744, |
|
"learning_rate": 1.3517825297990205e-05, |
|
"loss": 0.4446, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7061939090775342, |
|
"grad_norm": 1.4690146169046678, |
|
"learning_rate": 1.340183579220868e-05, |
|
"loss": 0.4491, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7297337060467853, |
|
"grad_norm": 1.4969895208337947, |
|
"learning_rate": 1.328201506530052e-05, |
|
"loss": 0.4436, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7532735030160365, |
|
"grad_norm": 1.4385449709842155, |
|
"learning_rate": 1.3158440899792465e-05, |
|
"loss": 0.4344, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7768132999852876, |
|
"grad_norm": 1.4819229027772451, |
|
"learning_rate": 1.3031193514784178e-05, |
|
"loss": 0.4387, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8003530969545387, |
|
"grad_norm": 1.5138328167674073, |
|
"learning_rate": 1.2900355513873408e-05, |
|
"loss": 0.4374, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.82389289392379, |
|
"grad_norm": 1.5486731806719658, |
|
"learning_rate": 1.276601183153324e-05, |
|
"loss": 0.4268, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8474326908930411, |
|
"grad_norm": 1.3966287731811042, |
|
"learning_rate": 1.2628249677976246e-05, |
|
"loss": 0.4222, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8709724878622922, |
|
"grad_norm": 1.4851117277296138, |
|
"learning_rate": 1.2487158482541324e-05, |
|
"loss": 0.4304, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.8945122848315433, |
|
"grad_norm": 1.4886042812511937, |
|
"learning_rate": 1.2342829835639957e-05, |
|
"loss": 0.4327, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9180520818007945, |
|
"grad_norm": 1.5916975841301912, |
|
"learning_rate": 1.219535742929962e-05, |
|
"loss": 0.4312, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9415918787700456, |
|
"grad_norm": 1.4359537454947529, |
|
"learning_rate": 1.204483699634289e-05, |
|
"loss": 0.418, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9651316757392967, |
|
"grad_norm": 1.5039520247360716, |
|
"learning_rate": 1.1891366248241758e-05, |
|
"loss": 0.4169, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9886714727085479, |
|
"grad_norm": 1.4396432279411446, |
|
"learning_rate": 1.1735044811687508e-05, |
|
"loss": 0.4165, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.012211269677799, |
|
"grad_norm": 1.4837321963480583, |
|
"learning_rate": 1.1575974163917282e-05, |
|
"loss": 0.3505, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.0357510666470502, |
|
"grad_norm": 1.438403278814582, |
|
"learning_rate": 1.1414257566839376e-05, |
|
"loss": 0.2795, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.0592908636163012, |
|
"grad_norm": 1.510877062877757, |
|
"learning_rate": 1.125e-05, |
|
"loss": 0.2814, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0828306605855524, |
|
"grad_norm": 1.519349451859131, |
|
"learning_rate": 1.1083308092435006e-05, |
|
"loss": 0.2773, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1063704575548037, |
|
"grad_norm": 1.3106222683180944, |
|
"learning_rate": 1.091429005345085e-05, |
|
"loss": 0.2706, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.1299102545240547, |
|
"grad_norm": 1.3984600885535579, |
|
"learning_rate": 1.0743055602379712e-05, |
|
"loss": 0.2754, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.153450051493306, |
|
"grad_norm": 1.4833593619381575, |
|
"learning_rate": 1.0569715897354354e-05, |
|
"loss": 0.2756, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.176989848462557, |
|
"grad_norm": 1.3932629119237105, |
|
"learning_rate": 1.0394383463148993e-05, |
|
"loss": 0.27, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2005296454318082, |
|
"grad_norm": 1.4062493836827707, |
|
"learning_rate": 1.0217172118132994e-05, |
|
"loss": 0.2703, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.2240694424010594, |
|
"grad_norm": 1.4874549162201474, |
|
"learning_rate": 1.003819690038481e-05, |
|
"loss": 0.2739, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.2476092393703104, |
|
"grad_norm": 1.3708361425521698, |
|
"learning_rate": 9.857573993014161e-06, |
|
"loss": 0.2754, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.2711490363395614, |
|
"grad_norm": 1.3978327948674458, |
|
"learning_rate": 9.675420648740886e-06, |
|
"loss": 0.2692, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.2946888333088127, |
|
"grad_norm": 1.4715538000650579, |
|
"learning_rate": 9.491855113779456e-06, |
|
"loss": 0.2694, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.318228630278064, |
|
"grad_norm": 1.3468394374097452, |
|
"learning_rate": 9.306996551078556e-06, |
|
"loss": 0.2678, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.341768427247315, |
|
"grad_norm": 1.5175300555093134, |
|
"learning_rate": 9.120964962965565e-06, |
|
"loss": 0.266, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.3653082242165662, |
|
"grad_norm": 1.317724218624981, |
|
"learning_rate": 8.933881113246134e-06, |
|
"loss": 0.2666, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.3888480211858174, |
|
"grad_norm": 1.3809839276376867, |
|
"learning_rate": 8.74586644880946e-06, |
|
"loss": 0.2672, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.4123878181550684, |
|
"grad_norm": 1.3120307514577167, |
|
"learning_rate": 8.557043020790113e-06, |
|
"loss": 0.2671, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.4359276151243194, |
|
"grad_norm": 1.5865169519570839, |
|
"learning_rate": 8.367533405337635e-06, |
|
"loss": 0.2669, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.4594674120935707, |
|
"grad_norm": 1.315530128518467, |
|
"learning_rate": 8.177460624045303e-06, |
|
"loss": 0.2619, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.483007209062822, |
|
"grad_norm": 1.2729315130214236, |
|
"learning_rate": 7.986948064089741e-06, |
|
"loss": 0.2625, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.506547006032073, |
|
"grad_norm": 1.2890769997858393, |
|
"learning_rate": 7.7961193981332e-06, |
|
"loss": 0.2624, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.5300868030013242, |
|
"grad_norm": 1.344250246429933, |
|
"learning_rate": 7.605098504040519e-06, |
|
"loss": 0.2571, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.5536265999705754, |
|
"grad_norm": 1.3112449299655193, |
|
"learning_rate": 7.414009384462882e-06, |
|
"loss": 0.2603, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.5771663969398264, |
|
"grad_norm": 1.3722682469514345, |
|
"learning_rate": 7.22297608634056e-06, |
|
"loss": 0.2592, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.6007061939090774, |
|
"grad_norm": 1.3170860077023523, |
|
"learning_rate": 7.032122620376899e-06, |
|
"loss": 0.2535, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.6242459908783287, |
|
"grad_norm": 1.3106743856940115, |
|
"learning_rate": 6.841572880535854e-06, |
|
"loss": 0.2531, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.64778578784758, |
|
"grad_norm": 1.2798600011553958, |
|
"learning_rate": 6.6514505636152814e-06, |
|
"loss": 0.2548, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.671325584816831, |
|
"grad_norm": 1.3968866134114841, |
|
"learning_rate": 6.461879088948227e-06, |
|
"loss": 0.2507, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.6948653817860821, |
|
"grad_norm": 1.378986621794313, |
|
"learning_rate": 6.272981518284342e-06, |
|
"loss": 0.249, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.7184051787553334, |
|
"grad_norm": 1.2776388271568753, |
|
"learning_rate": 6.084880475903424e-06, |
|
"loss": 0.2451, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.7419449757245844, |
|
"grad_norm": 1.317908633074014, |
|
"learning_rate": 5.897698069012938e-06, |
|
"loss": 0.2484, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.7654847726938354, |
|
"grad_norm": 1.2702586608772435, |
|
"learning_rate": 5.711555808481213e-06, |
|
"loss": 0.2486, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.7890245696630867, |
|
"grad_norm": 1.3790950940287556, |
|
"learning_rate": 5.526574529957729e-06, |
|
"loss": 0.2451, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.8125643666323379, |
|
"grad_norm": 1.417794065597115, |
|
"learning_rate": 5.342874315431765e-06, |
|
"loss": 0.2442, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.836104163601589, |
|
"grad_norm": 1.3118057182766703, |
|
"learning_rate": 5.160574415280257e-06, |
|
"loss": 0.2438, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.85964396057084, |
|
"grad_norm": 1.380491802365371, |
|
"learning_rate": 4.979793170855537e-06, |
|
"loss": 0.2368, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.8831837575400914, |
|
"grad_norm": 1.3404717450970274, |
|
"learning_rate": 4.800647937663126e-06, |
|
"loss": 0.2393, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.9067235545093424, |
|
"grad_norm": 1.3084085289097462, |
|
"learning_rate": 4.623255009179547e-06, |
|
"loss": 0.2402, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.9302633514785934, |
|
"grad_norm": 1.2819422568439027, |
|
"learning_rate": 4.4477295413595385e-06, |
|
"loss": 0.2359, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.9538031484478446, |
|
"grad_norm": 1.2506810831080168, |
|
"learning_rate": 4.274185477881681e-06, |
|
"loss": 0.2365, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.9773429454170959, |
|
"grad_norm": 1.236732854007973, |
|
"learning_rate": 4.102735476181019e-06, |
|
"loss": 0.2308, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.000882742386347, |
|
"grad_norm": 1.2110215418076606, |
|
"learning_rate": 3.933490834316633e-06, |
|
"loss": 0.2284, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.024422539355598, |
|
"grad_norm": 1.4341116240165481, |
|
"learning_rate": 3.7665614187216603e-06, |
|
"loss": 0.128, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.0479623363248494, |
|
"grad_norm": 1.1448959546708688, |
|
"learning_rate": 3.6020555928826935e-06, |
|
"loss": 0.1201, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.0715021332941004, |
|
"grad_norm": 1.1891044799932367, |
|
"learning_rate": 3.4400801469947734e-06, |
|
"loss": 0.1196, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.0950419302633514, |
|
"grad_norm": 1.2226413834941532, |
|
"learning_rate": 3.2807402286377675e-06, |
|
"loss": 0.1183, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.1185817272326024, |
|
"grad_norm": 1.3069856648534286, |
|
"learning_rate": 3.1241392745189985e-06, |
|
"loss": 0.1181, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.142121524201854, |
|
"grad_norm": 1.1702804083458354, |
|
"learning_rate": 2.970378943326543e-06, |
|
"loss": 0.1166, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.165661321171105, |
|
"grad_norm": 1.2124612400300803, |
|
"learning_rate": 2.8195590497367222e-06, |
|
"loss": 0.1155, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.189201118140356, |
|
"grad_norm": 1.1611863697964633, |
|
"learning_rate": 2.671777499618652e-06, |
|
"loss": 0.1159, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.2127409151096074, |
|
"grad_norm": 1.1087717944945865, |
|
"learning_rate": 2.527130226477926e-06, |
|
"loss": 0.1168, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.2362807120788584, |
|
"grad_norm": 1.2482164891383611, |
|
"learning_rate": 2.3857111291806348e-06, |
|
"loss": 0.114, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.2598205090481094, |
|
"grad_norm": 1.1704503748862236, |
|
"learning_rate": 2.2476120109982267e-06, |
|
"loss": 0.1146, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.283360306017361, |
|
"grad_norm": 1.1831598264423941, |
|
"learning_rate": 2.1129225200127196e-06, |
|
"loss": 0.1142, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.306900102986612, |
|
"grad_norm": 1.1923513771478957, |
|
"learning_rate": 1.981730090920969e-06, |
|
"loss": 0.1121, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.330439899955863, |
|
"grad_norm": 1.1650306486853608, |
|
"learning_rate": 1.8541198882757892e-06, |
|
"loss": 0.1103, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.353979696925114, |
|
"grad_norm": 1.2474184378379198, |
|
"learning_rate": 1.73017475120072e-06, |
|
"loss": 0.1111, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.3775194938943653, |
|
"grad_norm": 1.1410121185645685, |
|
"learning_rate": 1.6099751396144142e-06, |
|
"loss": 0.1101, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.4010592908636164, |
|
"grad_norm": 1.1781449006163716, |
|
"learning_rate": 1.493599081999453e-06, |
|
"loss": 0.1085, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.4245990878328674, |
|
"grad_norm": 1.095164172281657, |
|
"learning_rate": 1.3811221247495784e-06, |
|
"loss": 0.109, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.448138884802119, |
|
"grad_norm": 1.1290274158456068, |
|
"learning_rate": 1.2726172831281898e-06, |
|
"loss": 0.1083, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.47167868177137, |
|
"grad_norm": 1.1205826114027153, |
|
"learning_rate": 1.1681549938699157e-06, |
|
"loss": 0.1079, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.495218478740621, |
|
"grad_norm": 1.237936841602485, |
|
"learning_rate": 1.0678030694561044e-06, |
|
"loss": 0.1084, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.518758275709872, |
|
"grad_norm": 1.2748079747165908, |
|
"learning_rate": 9.716266540938183e-07, |
|
"loss": 0.1084, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.542298072679123, |
|
"grad_norm": 1.1620787715820426, |
|
"learning_rate": 8.796881814269941e-07, |
|
"loss": 0.106, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.5658378696483743, |
|
"grad_norm": 1.1814024146181927, |
|
"learning_rate": 7.92047334007169e-07, |
|
"loss": 0.1037, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.5893776666176254, |
|
"grad_norm": 1.1411929584644598, |
|
"learning_rate": 7.087610045501023e-07, |
|
"loss": 0.1044, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.612917463586877, |
|
"grad_norm": 1.1770255689497389, |
|
"learning_rate": 6.29883259003445e-07, |
|
"loss": 0.1051, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.636457260556128, |
|
"grad_norm": 1.1667654713458508, |
|
"learning_rate": 5.554653014494176e-07, |
|
"loss": 0.1031, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.659997057525379, |
|
"grad_norm": 1.238846596793772, |
|
"learning_rate": 4.855554408652985e-07, |
|
"loss": 0.1039, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.68353685449463, |
|
"grad_norm": 1.0962748439341052, |
|
"learning_rate": 4.201990597632832e-07, |
|
"loss": 0.1032, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.707076651463881, |
|
"grad_norm": 1.1228066231093363, |
|
"learning_rate": 3.5943858473009237e-07, |
|
"loss": 0.105, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.7306164484331323, |
|
"grad_norm": 1.1681919060053207, |
|
"learning_rate": 3.033134588854289e-07, |
|
"loss": 0.1029, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.7541562454023834, |
|
"grad_norm": 1.107535834090668, |
|
"learning_rate": 2.518601162771755e-07, |
|
"loss": 0.1048, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.777696042371635, |
|
"grad_norm": 1.0985797806813495, |
|
"learning_rate": 2.0511195822997013e-07, |
|
"loss": 0.102, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.801235839340886, |
|
"grad_norm": 1.1563052202999995, |
|
"learning_rate": 1.6309933166247403e-07, |
|
"loss": 0.1028, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.824775636310137, |
|
"grad_norm": 1.1299722934234167, |
|
"learning_rate": 1.258495093874454e-07, |
|
"loss": 0.1006, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.848315433279388, |
|
"grad_norm": 1.0731524173833442, |
|
"learning_rate": 9.338667240738619e-08, |
|
"loss": 0.1029, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.871855230248639, |
|
"grad_norm": 1.0991693593720597, |
|
"learning_rate": 6.573189421726466e-08, |
|
"loss": 0.102, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.8953950272178903, |
|
"grad_norm": 1.1501966264186627, |
|
"learning_rate": 4.2903127124496454e-08, |
|
"loss": 0.1032, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.9189348241871413, |
|
"grad_norm": 1.1357187349706255, |
|
"learning_rate": 2.4915190595068415e-08, |
|
"loss": 0.1031, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.942474621156393, |
|
"grad_norm": 1.1136651822369734, |
|
"learning_rate": 1.1779761633370633e-08, |
|
"loss": 0.0998, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.966014418125644, |
|
"grad_norm": 1.1212278780119938, |
|
"learning_rate": 3.50536720197997e-09, |
|
"loss": 0.1005, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.989554215094895, |
|
"grad_norm": 1.1503104789107468, |
|
"learning_rate": 9.73786863237014e-11, |
|
"loss": 0.1019, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.994262174488745, |
|
"step": 1272, |
|
"total_flos": 76511791349760.0, |
|
"train_loss": 0.29241873632015297, |
|
"train_runtime": 70414.7871, |
|
"train_samples_per_second": 1.158, |
|
"train_steps_per_second": 0.018 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1272, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 76511791349760.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|