|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 2986, |
|
"global_step": 14930, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.033489618218352314, |
|
"grad_norm": 27.663120029291747, |
|
"learning_rate": 6.697923643670463e-07, |
|
"loss": 11.7393, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06697923643670463, |
|
"grad_norm": 1.7320339885689573, |
|
"learning_rate": 1.3395847287340927e-06, |
|
"loss": 3.297, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10046885465505694, |
|
"grad_norm": 1.2696699900839306, |
|
"learning_rate": 2.0093770931011387e-06, |
|
"loss": 1.8384, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13395847287340926, |
|
"grad_norm": 1.1644334409082224, |
|
"learning_rate": 2.6791694574681854e-06, |
|
"loss": 1.6728, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.16744809109176156, |
|
"grad_norm": 1.272695149119617, |
|
"learning_rate": 3.3489618218352316e-06, |
|
"loss": 1.5883, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.20093770931011387, |
|
"grad_norm": 1.2497604240608484, |
|
"learning_rate": 4.018754186202277e-06, |
|
"loss": 1.5399, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.23442732752846618, |
|
"grad_norm": 1.1521160041280882, |
|
"learning_rate": 4.688546550569324e-06, |
|
"loss": 1.5214, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2679169457468185, |
|
"grad_norm": 1.0610261246972188, |
|
"learning_rate": 5.358338914936371e-06, |
|
"loss": 1.4822, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3014065639651708, |
|
"grad_norm": 1.1385854301492127, |
|
"learning_rate": 6.028131279303416e-06, |
|
"loss": 1.4623, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.33489618218352313, |
|
"grad_norm": 1.04201570658625, |
|
"learning_rate": 6.697923643670463e-06, |
|
"loss": 1.4408, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3683858004018754, |
|
"grad_norm": 1.0568444189378532, |
|
"learning_rate": 7.3677160080375086e-06, |
|
"loss": 1.4378, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.40187541862022774, |
|
"grad_norm": 1.088346568816493, |
|
"learning_rate": 8.037508372404555e-06, |
|
"loss": 1.4189, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.43536503683858, |
|
"grad_norm": 1.0714826983472352, |
|
"learning_rate": 8.707300736771601e-06, |
|
"loss": 1.4098, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.46885465505693236, |
|
"grad_norm": 1.0415739456028938, |
|
"learning_rate": 9.377093101138647e-06, |
|
"loss": 1.4011, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5023442732752846, |
|
"grad_norm": 1.0789186294154542, |
|
"learning_rate": 9.999993303758581e-06, |
|
"loss": 1.3933, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.535833891493637, |
|
"grad_norm": 1.0509262824198378, |
|
"learning_rate": 9.998435483941776e-06, |
|
"loss": 1.3828, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5693235097119893, |
|
"grad_norm": 1.0764796757312198, |
|
"learning_rate": 9.994145483428403e-06, |
|
"loss": 1.3772, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6028131279303416, |
|
"grad_norm": 1.047153842290928, |
|
"learning_rate": 9.987125647163527e-06, |
|
"loss": 1.3674, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6363027461486939, |
|
"grad_norm": 0.9188670491033151, |
|
"learning_rate": 9.977379812240013e-06, |
|
"loss": 1.3538, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6697923643670463, |
|
"grad_norm": 1.004701344911835, |
|
"learning_rate": 9.964913305801151e-06, |
|
"loss": 1.3566, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7032819825853985, |
|
"grad_norm": 1.1760354727564355, |
|
"learning_rate": 9.9497329421288e-06, |
|
"loss": 1.3473, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7367716008037508, |
|
"grad_norm": 0.9440290166066829, |
|
"learning_rate": 9.931847018918654e-06, |
|
"loss": 1.35, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.7702612190221031, |
|
"grad_norm": 0.8730032484094693, |
|
"learning_rate": 9.911265312744663e-06, |
|
"loss": 1.3381, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8037508372404555, |
|
"grad_norm": 1.0757201534073755, |
|
"learning_rate": 9.887999073715083e-06, |
|
"loss": 1.3409, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8372404554588078, |
|
"grad_norm": 0.8841599798410706, |
|
"learning_rate": 9.86206101932309e-06, |
|
"loss": 1.333, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.87073007367716, |
|
"grad_norm": 0.9378532515897725, |
|
"learning_rate": 9.833465327495307e-06, |
|
"loss": 1.3195, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.9042196918955124, |
|
"grad_norm": 1.2091982749118577, |
|
"learning_rate": 9.802227628842045e-06, |
|
"loss": 1.3183, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9377093101138647, |
|
"grad_norm": 1.0171797419584845, |
|
"learning_rate": 9.7683649981135e-06, |
|
"loss": 1.3145, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.971198928332217, |
|
"grad_norm": 0.9664734856676821, |
|
"learning_rate": 9.731895944866576e-06, |
|
"loss": 1.3178, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": NaN, |
|
"eval_runtime": 348.649, |
|
"eval_samples_per_second": 45.673, |
|
"eval_steps_per_second": 1.428, |
|
"step": 2986 |
|
}, |
|
{ |
|
"epoch": 1.0046885465505693, |
|
"grad_norm": 1.0415846298834508, |
|
"learning_rate": 9.69284040334742e-06, |
|
"loss": 1.3204, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.0381781647689217, |
|
"grad_norm": 0.8757930482053028, |
|
"learning_rate": 9.651219721595235e-06, |
|
"loss": 1.31, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.0716677829872738, |
|
"grad_norm": 0.9287401756438767, |
|
"learning_rate": 9.607056649773266e-06, |
|
"loss": 1.2948, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.1051574012056262, |
|
"grad_norm": 0.966772568640745, |
|
"learning_rate": 9.56037532773342e-06, |
|
"loss": 1.2819, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.1386470194239786, |
|
"grad_norm": 0.9981440658348595, |
|
"learning_rate": 9.511201271821235e-06, |
|
"loss": 1.2655, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.1721366376423308, |
|
"grad_norm": 1.0546997584325322, |
|
"learning_rate": 9.459561360928472e-06, |
|
"loss": 1.2451, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.2056262558606832, |
|
"grad_norm": 0.8994951932737885, |
|
"learning_rate": 9.405483821800912e-06, |
|
"loss": 1.2355, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.2391158740790356, |
|
"grad_norm": 1.1226364755523701, |
|
"learning_rate": 9.348998213609416e-06, |
|
"loss": 1.2295, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.2726054922973877, |
|
"grad_norm": 1.0157346646837568, |
|
"learning_rate": 9.29013541179268e-06, |
|
"loss": 1.2011, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.3060951105157401, |
|
"grad_norm": 1.0670758632556911, |
|
"learning_rate": 9.228927591180484e-06, |
|
"loss": 1.1906, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.3395847287340925, |
|
"grad_norm": 0.9528782862122593, |
|
"learning_rate": 9.165408208406703e-06, |
|
"loss": 1.1715, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.3730743469524447, |
|
"grad_norm": 0.9576902845349418, |
|
"learning_rate": 9.099611983621684e-06, |
|
"loss": 1.1605, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.406563965170797, |
|
"grad_norm": 1.271234164945301, |
|
"learning_rate": 9.03157488151394e-06, |
|
"loss": 1.1425, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.4400535833891492, |
|
"grad_norm": 1.1036926365187665, |
|
"learning_rate": 8.961334091651618e-06, |
|
"loss": 1.1233, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.4735432016075016, |
|
"grad_norm": 0.9973365902952639, |
|
"learning_rate": 8.888928008154393e-06, |
|
"loss": 1.1193, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.507032819825854, |
|
"grad_norm": 1.0008322426572456, |
|
"learning_rate": 8.81439620870698e-06, |
|
"loss": 1.0943, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.5405224380442064, |
|
"grad_norm": 1.035859311044444, |
|
"learning_rate": 8.737779432925682e-06, |
|
"loss": 1.0895, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.5740120562625586, |
|
"grad_norm": 1.00625654318314, |
|
"learning_rate": 8.659119560089822e-06, |
|
"loss": 1.0894, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.607501674480911, |
|
"grad_norm": 1.0315606204038301, |
|
"learning_rate": 8.578459586250235e-06, |
|
"loss": 1.0715, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.6409912926992631, |
|
"grad_norm": 1.0926346917432537, |
|
"learning_rate": 8.495843600727313e-06, |
|
"loss": 1.0717, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.6744809109176155, |
|
"grad_norm": 1.0612821467128837, |
|
"learning_rate": 8.411316762011469e-06, |
|
"loss": 1.0706, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.707970529135968, |
|
"grad_norm": 1.016164489585248, |
|
"learning_rate": 8.324925273079176e-06, |
|
"loss": 1.0665, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.7414601473543203, |
|
"grad_norm": 1.0371020643635414, |
|
"learning_rate": 8.236716356138098e-06, |
|
"loss": 1.0661, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.7749497655726725, |
|
"grad_norm": 1.0416961084839862, |
|
"learning_rate": 8.146738226815088e-06, |
|
"loss": 1.0636, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.8084393837910246, |
|
"grad_norm": 0.9790282258830432, |
|
"learning_rate": 8.055040067801172e-06, |
|
"loss": 1.0676, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.841929002009377, |
|
"grad_norm": 1.0172124964539753, |
|
"learning_rate": 7.961672001967954e-06, |
|
"loss": 1.0612, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.8754186202277294, |
|
"grad_norm": 1.1307997119049311, |
|
"learning_rate": 7.866685064970086e-06, |
|
"loss": 1.0561, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.9089082384460818, |
|
"grad_norm": 1.0443307684727732, |
|
"learning_rate": 7.770131177348806e-06, |
|
"loss": 1.0597, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.942397856664434, |
|
"grad_norm": 1.0764967903414877, |
|
"learning_rate": 7.672063116151811e-06, |
|
"loss": 1.0603, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.9758874748827864, |
|
"grad_norm": 1.1410122429288658, |
|
"learning_rate": 7.572534486084937e-06, |
|
"loss": 1.0555, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": NaN, |
|
"eval_runtime": 348.1208, |
|
"eval_samples_per_second": 45.743, |
|
"eval_steps_per_second": 1.431, |
|
"step": 5972 |
|
}, |
|
{ |
|
"epoch": 2.0093770931011385, |
|
"grad_norm": 1.0764883927956457, |
|
"learning_rate": 7.47159969021144e-06, |
|
"loss": 1.0571, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.042866711319491, |
|
"grad_norm": 1.1019967079045427, |
|
"learning_rate": 7.369313900214897e-06, |
|
"loss": 1.0526, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.0763563295378433, |
|
"grad_norm": 1.09355151149325, |
|
"learning_rate": 7.265733026241967e-06, |
|
"loss": 1.0395, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.1098459477561957, |
|
"grad_norm": 1.0207706582548508, |
|
"learning_rate": 7.160913686341495e-06, |
|
"loss": 1.0189, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.1433355659745477, |
|
"grad_norm": 1.2991784934719508, |
|
"learning_rate": 7.054913175516698e-06, |
|
"loss": 1.0034, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.1768251841929, |
|
"grad_norm": 1.1127907199675022, |
|
"learning_rate": 6.947789434407284e-06, |
|
"loss": 0.9789, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.2103148024112524, |
|
"grad_norm": 1.0987386576171831, |
|
"learning_rate": 6.839601017618699e-06, |
|
"loss": 0.9673, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 2.243804420629605, |
|
"grad_norm": 1.1749268268222908, |
|
"learning_rate": 6.730407061715752e-06, |
|
"loss": 0.9564, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 2.2772940388479572, |
|
"grad_norm": 1.2656062020023109, |
|
"learning_rate": 6.620267252898148e-06, |
|
"loss": 0.9251, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.3107836570663096, |
|
"grad_norm": 1.2775085805331419, |
|
"learning_rate": 6.509241794375577e-06, |
|
"loss": 0.9125, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.3442732752846616, |
|
"grad_norm": 1.3173880822264534, |
|
"learning_rate": 6.3973913734602174e-06, |
|
"loss": 0.8919, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.377762893503014, |
|
"grad_norm": 1.2455502782297214, |
|
"learning_rate": 6.284777128394603e-06, |
|
"loss": 0.879, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.4112525117213663, |
|
"grad_norm": 1.332442255586022, |
|
"learning_rate": 6.171460614933038e-06, |
|
"loss": 0.8625, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.4447421299397187, |
|
"grad_norm": 1.3507817687995796, |
|
"learning_rate": 6.057503772694761e-06, |
|
"loss": 0.8374, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.478231748158071, |
|
"grad_norm": 1.2079298830395293, |
|
"learning_rate": 5.942968891307317e-06, |
|
"loss": 0.8394, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.511721366376423, |
|
"grad_norm": 1.2858100084464055, |
|
"learning_rate": 5.8279185763585975e-06, |
|
"loss": 0.8131, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.5452109845947755, |
|
"grad_norm": 1.50200071509283, |
|
"learning_rate": 5.7124157151761795e-06, |
|
"loss": 0.805, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.578700602813128, |
|
"grad_norm": 1.7131605578882496, |
|
"learning_rate": 5.596523442452652e-06, |
|
"loss": 0.8083, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.6121902210314802, |
|
"grad_norm": 1.549866945076186, |
|
"learning_rate": 5.480305105735749e-06, |
|
"loss": 0.7845, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.6456798392498326, |
|
"grad_norm": 1.3213984004712975, |
|
"learning_rate": 5.363824230802127e-06, |
|
"loss": 0.7909, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.679169457468185, |
|
"grad_norm": 1.4279995035623863, |
|
"learning_rate": 5.247144486933706e-06, |
|
"loss": 0.7963, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.7126590756865374, |
|
"grad_norm": 1.8497497934553626, |
|
"learning_rate": 5.130329652115603e-06, |
|
"loss": 0.7835, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 2.7461486939048894, |
|
"grad_norm": 1.4531964604398142, |
|
"learning_rate": 5.013443578174608e-06, |
|
"loss": 0.7899, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 2.7796383121232418, |
|
"grad_norm": 1.5125912635190244, |
|
"learning_rate": 4.8965501558773326e-06, |
|
"loss": 0.788, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 2.813127930341594, |
|
"grad_norm": 1.3974817703691491, |
|
"learning_rate": 4.779713280007051e-06, |
|
"loss": 0.7979, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 2.8466175485599465, |
|
"grad_norm": 1.3666908779315028, |
|
"learning_rate": 4.6629968144383545e-06, |
|
"loss": 0.7839, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.8801071667782985, |
|
"grad_norm": 1.3996848474214347, |
|
"learning_rate": 4.546464557228699e-06, |
|
"loss": 0.7899, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.913596784996651, |
|
"grad_norm": 1.3649644046851297, |
|
"learning_rate": 4.430180205745932e-06, |
|
"loss": 0.7959, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.9470864032150033, |
|
"grad_norm": 1.369973634022689, |
|
"learning_rate": 4.314207321850849e-06, |
|
"loss": 0.7956, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.9805760214333556, |
|
"grad_norm": 1.5071401378465865, |
|
"learning_rate": 4.198609297153831e-06, |
|
"loss": 0.7954, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": NaN, |
|
"eval_runtime": 348.2929, |
|
"eval_samples_per_second": 45.72, |
|
"eval_steps_per_second": 1.43, |
|
"step": 8958 |
|
}, |
|
{ |
|
"epoch": 3.014065639651708, |
|
"grad_norm": 1.4428883177319884, |
|
"learning_rate": 4.083449318364527e-06, |
|
"loss": 0.7853, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.0475552578700604, |
|
"grad_norm": 1.351580022628522, |
|
"learning_rate": 3.968790332753555e-06, |
|
"loss": 0.7987, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 3.081044876088413, |
|
"grad_norm": 1.3743638112843153, |
|
"learning_rate": 3.8546950137450656e-06, |
|
"loss": 0.7732, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 3.1145344943067648, |
|
"grad_norm": 1.484385543221885, |
|
"learning_rate": 3.7412257266590007e-06, |
|
"loss": 0.7563, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 3.148024112525117, |
|
"grad_norm": 1.5933705790768147, |
|
"learning_rate": 3.62844449462176e-06, |
|
"loss": 0.739, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 3.1815137307434695, |
|
"grad_norm": 1.6870709018629149, |
|
"learning_rate": 3.5164129646639204e-06, |
|
"loss": 0.7228, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.215003348961822, |
|
"grad_norm": 1.6481990122285803, |
|
"learning_rate": 3.4051923740235205e-06, |
|
"loss": 0.7088, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 3.2484929671801743, |
|
"grad_norm": 1.5879233480759347, |
|
"learning_rate": 3.2948435166733506e-06, |
|
"loss": 0.6959, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 3.2819825853985263, |
|
"grad_norm": 1.5176278019008493, |
|
"learning_rate": 3.1854267100905344e-06, |
|
"loss": 0.6686, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 3.3154722036168787, |
|
"grad_norm": 1.552541254972543, |
|
"learning_rate": 3.0770017622865523e-06, |
|
"loss": 0.6606, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 3.348961821835231, |
|
"grad_norm": 1.7082594351164857, |
|
"learning_rate": 2.9696279391157663e-06, |
|
"loss": 0.6448, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.3824514400535834, |
|
"grad_norm": 3.1211350166343546, |
|
"learning_rate": 2.8633639318802685e-06, |
|
"loss": 0.6331, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 3.415941058271936, |
|
"grad_norm": 1.6469804551491567, |
|
"learning_rate": 2.758267825248798e-06, |
|
"loss": 0.6145, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 3.4494306764902882, |
|
"grad_norm": 1.7146851447701008, |
|
"learning_rate": 2.6543970655072514e-06, |
|
"loss": 0.6065, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 3.48292029470864, |
|
"grad_norm": 2.043969779746548, |
|
"learning_rate": 2.5518084291581163e-06, |
|
"loss": 0.6009, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 3.5164099129269926, |
|
"grad_norm": 1.6446301288443177, |
|
"learning_rate": 2.450557991886039e-06, |
|
"loss": 0.5847, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.549899531145345, |
|
"grad_norm": 1.8987310858087982, |
|
"learning_rate": 2.350701097906447e-06, |
|
"loss": 0.578, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 3.5833891493636973, |
|
"grad_norm": 1.5875486081882013, |
|
"learning_rate": 2.252292329714012e-06, |
|
"loss": 0.5838, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 3.6168787675820493, |
|
"grad_norm": 1.5799154770812294, |
|
"learning_rate": 2.155385478247455e-06, |
|
"loss": 0.5653, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 3.6503683858004017, |
|
"grad_norm": 1.9320449012232872, |
|
"learning_rate": 2.0600335134870415e-06, |
|
"loss": 0.5723, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 3.683858004018754, |
|
"grad_norm": 1.9440461986884756, |
|
"learning_rate": 1.9662885555008055e-06, |
|
"loss": 0.5762, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.7173476222371065, |
|
"grad_norm": 1.8693480723789162, |
|
"learning_rate": 1.8742018459553551e-06, |
|
"loss": 0.5715, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 3.750837240455459, |
|
"grad_norm": 1.6174259181592696, |
|
"learning_rate": 1.7838237201067976e-06, |
|
"loss": 0.577, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 3.7843268586738112, |
|
"grad_norm": 1.7622557353103667, |
|
"learning_rate": 1.695203579287134e-06, |
|
"loss": 0.5825, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 3.8178164768921636, |
|
"grad_norm": 1.6601130738963383, |
|
"learning_rate": 1.6083898639011402e-06, |
|
"loss": 0.5871, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 3.8513060951105156, |
|
"grad_norm": 2.0156698217638445, |
|
"learning_rate": 1.5234300269484848e-06, |
|
"loss": 0.5805, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.884795713328868, |
|
"grad_norm": 1.6232268197279267, |
|
"learning_rate": 1.440370508085589e-06, |
|
"loss": 0.5859, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 3.9182853315472204, |
|
"grad_norm": 1.9405869273694487, |
|
"learning_rate": 1.3592567082413683e-06, |
|
"loss": 0.5961, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 3.9517749497655728, |
|
"grad_norm": 1.6205319658414614, |
|
"learning_rate": 1.2801329648007648e-06, |
|
"loss": 0.6098, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 3.985264567983925, |
|
"grad_norm": 2.0422623489549028, |
|
"learning_rate": 1.203042527369611e-06, |
|
"loss": 0.5992, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": NaN, |
|
"eval_runtime": 348.6255, |
|
"eval_samples_per_second": 45.677, |
|
"eval_steps_per_second": 1.428, |
|
"step": 11944 |
|
}, |
|
{ |
|
"epoch": 4.018754186202277, |
|
"grad_norm": 2.042627434325046, |
|
"learning_rate": 1.1280275341340919e-06, |
|
"loss": 0.5984, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 4.0522438044206295, |
|
"grad_norm": 1.6714080325577199, |
|
"learning_rate": 1.0551289888277e-06, |
|
"loss": 0.6147, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 4.085733422638982, |
|
"grad_norm": 1.73673619396585, |
|
"learning_rate": 9.843867383183065e-07, |
|
"loss": 0.5882, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 4.119223040857334, |
|
"grad_norm": 1.7412386584226505, |
|
"learning_rate": 9.158394508275764e-07, |
|
"loss": 0.5785, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 4.152712659075687, |
|
"grad_norm": 1.688382578729062, |
|
"learning_rate": 8.495245947946428e-07, |
|
"loss": 0.568, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 4.186202277294039, |
|
"grad_norm": 1.7570920632165827, |
|
"learning_rate": 7.85478418395586e-07, |
|
"loss": 0.5548, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 4.219691895512391, |
|
"grad_norm": 2.0216120473215775, |
|
"learning_rate": 7.237359297299213e-07, |
|
"loss": 0.5491, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 4.253181513730744, |
|
"grad_norm": 1.7189176283861873, |
|
"learning_rate": 6.643308776849211e-07, |
|
"loss": 0.5344, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 4.286671131949095, |
|
"grad_norm": 1.878389510492855, |
|
"learning_rate": 6.07295733488234e-07, |
|
"loss": 0.5205, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 4.320160750167448, |
|
"grad_norm": 1.8272392601326726, |
|
"learning_rate": 5.526616729588719e-07, |
|
"loss": 0.5143, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 4.3536503683858, |
|
"grad_norm": 1.9885081295428908, |
|
"learning_rate": 5.00458559466292e-07, |
|
"loss": 0.5027, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 4.3871399866041525, |
|
"grad_norm": 1.8856403938879776, |
|
"learning_rate": 4.507149276068562e-07, |
|
"loss": 0.498, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 4.420629604822505, |
|
"grad_norm": 1.9693198350457568, |
|
"learning_rate": 4.0345796760662247e-07, |
|
"loss": 0.4925, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 4.454119223040857, |
|
"grad_norm": 2.1533445061135827, |
|
"learning_rate": 3.587135104589706e-07, |
|
"loss": 0.4893, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 4.48760884125921, |
|
"grad_norm": 1.795431467461819, |
|
"learning_rate": 3.16506013805194e-07, |
|
"loss": 0.4844, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 4.521098459477562, |
|
"grad_norm": 1.8492311436151545, |
|
"learning_rate": 2.7685854856577934e-07, |
|
"loss": 0.472, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 4.5545880776959144, |
|
"grad_norm": 1.9596331843479984, |
|
"learning_rate": 2.3979278632967507e-07, |
|
"loss": 0.4774, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 4.588077695914267, |
|
"grad_norm": 1.9830637420731552, |
|
"learning_rate": 2.0532898750844633e-07, |
|
"loss": 0.4786, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 4.621567314132619, |
|
"grad_norm": 2.0258341442311423, |
|
"learning_rate": 1.734859902617886e-07, |
|
"loss": 0.4786, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 4.655056932350972, |
|
"grad_norm": 1.9180928479733919, |
|
"learning_rate": 1.4428120020045122e-07, |
|
"loss": 0.4882, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 4.688546550569323, |
|
"grad_norm": 1.897536871580035, |
|
"learning_rate": 1.1773058087221068e-07, |
|
"loss": 0.4816, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 4.7220361687876755, |
|
"grad_norm": 1.9655089659832947, |
|
"learning_rate": 9.384864503607871e-08, |
|
"loss": 0.482, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 4.755525787006028, |
|
"grad_norm": 2.1423206586220345, |
|
"learning_rate": 7.264844672952299e-08, |
|
"loss": 0.4989, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 4.78901540522438, |
|
"grad_norm": 1.8698413000910197, |
|
"learning_rate": 5.4141574133037555e-08, |
|
"loss": 0.5135, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 4.822505023442733, |
|
"grad_norm": 1.888414362344593, |
|
"learning_rate": 3.8338143235959746e-08, |
|
"loss": 0.5077, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 4.855994641661085, |
|
"grad_norm": 1.9521761786674516, |
|
"learning_rate": 2.5246792306999334e-08, |
|
"loss": 0.5146, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 4.8894842598794375, |
|
"grad_norm": 1.8440132403394436, |
|
"learning_rate": 1.4874677172497243e-08, |
|
"loss": 0.5236, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 4.92297387809779, |
|
"grad_norm": 1.9744361393291854, |
|
"learning_rate": 7.2274673050010124e-09, |
|
"loss": 0.5404, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 4.956463496316142, |
|
"grad_norm": 1.8076672661575879, |
|
"learning_rate": 2.309342724287622e-09, |
|
"loss": 0.5488, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 4.989953114534495, |
|
"grad_norm": 1.9120835449610545, |
|
"learning_rate": 1.229917125389335e-10, |
|
"loss": 0.5494, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": NaN, |
|
"eval_runtime": 348.4998, |
|
"eval_samples_per_second": 45.693, |
|
"eval_steps_per_second": 1.429, |
|
"step": 14930 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 14930, |
|
"total_flos": 7403300223713280.0, |
|
"train_loss": 0.9973710162960596, |
|
"train_runtime": 63021.9977, |
|
"train_samples_per_second": 11.37, |
|
"train_steps_per_second": 0.237 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 14930, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7403300223713280.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|