Deepseek-R1-Distill-14B-Code-Ft / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 2986,
"global_step": 14930,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.033489618218352314,
"grad_norm": 27.663120029291747,
"learning_rate": 6.697923643670463e-07,
"loss": 11.7393,
"step": 100
},
{
"epoch": 0.06697923643670463,
"grad_norm": 1.7320339885689573,
"learning_rate": 1.3395847287340927e-06,
"loss": 3.297,
"step": 200
},
{
"epoch": 0.10046885465505694,
"grad_norm": 1.2696699900839306,
"learning_rate": 2.0093770931011387e-06,
"loss": 1.8384,
"step": 300
},
{
"epoch": 0.13395847287340926,
"grad_norm": 1.1644334409082224,
"learning_rate": 2.6791694574681854e-06,
"loss": 1.6728,
"step": 400
},
{
"epoch": 0.16744809109176156,
"grad_norm": 1.272695149119617,
"learning_rate": 3.3489618218352316e-06,
"loss": 1.5883,
"step": 500
},
{
"epoch": 0.20093770931011387,
"grad_norm": 1.2497604240608484,
"learning_rate": 4.018754186202277e-06,
"loss": 1.5399,
"step": 600
},
{
"epoch": 0.23442732752846618,
"grad_norm": 1.1521160041280882,
"learning_rate": 4.688546550569324e-06,
"loss": 1.5214,
"step": 700
},
{
"epoch": 0.2679169457468185,
"grad_norm": 1.0610261246972188,
"learning_rate": 5.358338914936371e-06,
"loss": 1.4822,
"step": 800
},
{
"epoch": 0.3014065639651708,
"grad_norm": 1.1385854301492127,
"learning_rate": 6.028131279303416e-06,
"loss": 1.4623,
"step": 900
},
{
"epoch": 0.33489618218352313,
"grad_norm": 1.04201570658625,
"learning_rate": 6.697923643670463e-06,
"loss": 1.4408,
"step": 1000
},
{
"epoch": 0.3683858004018754,
"grad_norm": 1.0568444189378532,
"learning_rate": 7.3677160080375086e-06,
"loss": 1.4378,
"step": 1100
},
{
"epoch": 0.40187541862022774,
"grad_norm": 1.088346568816493,
"learning_rate": 8.037508372404555e-06,
"loss": 1.4189,
"step": 1200
},
{
"epoch": 0.43536503683858,
"grad_norm": 1.0714826983472352,
"learning_rate": 8.707300736771601e-06,
"loss": 1.4098,
"step": 1300
},
{
"epoch": 0.46885465505693236,
"grad_norm": 1.0415739456028938,
"learning_rate": 9.377093101138647e-06,
"loss": 1.4011,
"step": 1400
},
{
"epoch": 0.5023442732752846,
"grad_norm": 1.0789186294154542,
"learning_rate": 9.999993303758581e-06,
"loss": 1.3933,
"step": 1500
},
{
"epoch": 0.535833891493637,
"grad_norm": 1.0509262824198378,
"learning_rate": 9.998435483941776e-06,
"loss": 1.3828,
"step": 1600
},
{
"epoch": 0.5693235097119893,
"grad_norm": 1.0764796757312198,
"learning_rate": 9.994145483428403e-06,
"loss": 1.3772,
"step": 1700
},
{
"epoch": 0.6028131279303416,
"grad_norm": 1.047153842290928,
"learning_rate": 9.987125647163527e-06,
"loss": 1.3674,
"step": 1800
},
{
"epoch": 0.6363027461486939,
"grad_norm": 0.9188670491033151,
"learning_rate": 9.977379812240013e-06,
"loss": 1.3538,
"step": 1900
},
{
"epoch": 0.6697923643670463,
"grad_norm": 1.004701344911835,
"learning_rate": 9.964913305801151e-06,
"loss": 1.3566,
"step": 2000
},
{
"epoch": 0.7032819825853985,
"grad_norm": 1.1760354727564355,
"learning_rate": 9.9497329421288e-06,
"loss": 1.3473,
"step": 2100
},
{
"epoch": 0.7367716008037508,
"grad_norm": 0.9440290166066829,
"learning_rate": 9.931847018918654e-06,
"loss": 1.35,
"step": 2200
},
{
"epoch": 0.7702612190221031,
"grad_norm": 0.8730032484094693,
"learning_rate": 9.911265312744663e-06,
"loss": 1.3381,
"step": 2300
},
{
"epoch": 0.8037508372404555,
"grad_norm": 1.0757201534073755,
"learning_rate": 9.887999073715083e-06,
"loss": 1.3409,
"step": 2400
},
{
"epoch": 0.8372404554588078,
"grad_norm": 0.8841599798410706,
"learning_rate": 9.86206101932309e-06,
"loss": 1.333,
"step": 2500
},
{
"epoch": 0.87073007367716,
"grad_norm": 0.9378532515897725,
"learning_rate": 9.833465327495307e-06,
"loss": 1.3195,
"step": 2600
},
{
"epoch": 0.9042196918955124,
"grad_norm": 1.2091982749118577,
"learning_rate": 9.802227628842045e-06,
"loss": 1.3183,
"step": 2700
},
{
"epoch": 0.9377093101138647,
"grad_norm": 1.0171797419584845,
"learning_rate": 9.7683649981135e-06,
"loss": 1.3145,
"step": 2800
},
{
"epoch": 0.971198928332217,
"grad_norm": 0.9664734856676821,
"learning_rate": 9.731895944866576e-06,
"loss": 1.3178,
"step": 2900
},
{
"epoch": 1.0,
"eval_loss": NaN,
"eval_runtime": 348.649,
"eval_samples_per_second": 45.673,
"eval_steps_per_second": 1.428,
"step": 2986
},
{
"epoch": 1.0046885465505693,
"grad_norm": 1.0415846298834508,
"learning_rate": 9.69284040334742e-06,
"loss": 1.3204,
"step": 3000
},
{
"epoch": 1.0381781647689217,
"grad_norm": 0.8757930482053028,
"learning_rate": 9.651219721595235e-06,
"loss": 1.31,
"step": 3100
},
{
"epoch": 1.0716677829872738,
"grad_norm": 0.9287401756438767,
"learning_rate": 9.607056649773266e-06,
"loss": 1.2948,
"step": 3200
},
{
"epoch": 1.1051574012056262,
"grad_norm": 0.966772568640745,
"learning_rate": 9.56037532773342e-06,
"loss": 1.2819,
"step": 3300
},
{
"epoch": 1.1386470194239786,
"grad_norm": 0.9981440658348595,
"learning_rate": 9.511201271821235e-06,
"loss": 1.2655,
"step": 3400
},
{
"epoch": 1.1721366376423308,
"grad_norm": 1.0546997584325322,
"learning_rate": 9.459561360928472e-06,
"loss": 1.2451,
"step": 3500
},
{
"epoch": 1.2056262558606832,
"grad_norm": 0.8994951932737885,
"learning_rate": 9.405483821800912e-06,
"loss": 1.2355,
"step": 3600
},
{
"epoch": 1.2391158740790356,
"grad_norm": 1.1226364755523701,
"learning_rate": 9.348998213609416e-06,
"loss": 1.2295,
"step": 3700
},
{
"epoch": 1.2726054922973877,
"grad_norm": 1.0157346646837568,
"learning_rate": 9.29013541179268e-06,
"loss": 1.2011,
"step": 3800
},
{
"epoch": 1.3060951105157401,
"grad_norm": 1.0670758632556911,
"learning_rate": 9.228927591180484e-06,
"loss": 1.1906,
"step": 3900
},
{
"epoch": 1.3395847287340925,
"grad_norm": 0.9528782862122593,
"learning_rate": 9.165408208406703e-06,
"loss": 1.1715,
"step": 4000
},
{
"epoch": 1.3730743469524447,
"grad_norm": 0.9576902845349418,
"learning_rate": 9.099611983621684e-06,
"loss": 1.1605,
"step": 4100
},
{
"epoch": 1.406563965170797,
"grad_norm": 1.271234164945301,
"learning_rate": 9.03157488151394e-06,
"loss": 1.1425,
"step": 4200
},
{
"epoch": 1.4400535833891492,
"grad_norm": 1.1036926365187665,
"learning_rate": 8.961334091651618e-06,
"loss": 1.1233,
"step": 4300
},
{
"epoch": 1.4735432016075016,
"grad_norm": 0.9973365902952639,
"learning_rate": 8.888928008154393e-06,
"loss": 1.1193,
"step": 4400
},
{
"epoch": 1.507032819825854,
"grad_norm": 1.0008322426572456,
"learning_rate": 8.81439620870698e-06,
"loss": 1.0943,
"step": 4500
},
{
"epoch": 1.5405224380442064,
"grad_norm": 1.035859311044444,
"learning_rate": 8.737779432925682e-06,
"loss": 1.0895,
"step": 4600
},
{
"epoch": 1.5740120562625586,
"grad_norm": 1.00625654318314,
"learning_rate": 8.659119560089822e-06,
"loss": 1.0894,
"step": 4700
},
{
"epoch": 1.607501674480911,
"grad_norm": 1.0315606204038301,
"learning_rate": 8.578459586250235e-06,
"loss": 1.0715,
"step": 4800
},
{
"epoch": 1.6409912926992631,
"grad_norm": 1.0926346917432537,
"learning_rate": 8.495843600727313e-06,
"loss": 1.0717,
"step": 4900
},
{
"epoch": 1.6744809109176155,
"grad_norm": 1.0612821467128837,
"learning_rate": 8.411316762011469e-06,
"loss": 1.0706,
"step": 5000
},
{
"epoch": 1.707970529135968,
"grad_norm": 1.016164489585248,
"learning_rate": 8.324925273079176e-06,
"loss": 1.0665,
"step": 5100
},
{
"epoch": 1.7414601473543203,
"grad_norm": 1.0371020643635414,
"learning_rate": 8.236716356138098e-06,
"loss": 1.0661,
"step": 5200
},
{
"epoch": 1.7749497655726725,
"grad_norm": 1.0416961084839862,
"learning_rate": 8.146738226815088e-06,
"loss": 1.0636,
"step": 5300
},
{
"epoch": 1.8084393837910246,
"grad_norm": 0.9790282258830432,
"learning_rate": 8.055040067801172e-06,
"loss": 1.0676,
"step": 5400
},
{
"epoch": 1.841929002009377,
"grad_norm": 1.0172124964539753,
"learning_rate": 7.961672001967954e-06,
"loss": 1.0612,
"step": 5500
},
{
"epoch": 1.8754186202277294,
"grad_norm": 1.1307997119049311,
"learning_rate": 7.866685064970086e-06,
"loss": 1.0561,
"step": 5600
},
{
"epoch": 1.9089082384460818,
"grad_norm": 1.0443307684727732,
"learning_rate": 7.770131177348806e-06,
"loss": 1.0597,
"step": 5700
},
{
"epoch": 1.942397856664434,
"grad_norm": 1.0764967903414877,
"learning_rate": 7.672063116151811e-06,
"loss": 1.0603,
"step": 5800
},
{
"epoch": 1.9758874748827864,
"grad_norm": 1.1410122429288658,
"learning_rate": 7.572534486084937e-06,
"loss": 1.0555,
"step": 5900
},
{
"epoch": 2.0,
"eval_loss": NaN,
"eval_runtime": 348.1208,
"eval_samples_per_second": 45.743,
"eval_steps_per_second": 1.431,
"step": 5972
},
{
"epoch": 2.0093770931011385,
"grad_norm": 1.0764883927956457,
"learning_rate": 7.47159969021144e-06,
"loss": 1.0571,
"step": 6000
},
{
"epoch": 2.042866711319491,
"grad_norm": 1.1019967079045427,
"learning_rate": 7.369313900214897e-06,
"loss": 1.0526,
"step": 6100
},
{
"epoch": 2.0763563295378433,
"grad_norm": 1.09355151149325,
"learning_rate": 7.265733026241967e-06,
"loss": 1.0395,
"step": 6200
},
{
"epoch": 2.1098459477561957,
"grad_norm": 1.0207706582548508,
"learning_rate": 7.160913686341495e-06,
"loss": 1.0189,
"step": 6300
},
{
"epoch": 2.1433355659745477,
"grad_norm": 1.2991784934719508,
"learning_rate": 7.054913175516698e-06,
"loss": 1.0034,
"step": 6400
},
{
"epoch": 2.1768251841929,
"grad_norm": 1.1127907199675022,
"learning_rate": 6.947789434407284e-06,
"loss": 0.9789,
"step": 6500
},
{
"epoch": 2.2103148024112524,
"grad_norm": 1.0987386576171831,
"learning_rate": 6.839601017618699e-06,
"loss": 0.9673,
"step": 6600
},
{
"epoch": 2.243804420629605,
"grad_norm": 1.1749268268222908,
"learning_rate": 6.730407061715752e-06,
"loss": 0.9564,
"step": 6700
},
{
"epoch": 2.2772940388479572,
"grad_norm": 1.2656062020023109,
"learning_rate": 6.620267252898148e-06,
"loss": 0.9251,
"step": 6800
},
{
"epoch": 2.3107836570663096,
"grad_norm": 1.2775085805331419,
"learning_rate": 6.509241794375577e-06,
"loss": 0.9125,
"step": 6900
},
{
"epoch": 2.3442732752846616,
"grad_norm": 1.3173880822264534,
"learning_rate": 6.3973913734602174e-06,
"loss": 0.8919,
"step": 7000
},
{
"epoch": 2.377762893503014,
"grad_norm": 1.2455502782297214,
"learning_rate": 6.284777128394603e-06,
"loss": 0.879,
"step": 7100
},
{
"epoch": 2.4112525117213663,
"grad_norm": 1.332442255586022,
"learning_rate": 6.171460614933038e-06,
"loss": 0.8625,
"step": 7200
},
{
"epoch": 2.4447421299397187,
"grad_norm": 1.3507817687995796,
"learning_rate": 6.057503772694761e-06,
"loss": 0.8374,
"step": 7300
},
{
"epoch": 2.478231748158071,
"grad_norm": 1.2079298830395293,
"learning_rate": 5.942968891307317e-06,
"loss": 0.8394,
"step": 7400
},
{
"epoch": 2.511721366376423,
"grad_norm": 1.2858100084464055,
"learning_rate": 5.8279185763585975e-06,
"loss": 0.8131,
"step": 7500
},
{
"epoch": 2.5452109845947755,
"grad_norm": 1.50200071509283,
"learning_rate": 5.7124157151761795e-06,
"loss": 0.805,
"step": 7600
},
{
"epoch": 2.578700602813128,
"grad_norm": 1.7131605578882496,
"learning_rate": 5.596523442452652e-06,
"loss": 0.8083,
"step": 7700
},
{
"epoch": 2.6121902210314802,
"grad_norm": 1.549866945076186,
"learning_rate": 5.480305105735749e-06,
"loss": 0.7845,
"step": 7800
},
{
"epoch": 2.6456798392498326,
"grad_norm": 1.3213984004712975,
"learning_rate": 5.363824230802127e-06,
"loss": 0.7909,
"step": 7900
},
{
"epoch": 2.679169457468185,
"grad_norm": 1.4279995035623863,
"learning_rate": 5.247144486933706e-06,
"loss": 0.7963,
"step": 8000
},
{
"epoch": 2.7126590756865374,
"grad_norm": 1.8497497934553626,
"learning_rate": 5.130329652115603e-06,
"loss": 0.7835,
"step": 8100
},
{
"epoch": 2.7461486939048894,
"grad_norm": 1.4531964604398142,
"learning_rate": 5.013443578174608e-06,
"loss": 0.7899,
"step": 8200
},
{
"epoch": 2.7796383121232418,
"grad_norm": 1.5125912635190244,
"learning_rate": 4.8965501558773326e-06,
"loss": 0.788,
"step": 8300
},
{
"epoch": 2.813127930341594,
"grad_norm": 1.3974817703691491,
"learning_rate": 4.779713280007051e-06,
"loss": 0.7979,
"step": 8400
},
{
"epoch": 2.8466175485599465,
"grad_norm": 1.3666908779315028,
"learning_rate": 4.6629968144383545e-06,
"loss": 0.7839,
"step": 8500
},
{
"epoch": 2.8801071667782985,
"grad_norm": 1.3996848474214347,
"learning_rate": 4.546464557228699e-06,
"loss": 0.7899,
"step": 8600
},
{
"epoch": 2.913596784996651,
"grad_norm": 1.3649644046851297,
"learning_rate": 4.430180205745932e-06,
"loss": 0.7959,
"step": 8700
},
{
"epoch": 2.9470864032150033,
"grad_norm": 1.369973634022689,
"learning_rate": 4.314207321850849e-06,
"loss": 0.7956,
"step": 8800
},
{
"epoch": 2.9805760214333556,
"grad_norm": 1.5071401378465865,
"learning_rate": 4.198609297153831e-06,
"loss": 0.7954,
"step": 8900
},
{
"epoch": 3.0,
"eval_loss": NaN,
"eval_runtime": 348.2929,
"eval_samples_per_second": 45.72,
"eval_steps_per_second": 1.43,
"step": 8958
},
{
"epoch": 3.014065639651708,
"grad_norm": 1.4428883177319884,
"learning_rate": 4.083449318364527e-06,
"loss": 0.7853,
"step": 9000
},
{
"epoch": 3.0475552578700604,
"grad_norm": 1.351580022628522,
"learning_rate": 3.968790332753555e-06,
"loss": 0.7987,
"step": 9100
},
{
"epoch": 3.081044876088413,
"grad_norm": 1.3743638112843153,
"learning_rate": 3.8546950137450656e-06,
"loss": 0.7732,
"step": 9200
},
{
"epoch": 3.1145344943067648,
"grad_norm": 1.484385543221885,
"learning_rate": 3.7412257266590007e-06,
"loss": 0.7563,
"step": 9300
},
{
"epoch": 3.148024112525117,
"grad_norm": 1.5933705790768147,
"learning_rate": 3.62844449462176e-06,
"loss": 0.739,
"step": 9400
},
{
"epoch": 3.1815137307434695,
"grad_norm": 1.6870709018629149,
"learning_rate": 3.5164129646639204e-06,
"loss": 0.7228,
"step": 9500
},
{
"epoch": 3.215003348961822,
"grad_norm": 1.6481990122285803,
"learning_rate": 3.4051923740235205e-06,
"loss": 0.7088,
"step": 9600
},
{
"epoch": 3.2484929671801743,
"grad_norm": 1.5879233480759347,
"learning_rate": 3.2948435166733506e-06,
"loss": 0.6959,
"step": 9700
},
{
"epoch": 3.2819825853985263,
"grad_norm": 1.5176278019008493,
"learning_rate": 3.1854267100905344e-06,
"loss": 0.6686,
"step": 9800
},
{
"epoch": 3.3154722036168787,
"grad_norm": 1.552541254972543,
"learning_rate": 3.0770017622865523e-06,
"loss": 0.6606,
"step": 9900
},
{
"epoch": 3.348961821835231,
"grad_norm": 1.7082594351164857,
"learning_rate": 2.9696279391157663e-06,
"loss": 0.6448,
"step": 10000
},
{
"epoch": 3.3824514400535834,
"grad_norm": 3.1211350166343546,
"learning_rate": 2.8633639318802685e-06,
"loss": 0.6331,
"step": 10100
},
{
"epoch": 3.415941058271936,
"grad_norm": 1.6469804551491567,
"learning_rate": 2.758267825248798e-06,
"loss": 0.6145,
"step": 10200
},
{
"epoch": 3.4494306764902882,
"grad_norm": 1.7146851447701008,
"learning_rate": 2.6543970655072514e-06,
"loss": 0.6065,
"step": 10300
},
{
"epoch": 3.48292029470864,
"grad_norm": 2.043969779746548,
"learning_rate": 2.5518084291581163e-06,
"loss": 0.6009,
"step": 10400
},
{
"epoch": 3.5164099129269926,
"grad_norm": 1.6446301288443177,
"learning_rate": 2.450557991886039e-06,
"loss": 0.5847,
"step": 10500
},
{
"epoch": 3.549899531145345,
"grad_norm": 1.8987310858087982,
"learning_rate": 2.350701097906447e-06,
"loss": 0.578,
"step": 10600
},
{
"epoch": 3.5833891493636973,
"grad_norm": 1.5875486081882013,
"learning_rate": 2.252292329714012e-06,
"loss": 0.5838,
"step": 10700
},
{
"epoch": 3.6168787675820493,
"grad_norm": 1.5799154770812294,
"learning_rate": 2.155385478247455e-06,
"loss": 0.5653,
"step": 10800
},
{
"epoch": 3.6503683858004017,
"grad_norm": 1.9320449012232872,
"learning_rate": 2.0600335134870415e-06,
"loss": 0.5723,
"step": 10900
},
{
"epoch": 3.683858004018754,
"grad_norm": 1.9440461986884756,
"learning_rate": 1.9662885555008055e-06,
"loss": 0.5762,
"step": 11000
},
{
"epoch": 3.7173476222371065,
"grad_norm": 1.8693480723789162,
"learning_rate": 1.8742018459553551e-06,
"loss": 0.5715,
"step": 11100
},
{
"epoch": 3.750837240455459,
"grad_norm": 1.6174259181592696,
"learning_rate": 1.7838237201067976e-06,
"loss": 0.577,
"step": 11200
},
{
"epoch": 3.7843268586738112,
"grad_norm": 1.7622557353103667,
"learning_rate": 1.695203579287134e-06,
"loss": 0.5825,
"step": 11300
},
{
"epoch": 3.8178164768921636,
"grad_norm": 1.6601130738963383,
"learning_rate": 1.6083898639011402e-06,
"loss": 0.5871,
"step": 11400
},
{
"epoch": 3.8513060951105156,
"grad_norm": 2.0156698217638445,
"learning_rate": 1.5234300269484848e-06,
"loss": 0.5805,
"step": 11500
},
{
"epoch": 3.884795713328868,
"grad_norm": 1.6232268197279267,
"learning_rate": 1.440370508085589e-06,
"loss": 0.5859,
"step": 11600
},
{
"epoch": 3.9182853315472204,
"grad_norm": 1.9405869273694487,
"learning_rate": 1.3592567082413683e-06,
"loss": 0.5961,
"step": 11700
},
{
"epoch": 3.9517749497655728,
"grad_norm": 1.6205319658414614,
"learning_rate": 1.2801329648007648e-06,
"loss": 0.6098,
"step": 11800
},
{
"epoch": 3.985264567983925,
"grad_norm": 2.0422623489549028,
"learning_rate": 1.203042527369611e-06,
"loss": 0.5992,
"step": 11900
},
{
"epoch": 4.0,
"eval_loss": NaN,
"eval_runtime": 348.6255,
"eval_samples_per_second": 45.677,
"eval_steps_per_second": 1.428,
"step": 11944
},
{
"epoch": 4.018754186202277,
"grad_norm": 2.042627434325046,
"learning_rate": 1.1280275341340919e-06,
"loss": 0.5984,
"step": 12000
},
{
"epoch": 4.0522438044206295,
"grad_norm": 1.6714080325577199,
"learning_rate": 1.0551289888277e-06,
"loss": 0.6147,
"step": 12100
},
{
"epoch": 4.085733422638982,
"grad_norm": 1.73673619396585,
"learning_rate": 9.843867383183065e-07,
"loss": 0.5882,
"step": 12200
},
{
"epoch": 4.119223040857334,
"grad_norm": 1.7412386584226505,
"learning_rate": 9.158394508275764e-07,
"loss": 0.5785,
"step": 12300
},
{
"epoch": 4.152712659075687,
"grad_norm": 1.688382578729062,
"learning_rate": 8.495245947946428e-07,
"loss": 0.568,
"step": 12400
},
{
"epoch": 4.186202277294039,
"grad_norm": 1.7570920632165827,
"learning_rate": 7.85478418395586e-07,
"loss": 0.5548,
"step": 12500
},
{
"epoch": 4.219691895512391,
"grad_norm": 2.0216120473215775,
"learning_rate": 7.237359297299213e-07,
"loss": 0.5491,
"step": 12600
},
{
"epoch": 4.253181513730744,
"grad_norm": 1.7189176283861873,
"learning_rate": 6.643308776849211e-07,
"loss": 0.5344,
"step": 12700
},
{
"epoch": 4.286671131949095,
"grad_norm": 1.878389510492855,
"learning_rate": 6.07295733488234e-07,
"loss": 0.5205,
"step": 12800
},
{
"epoch": 4.320160750167448,
"grad_norm": 1.8272392601326726,
"learning_rate": 5.526616729588719e-07,
"loss": 0.5143,
"step": 12900
},
{
"epoch": 4.3536503683858,
"grad_norm": 1.9885081295428908,
"learning_rate": 5.00458559466292e-07,
"loss": 0.5027,
"step": 13000
},
{
"epoch": 4.3871399866041525,
"grad_norm": 1.8856403938879776,
"learning_rate": 4.507149276068562e-07,
"loss": 0.498,
"step": 13100
},
{
"epoch": 4.420629604822505,
"grad_norm": 1.9693198350457568,
"learning_rate": 4.0345796760662247e-07,
"loss": 0.4925,
"step": 13200
},
{
"epoch": 4.454119223040857,
"grad_norm": 2.1533445061135827,
"learning_rate": 3.587135104589706e-07,
"loss": 0.4893,
"step": 13300
},
{
"epoch": 4.48760884125921,
"grad_norm": 1.795431467461819,
"learning_rate": 3.16506013805194e-07,
"loss": 0.4844,
"step": 13400
},
{
"epoch": 4.521098459477562,
"grad_norm": 1.8492311436151545,
"learning_rate": 2.7685854856577934e-07,
"loss": 0.472,
"step": 13500
},
{
"epoch": 4.5545880776959144,
"grad_norm": 1.9596331843479984,
"learning_rate": 2.3979278632967507e-07,
"loss": 0.4774,
"step": 13600
},
{
"epoch": 4.588077695914267,
"grad_norm": 1.9830637420731552,
"learning_rate": 2.0532898750844633e-07,
"loss": 0.4786,
"step": 13700
},
{
"epoch": 4.621567314132619,
"grad_norm": 2.0258341442311423,
"learning_rate": 1.734859902617886e-07,
"loss": 0.4786,
"step": 13800
},
{
"epoch": 4.655056932350972,
"grad_norm": 1.9180928479733919,
"learning_rate": 1.4428120020045122e-07,
"loss": 0.4882,
"step": 13900
},
{
"epoch": 4.688546550569323,
"grad_norm": 1.897536871580035,
"learning_rate": 1.1773058087221068e-07,
"loss": 0.4816,
"step": 14000
},
{
"epoch": 4.7220361687876755,
"grad_norm": 1.9655089659832947,
"learning_rate": 9.384864503607871e-08,
"loss": 0.482,
"step": 14100
},
{
"epoch": 4.755525787006028,
"grad_norm": 2.1423206586220345,
"learning_rate": 7.264844672952299e-08,
"loss": 0.4989,
"step": 14200
},
{
"epoch": 4.78901540522438,
"grad_norm": 1.8698413000910197,
"learning_rate": 5.4141574133037555e-08,
"loss": 0.5135,
"step": 14300
},
{
"epoch": 4.822505023442733,
"grad_norm": 1.888414362344593,
"learning_rate": 3.8338143235959746e-08,
"loss": 0.5077,
"step": 14400
},
{
"epoch": 4.855994641661085,
"grad_norm": 1.9521761786674516,
"learning_rate": 2.5246792306999334e-08,
"loss": 0.5146,
"step": 14500
},
{
"epoch": 4.8894842598794375,
"grad_norm": 1.8440132403394436,
"learning_rate": 1.4874677172497243e-08,
"loss": 0.5236,
"step": 14600
},
{
"epoch": 4.92297387809779,
"grad_norm": 1.9744361393291854,
"learning_rate": 7.2274673050010124e-09,
"loss": 0.5404,
"step": 14700
},
{
"epoch": 4.956463496316142,
"grad_norm": 1.8076672661575879,
"learning_rate": 2.309342724287622e-09,
"loss": 0.5488,
"step": 14800
},
{
"epoch": 4.989953114534495,
"grad_norm": 1.9120835449610545,
"learning_rate": 1.229917125389335e-10,
"loss": 0.5494,
"step": 14900
},
{
"epoch": 5.0,
"eval_loss": NaN,
"eval_runtime": 348.4998,
"eval_samples_per_second": 45.693,
"eval_steps_per_second": 1.429,
"step": 14930
},
{
"epoch": 5.0,
"step": 14930,
"total_flos": 7403300223713280.0,
"train_loss": 0.9973710162960596,
"train_runtime": 63021.9977,
"train_samples_per_second": 11.37,
"train_steps_per_second": 0.237
}
],
"logging_steps": 100,
"max_steps": 14930,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7403300223713280.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}
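For reference, a minimal sketch of how this state file could be inspected offline, assuming it has been downloaded locally as trainer_state.json and that matplotlib is installed (both are assumptions, not something the repository provides). It parses log_history and plots the training-loss curve over global steps:

```python
# Minimal sketch: parse trainer_state.json and plot the training-loss curve.
# Assumes the file sits next to this script as "trainer_state.json" and that
# matplotlib is installed; neither is guaranteed by the repository itself.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training-loss entries carry a "loss" key; the per-epoch eval entries carry
# "eval_loss" instead, and the final summary entry carries "train_loss".
train_log = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in train_log]
losses = [entry["loss"] for entry in train_log]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title(f"Training loss (logged every {state['logging_steps']} steps)")
plt.show()
```

Only the training loss is plotted because every per-epoch eval entry in this file reports eval_loss as NaN.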