t5-ru-text-normalization-v1 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.13344644508202191,
"eval_steps": 1000,
"global_step": 14000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009531888934430136,
"grad_norm": 1.0701079368591309,
"learning_rate": 1.5e-06,
"loss": 0.2617,
"step": 100
},
{
"epoch": 0.0019063777868860272,
"grad_norm": 1.0461440086364746,
"learning_rate": 3e-06,
"loss": 0.2595,
"step": 200
},
{
"epoch": 0.002859566680329041,
"grad_norm": 1.0249755382537842,
"learning_rate": 4.5e-06,
"loss": 0.2595,
"step": 300
},
{
"epoch": 0.0038127555737720543,
"grad_norm": 0.9327605366706848,
"learning_rate": 6e-06,
"loss": 0.2563,
"step": 400
},
{
"epoch": 0.004765944467215068,
"grad_norm": 0.9439413547515869,
"learning_rate": 7.5e-06,
"loss": 0.2589,
"step": 500
},
{
"epoch": 0.005719133360658082,
"grad_norm": 0.8729381561279297,
"learning_rate": 9e-06,
"loss": 0.2617,
"step": 600
},
{
"epoch": 0.006672322254101095,
"grad_norm": 0.9562346935272217,
"learning_rate": 1.05e-05,
"loss": 0.259,
"step": 700
},
{
"epoch": 0.007625511147544109,
"grad_norm": 1.7502244710922241,
"learning_rate": 1.2e-05,
"loss": 0.2551,
"step": 800
},
{
"epoch": 0.008578700040987123,
"grad_norm": 0.8447253704071045,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.2555,
"step": 900
},
{
"epoch": 0.009531888934430136,
"grad_norm": 0.9096837043762207,
"learning_rate": 1.5e-05,
"loss": 0.2637,
"step": 1000
},
{
"epoch": 0.009531888934430136,
"eval_loss": 0.22202371060848236,
"eval_runtime": 24.6656,
"eval_samples_per_second": 608.134,
"eval_steps_per_second": 9.527,
"step": 1000
},
{
"epoch": 0.01048507782787315,
"grad_norm": 0.9705513715744019,
"learning_rate": 1.65e-05,
"loss": 0.2614,
"step": 1100
},
{
"epoch": 0.011438266721316164,
"grad_norm": 0.9748035669326782,
"learning_rate": 1.8e-05,
"loss": 0.2648,
"step": 1200
},
{
"epoch": 0.012391455614759177,
"grad_norm": 2.0027875900268555,
"learning_rate": 1.95e-05,
"loss": 0.2605,
"step": 1300
},
{
"epoch": 0.01334464450820219,
"grad_norm": 1.203764796257019,
"learning_rate": 2.1e-05,
"loss": 0.2645,
"step": 1400
},
{
"epoch": 0.014297833401645204,
"grad_norm": 1.2857439517974854,
"learning_rate": 2.25e-05,
"loss": 0.2622,
"step": 1500
},
{
"epoch": 0.015251022295088217,
"grad_norm": 0.969646692276001,
"learning_rate": 2.4e-05,
"loss": 0.2604,
"step": 1600
},
{
"epoch": 0.016204211188531232,
"grad_norm": 0.8485471606254578,
"learning_rate": 2.55e-05,
"loss": 0.2628,
"step": 1700
},
{
"epoch": 0.017157400081974247,
"grad_norm": 1.1885377168655396,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.2665,
"step": 1800
},
{
"epoch": 0.018110588975417258,
"grad_norm": 1.98976469039917,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.2723,
"step": 1900
},
{
"epoch": 0.019063777868860272,
"grad_norm": 1.0017362833023071,
"learning_rate": 3e-05,
"loss": 0.2645,
"step": 2000
},
{
"epoch": 0.019063777868860272,
"eval_loss": 0.2264958769083023,
"eval_runtime": 24.3618,
"eval_samples_per_second": 615.718,
"eval_steps_per_second": 9.646,
"step": 2000
},
{
"epoch": 0.020016966762303287,
"grad_norm": 1.3095935583114624,
"learning_rate": 2.9970848597331675e-05,
"loss": 0.2735,
"step": 2100
},
{
"epoch": 0.0209701556557463,
"grad_norm": 1.0084208250045776,
"learning_rate": 2.9941697194663354e-05,
"loss": 0.2697,
"step": 2200
},
{
"epoch": 0.021923344549189313,
"grad_norm": 0.9595718383789062,
"learning_rate": 2.9912545791995025e-05,
"loss": 0.2706,
"step": 2300
},
{
"epoch": 0.022876533442632328,
"grad_norm": 1.156947374343872,
"learning_rate": 2.98833943893267e-05,
"loss": 0.2664,
"step": 2400
},
{
"epoch": 0.02382972233607534,
"grad_norm": 0.9906996488571167,
"learning_rate": 2.9854242986658374e-05,
"loss": 0.267,
"step": 2500
},
{
"epoch": 0.024782911229518353,
"grad_norm": 1.133239507675171,
"learning_rate": 2.9825091583990053e-05,
"loss": 0.2697,
"step": 2600
},
{
"epoch": 0.025736100122961368,
"grad_norm": 1.1839542388916016,
"learning_rate": 2.9795940181321727e-05,
"loss": 0.2644,
"step": 2700
},
{
"epoch": 0.02668928901640438,
"grad_norm": 1.1177607774734497,
"learning_rate": 2.97667887786534e-05,
"loss": 0.2649,
"step": 2800
},
{
"epoch": 0.027642477909847394,
"grad_norm": 1.0634980201721191,
"learning_rate": 2.9737637375985073e-05,
"loss": 0.273,
"step": 2900
},
{
"epoch": 0.02859566680329041,
"grad_norm": 1.141790747642517,
"learning_rate": 2.970848597331675e-05,
"loss": 0.2717,
"step": 3000
},
{
"epoch": 0.02859566680329041,
"eval_loss": 0.23009072244167328,
"eval_runtime": 25.5443,
"eval_samples_per_second": 587.214,
"eval_steps_per_second": 9.2,
"step": 3000
},
{
"epoch": 0.029548855696733423,
"grad_norm": 0.8992202281951904,
"learning_rate": 2.9679334570648426e-05,
"loss": 0.272,
"step": 3100
},
{
"epoch": 0.030502044590176434,
"grad_norm": 1.1783612966537476,
"learning_rate": 2.96501831679801e-05,
"loss": 0.2705,
"step": 3200
},
{
"epoch": 0.03145523348361945,
"grad_norm": 1.516988754272461,
"learning_rate": 2.9621031765311772e-05,
"loss": 0.2696,
"step": 3300
},
{
"epoch": 0.032408422377062464,
"grad_norm": 0.9750285148620605,
"learning_rate": 2.959188036264345e-05,
"loss": 0.2661,
"step": 3400
},
{
"epoch": 0.03336161127050548,
"grad_norm": 1.0874147415161133,
"learning_rate": 2.9562728959975125e-05,
"loss": 0.2713,
"step": 3500
},
{
"epoch": 0.03431480016394849,
"grad_norm": 1.2503632307052612,
"learning_rate": 2.95335775573068e-05,
"loss": 0.2694,
"step": 3600
},
{
"epoch": 0.0352679890573915,
"grad_norm": 2.1983683109283447,
"learning_rate": 2.9504426154638478e-05,
"loss": 0.2715,
"step": 3700
},
{
"epoch": 0.036221177950834516,
"grad_norm": 1.0884830951690674,
"learning_rate": 2.947527475197015e-05,
"loss": 0.2671,
"step": 3800
},
{
"epoch": 0.03717436684427753,
"grad_norm": 0.9805251955986023,
"learning_rate": 2.9446123349301824e-05,
"loss": 0.2705,
"step": 3900
},
{
"epoch": 0.038127555737720545,
"grad_norm": 1.0471646785736084,
"learning_rate": 2.94169719466335e-05,
"loss": 0.2657,
"step": 4000
},
{
"epoch": 0.038127555737720545,
"eval_loss": 0.22619383037090302,
"eval_runtime": 24.3251,
"eval_samples_per_second": 616.647,
"eval_steps_per_second": 9.661,
"step": 4000
},
{
"epoch": 0.03908074463116356,
"grad_norm": 1.080304503440857,
"learning_rate": 2.9387820543965177e-05,
"loss": 0.2755,
"step": 4100
},
{
"epoch": 0.040033933524606574,
"grad_norm": 1.2072677612304688,
"learning_rate": 2.935866914129685e-05,
"loss": 0.2666,
"step": 4200
},
{
"epoch": 0.04098712241804958,
"grad_norm": 1.1678977012634277,
"learning_rate": 2.9329517738628523e-05,
"loss": 0.2708,
"step": 4300
},
{
"epoch": 0.0419403113114926,
"grad_norm": 0.9155502319335938,
"learning_rate": 2.9300366335960198e-05,
"loss": 0.2701,
"step": 4400
},
{
"epoch": 0.04289350020493561,
"grad_norm": 1.022687315940857,
"learning_rate": 2.9271214933291876e-05,
"loss": 0.276,
"step": 4500
},
{
"epoch": 0.043846689098378626,
"grad_norm": 1.0507577657699585,
"learning_rate": 2.924206353062355e-05,
"loss": 0.2695,
"step": 4600
},
{
"epoch": 0.04479987799182164,
"grad_norm": 0.9346485137939453,
"learning_rate": 2.9212912127955225e-05,
"loss": 0.2715,
"step": 4700
},
{
"epoch": 0.045753066885264655,
"grad_norm": 1.0042835474014282,
"learning_rate": 2.9183760725286897e-05,
"loss": 0.2671,
"step": 4800
},
{
"epoch": 0.04670625577870767,
"grad_norm": 1.106454610824585,
"learning_rate": 2.9154609322618575e-05,
"loss": 0.2666,
"step": 4900
},
{
"epoch": 0.04765944467215068,
"grad_norm": 0.911589503288269,
"learning_rate": 2.912545791995025e-05,
"loss": 0.264,
"step": 5000
},
{
"epoch": 0.04765944467215068,
"eval_loss": 0.22571362555027008,
"eval_runtime": 24.0986,
"eval_samples_per_second": 622.442,
"eval_steps_per_second": 9.752,
"step": 5000
},
{
"epoch": 0.04861263356559369,
"grad_norm": 0.8723756670951843,
"learning_rate": 2.9096306517281924e-05,
"loss": 0.264,
"step": 5100
},
{
"epoch": 0.04956582245903671,
"grad_norm": 1.034590482711792,
"learning_rate": 2.90671551146136e-05,
"loss": 0.2767,
"step": 5200
},
{
"epoch": 0.05051901135247972,
"grad_norm": 1.0665106773376465,
"learning_rate": 2.9038003711945274e-05,
"loss": 0.2676,
"step": 5300
},
{
"epoch": 0.051472200245922736,
"grad_norm": 0.9242556095123291,
"learning_rate": 2.900885230927695e-05,
"loss": 0.2699,
"step": 5400
},
{
"epoch": 0.05242538913936575,
"grad_norm": 1.1992926597595215,
"learning_rate": 2.8979700906608623e-05,
"loss": 0.2682,
"step": 5500
},
{
"epoch": 0.05337857803280876,
"grad_norm": 0.9543828964233398,
"learning_rate": 2.89505495039403e-05,
"loss": 0.2713,
"step": 5600
},
{
"epoch": 0.05433176692625177,
"grad_norm": 0.9702574014663696,
"learning_rate": 2.8921398101271973e-05,
"loss": 0.2663,
"step": 5700
},
{
"epoch": 0.05528495581969479,
"grad_norm": 0.9306678175926208,
"learning_rate": 2.8892246698603647e-05,
"loss": 0.2712,
"step": 5800
},
{
"epoch": 0.0562381447131378,
"grad_norm": 1.2940869331359863,
"learning_rate": 2.8863095295935322e-05,
"loss": 0.2732,
"step": 5900
},
{
"epoch": 0.05719133360658082,
"grad_norm": 0.8944372534751892,
"learning_rate": 2.8833943893267e-05,
"loss": 0.2675,
"step": 6000
},
{
"epoch": 0.05719133360658082,
"eval_loss": 0.22631041705608368,
"eval_runtime": 24.2322,
"eval_samples_per_second": 619.011,
"eval_steps_per_second": 9.698,
"step": 6000
},
{
"epoch": 0.05814452250002383,
"grad_norm": 1.1152732372283936,
"learning_rate": 2.8804792490598675e-05,
"loss": 0.2632,
"step": 6100
},
{
"epoch": 0.05909771139346685,
"grad_norm": 0.90058833360672,
"learning_rate": 2.8775641087930346e-05,
"loss": 0.2677,
"step": 6200
},
{
"epoch": 0.060050900286909854,
"grad_norm": 0.9290627241134644,
"learning_rate": 2.874648968526202e-05,
"loss": 0.2667,
"step": 6300
},
{
"epoch": 0.06100408918035287,
"grad_norm": 1.0167937278747559,
"learning_rate": 2.87173382825937e-05,
"loss": 0.2658,
"step": 6400
},
{
"epoch": 0.061957278073795884,
"grad_norm": 1.0440782308578491,
"learning_rate": 2.8688186879925374e-05,
"loss": 0.2672,
"step": 6500
},
{
"epoch": 0.0629104669672389,
"grad_norm": 1.0155839920043945,
"learning_rate": 2.865903547725705e-05,
"loss": 0.2657,
"step": 6600
},
{
"epoch": 0.0638636558606819,
"grad_norm": 0.879859209060669,
"learning_rate": 2.862988407458872e-05,
"loss": 0.2674,
"step": 6700
},
{
"epoch": 0.06481684475412493,
"grad_norm": 0.9081212878227234,
"learning_rate": 2.8600732671920398e-05,
"loss": 0.2644,
"step": 6800
},
{
"epoch": 0.06577003364756794,
"grad_norm": 1.1635853052139282,
"learning_rate": 2.8571581269252073e-05,
"loss": 0.2609,
"step": 6900
},
{
"epoch": 0.06672322254101096,
"grad_norm": 1.0756968259811401,
"learning_rate": 2.8542429866583747e-05,
"loss": 0.2682,
"step": 7000
},
{
"epoch": 0.06672322254101096,
"eval_loss": 0.22241491079330444,
"eval_runtime": 25.327,
"eval_samples_per_second": 592.253,
"eval_steps_per_second": 9.279,
"step": 7000
},
{
"epoch": 0.06767641143445396,
"grad_norm": 1.0364997386932373,
"learning_rate": 2.8513278463915425e-05,
"loss": 0.2651,
"step": 7100
},
{
"epoch": 0.06862960032789699,
"grad_norm": 1.0817292928695679,
"learning_rate": 2.8484127061247097e-05,
"loss": 0.2634,
"step": 7200
},
{
"epoch": 0.06958278922134,
"grad_norm": 1.052465796470642,
"learning_rate": 2.845497565857877e-05,
"loss": 0.2672,
"step": 7300
},
{
"epoch": 0.070535978114783,
"grad_norm": 0.8442723155021667,
"learning_rate": 2.8425824255910446e-05,
"loss": 0.2709,
"step": 7400
},
{
"epoch": 0.07148916700822602,
"grad_norm": 1.104926347732544,
"learning_rate": 2.8396672853242124e-05,
"loss": 0.2617,
"step": 7500
},
{
"epoch": 0.07244235590166903,
"grad_norm": 1.0135023593902588,
"learning_rate": 2.83675214505738e-05,
"loss": 0.2625,
"step": 7600
},
{
"epoch": 0.07339554479511205,
"grad_norm": 0.9307543039321899,
"learning_rate": 2.833837004790547e-05,
"loss": 0.2671,
"step": 7700
},
{
"epoch": 0.07434873368855506,
"grad_norm": 1.5013054609298706,
"learning_rate": 2.8309218645237145e-05,
"loss": 0.2656,
"step": 7800
},
{
"epoch": 0.07530192258199807,
"grad_norm": 0.923324465751648,
"learning_rate": 2.8280067242568823e-05,
"loss": 0.2607,
"step": 7900
},
{
"epoch": 0.07625511147544109,
"grad_norm": 1.065769076347351,
"learning_rate": 2.8250915839900498e-05,
"loss": 0.2641,
"step": 8000
},
{
"epoch": 0.07625511147544109,
"eval_loss": 0.22064544260501862,
"eval_runtime": 25.6245,
"eval_samples_per_second": 585.378,
"eval_steps_per_second": 9.171,
"step": 8000
},
{
"epoch": 0.0772083003688841,
"grad_norm": 1.053281545639038,
"learning_rate": 2.8221764437232173e-05,
"loss": 0.2633,
"step": 8100
},
{
"epoch": 0.07816148926232712,
"grad_norm": 1.0560704469680786,
"learning_rate": 2.8192613034563844e-05,
"loss": 0.2602,
"step": 8200
},
{
"epoch": 0.07911467815577013,
"grad_norm": 1.0632127523422241,
"learning_rate": 2.8163461631895522e-05,
"loss": 0.2647,
"step": 8300
},
{
"epoch": 0.08006786704921315,
"grad_norm": 1.0002626180648804,
"learning_rate": 2.8134310229227197e-05,
"loss": 0.2654,
"step": 8400
},
{
"epoch": 0.08102105594265616,
"grad_norm": 1.1899933815002441,
"learning_rate": 2.8105158826558872e-05,
"loss": 0.2631,
"step": 8500
},
{
"epoch": 0.08197424483609916,
"grad_norm": 0.9177943468093872,
"learning_rate": 2.807600742389055e-05,
"loss": 0.264,
"step": 8600
},
{
"epoch": 0.08292743372954219,
"grad_norm": 1.0969672203063965,
"learning_rate": 2.804685602122222e-05,
"loss": 0.2663,
"step": 8700
},
{
"epoch": 0.0838806226229852,
"grad_norm": 0.9465392231941223,
"learning_rate": 2.8017704618553896e-05,
"loss": 0.2599,
"step": 8800
},
{
"epoch": 0.08483381151642821,
"grad_norm": 1.1491124629974365,
"learning_rate": 2.798855321588557e-05,
"loss": 0.2616,
"step": 8900
},
{
"epoch": 0.08578700040987122,
"grad_norm": 1.040123701095581,
"learning_rate": 2.795940181321725e-05,
"loss": 0.2611,
"step": 9000
},
{
"epoch": 0.08578700040987122,
"eval_loss": 0.22252394258975983,
"eval_runtime": 24.4254,
"eval_samples_per_second": 614.114,
"eval_steps_per_second": 9.621,
"step": 9000
},
{
"epoch": 0.08674018930331424,
"grad_norm": 0.8041715621948242,
"learning_rate": 2.7930250410548923e-05,
"loss": 0.2597,
"step": 9100
},
{
"epoch": 0.08769337819675725,
"grad_norm": 1.2013587951660156,
"learning_rate": 2.7901099007880595e-05,
"loss": 0.2627,
"step": 9200
},
{
"epoch": 0.08864656709020026,
"grad_norm": 0.8449276089668274,
"learning_rate": 2.787194760521227e-05,
"loss": 0.2694,
"step": 9300
},
{
"epoch": 0.08959975598364328,
"grad_norm": 0.957938015460968,
"learning_rate": 2.7842796202543948e-05,
"loss": 0.2646,
"step": 9400
},
{
"epoch": 0.09055294487708629,
"grad_norm": 0.9442753195762634,
"learning_rate": 2.7813644799875622e-05,
"loss": 0.2618,
"step": 9500
},
{
"epoch": 0.09150613377052931,
"grad_norm": 1.0630254745483398,
"learning_rate": 2.7784493397207297e-05,
"loss": 0.267,
"step": 9600
},
{
"epoch": 0.09245932266397232,
"grad_norm": 0.9763880372047424,
"learning_rate": 2.775534199453897e-05,
"loss": 0.2631,
"step": 9700
},
{
"epoch": 0.09341251155741534,
"grad_norm": 1.059673547744751,
"learning_rate": 2.7726190591870647e-05,
"loss": 0.264,
"step": 9800
},
{
"epoch": 0.09436570045085835,
"grad_norm": 1.0772706270217896,
"learning_rate": 2.769703918920232e-05,
"loss": 0.26,
"step": 9900
},
{
"epoch": 0.09531888934430136,
"grad_norm": 0.9500916600227356,
"learning_rate": 2.7667887786533996e-05,
"loss": 0.2603,
"step": 10000
},
{
"epoch": 0.09531888934430136,
"eval_loss": 0.22107724845409393,
"eval_runtime": 24.1253,
"eval_samples_per_second": 621.753,
"eval_steps_per_second": 9.741,
"step": 10000
},
{
"epoch": 0.09627207823774438,
"grad_norm": 0.7942706346511841,
"learning_rate": 2.7639027897892354e-05,
"loss": 0.258,
"step": 10100
},
{
"epoch": 0.09722526713118738,
"grad_norm": 1.1196712255477905,
"learning_rate": 2.7610168009250712e-05,
"loss": 0.2594,
"step": 10200
},
{
"epoch": 0.0981784560246304,
"grad_norm": 0.9647284746170044,
"learning_rate": 2.7581016606582387e-05,
"loss": 0.2645,
"step": 10300
},
{
"epoch": 0.09913164491807341,
"grad_norm": 1.0983389616012573,
"learning_rate": 2.7551865203914065e-05,
"loss": 0.2589,
"step": 10400
},
{
"epoch": 0.10008483381151642,
"grad_norm": 0.8184943795204163,
"learning_rate": 2.7522713801245736e-05,
"loss": 0.2604,
"step": 10500
},
{
"epoch": 0.10103802270495944,
"grad_norm": 1.0684343576431274,
"learning_rate": 2.749356239857741e-05,
"loss": 0.2602,
"step": 10600
},
{
"epoch": 0.10199121159840245,
"grad_norm": 0.9852308034896851,
"learning_rate": 2.7464410995909086e-05,
"loss": 0.2688,
"step": 10700
},
{
"epoch": 0.10294440049184547,
"grad_norm": 0.8270373940467834,
"learning_rate": 2.7435259593240764e-05,
"loss": 0.2601,
"step": 10800
},
{
"epoch": 0.10389758938528848,
"grad_norm": 0.9181864857673645,
"learning_rate": 2.740610819057244e-05,
"loss": 0.259,
"step": 10900
},
{
"epoch": 0.1048507782787315,
"grad_norm": 0.8947911858558655,
"learning_rate": 2.737695678790411e-05,
"loss": 0.2616,
"step": 11000
},
{
"epoch": 0.1048507782787315,
"eval_loss": 0.22300027310848236,
"eval_runtime": 26.4519,
"eval_samples_per_second": 567.068,
"eval_steps_per_second": 8.884,
"step": 11000
},
{
"epoch": 0.10580396717217451,
"grad_norm": 1.19639253616333,
"learning_rate": 2.7347805385235785e-05,
"loss": 0.2624,
"step": 11100
},
{
"epoch": 0.10675715606561752,
"grad_norm": 1.3614460229873657,
"learning_rate": 2.7318653982567463e-05,
"loss": 0.2578,
"step": 11200
},
{
"epoch": 0.10771034495906054,
"grad_norm": 0.8842675089836121,
"learning_rate": 2.7289502579899138e-05,
"loss": 0.259,
"step": 11300
},
{
"epoch": 0.10866353385250355,
"grad_norm": 1.1543840169906616,
"learning_rate": 2.7260351177230812e-05,
"loss": 0.2594,
"step": 11400
},
{
"epoch": 0.10961672274594657,
"grad_norm": 1.1461540460586548,
"learning_rate": 2.7231199774562484e-05,
"loss": 0.2576,
"step": 11500
},
{
"epoch": 0.11056991163938958,
"grad_norm": 0.9683176279067993,
"learning_rate": 2.7202048371894162e-05,
"loss": 0.2597,
"step": 11600
},
{
"epoch": 0.1115231005328326,
"grad_norm": 1.1039471626281738,
"learning_rate": 2.7172896969225837e-05,
"loss": 0.2586,
"step": 11700
},
{
"epoch": 0.1124762894262756,
"grad_norm": 0.9412834644317627,
"learning_rate": 2.714374556655751e-05,
"loss": 0.2573,
"step": 11800
},
{
"epoch": 0.11342947831971861,
"grad_norm": 1.1193273067474365,
"learning_rate": 2.711459416388919e-05,
"loss": 0.2564,
"step": 11900
},
{
"epoch": 0.11438266721316163,
"grad_norm": 0.9070214033126831,
"learning_rate": 2.708544276122086e-05,
"loss": 0.2598,
"step": 12000
},
{
"epoch": 0.11438266721316163,
"eval_loss": 0.21802841126918793,
"eval_runtime": 24.3781,
"eval_samples_per_second": 615.305,
"eval_steps_per_second": 9.64,
"step": 12000
},
{
"epoch": 0.11533585610660464,
"grad_norm": 0.9957073330879211,
"learning_rate": 2.7056291358552536e-05,
"loss": 0.2582,
"step": 12100
},
{
"epoch": 0.11628904500004766,
"grad_norm": 0.9560794234275818,
"learning_rate": 2.702713995588421e-05,
"loss": 0.2624,
"step": 12200
},
{
"epoch": 0.11724223389349067,
"grad_norm": 1.0625020265579224,
"learning_rate": 2.699798855321589e-05,
"loss": 0.2606,
"step": 12300
},
{
"epoch": 0.1181954227869337,
"grad_norm": 1.2022795677185059,
"learning_rate": 2.6968837150547563e-05,
"loss": 0.2577,
"step": 12400
},
{
"epoch": 0.1191486116803767,
"grad_norm": 1.005925178527832,
"learning_rate": 2.6939685747879234e-05,
"loss": 0.2609,
"step": 12500
},
{
"epoch": 0.12010180057381971,
"grad_norm": 1.0519824028015137,
"learning_rate": 2.691053434521091e-05,
"loss": 0.2653,
"step": 12600
},
{
"epoch": 0.12105498946726273,
"grad_norm": 1.0782413482666016,
"learning_rate": 2.6881382942542587e-05,
"loss": 0.2537,
"step": 12700
},
{
"epoch": 0.12200817836070574,
"grad_norm": 0.9406309723854065,
"learning_rate": 2.6852231539874262e-05,
"loss": 0.262,
"step": 12800
},
{
"epoch": 0.12296136725414876,
"grad_norm": 0.922545850276947,
"learning_rate": 2.682337165123262e-05,
"loss": 0.2581,
"step": 12900
},
{
"epoch": 0.12391455614759177,
"grad_norm": 0.8488488793373108,
"learning_rate": 2.6794220248564295e-05,
"loss": 0.2611,
"step": 13000
},
{
"epoch": 0.12391455614759177,
"eval_loss": 0.22087305784225464,
"eval_runtime": 23.9914,
"eval_samples_per_second": 625.224,
"eval_steps_per_second": 9.795,
"step": 13000
},
{
"epoch": 0.12486774504103478,
"grad_norm": 0.9024129509925842,
"learning_rate": 2.6765360359922653e-05,
"loss": 0.2604,
"step": 13100
},
{
"epoch": 0.1258209339344778,
"grad_norm": 0.9496759176254272,
"learning_rate": 2.6736208957254328e-05,
"loss": 0.2552,
"step": 13200
},
{
"epoch": 0.1267741228279208,
"grad_norm": 1.0905983448028564,
"learning_rate": 2.6707057554586002e-05,
"loss": 0.2538,
"step": 13300
},
{
"epoch": 0.1277273117213638,
"grad_norm": 1.1556366682052612,
"learning_rate": 2.6677906151917677e-05,
"loss": 0.2585,
"step": 13400
},
{
"epoch": 0.12868050061480685,
"grad_norm": 1.0274028778076172,
"learning_rate": 2.6648754749249352e-05,
"loss": 0.2546,
"step": 13500
},
{
"epoch": 0.12963368950824986,
"grad_norm": 0.9366750717163086,
"learning_rate": 2.6619603346581027e-05,
"loss": 0.2562,
"step": 13600
},
{
"epoch": 0.13058687840169286,
"grad_norm": 0.9076129198074341,
"learning_rate": 2.65904519439127e-05,
"loss": 0.2567,
"step": 13700
},
{
"epoch": 0.13154006729513587,
"grad_norm": 0.9610471725463867,
"learning_rate": 2.6561300541244376e-05,
"loss": 0.2528,
"step": 13800
},
{
"epoch": 0.13249325618857888,
"grad_norm": 1.2852675914764404,
"learning_rate": 2.653214913857605e-05,
"loss": 0.2511,
"step": 13900
},
{
"epoch": 0.13344644508202191,
"grad_norm": 0.8626914024353027,
"learning_rate": 2.6502997735907726e-05,
"loss": 0.2507,
"step": 14000
},
{
"epoch": 0.13344644508202191,
"eval_loss": 0.21873866021633148,
"eval_runtime": 23.5023,
"eval_samples_per_second": 638.235,
"eval_steps_per_second": 9.999,
"step": 14000
}
],
"logging_steps": 100,
"max_steps": 104911,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.96629250834432e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}
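
Note: the "log_history" above interleaves training entries (every "logging_steps" = 100 steps) with evaluation entries (every "eval_steps" = 1000 steps). A minimal sketch, assuming the file is saved locally as trainer_state.json (filename taken from the header above, not from any script in the repo), that splits the two and prints the eval-loss trajectory:

    # Separate periodic training logs from eval entries in log_history
    # and summarize training progress. Pure standard library.
    import json

    with open("trainer_state.json", encoding="utf-8") as f:
        state = json.load(f)

    train_logs = [e for e in state["log_history"] if "loss" in e]
    eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

    print(f"logged train points: {len(train_logs)}, eval points: {len(eval_logs)}")
    print(f"progress: step {state['global_step']} / {state['max_steps']} "
          f"(epoch {state['epoch']:.4f})")

    # Eval loss over time (step, loss, eval throughput)
    for e in eval_logs:
        print(f"step {e['step']:>6}  eval_loss {e['eval_loss']:.4f}  "
              f"({e['eval_samples_per_second']:.0f} samples/s)")

Run against this file, the loop would show eval_loss moving from roughly 0.222 at step 1000 to 0.219 at step 14000, matching the entries logged above.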