atsuki-yamaguchi's picture
Upload folder using huggingface_hub
48eff55 verified
raw
history blame
173 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7724991614826253,
"eval_steps": 500,
"global_step": 30517,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007847256940708912,
"grad_norm": 26.94572639465332,
"learning_rate": 1.0157273918741808e-06,
"loss": 8.5879,
"step": 31
},
{
"epoch": 0.0015694513881417823,
"grad_norm": 14.633014678955078,
"learning_rate": 2.0314547837483616e-06,
"loss": 7.5048,
"step": 62
},
{
"epoch": 0.002354177082212673,
"grad_norm": 15.984803199768066,
"learning_rate": 3.0471821756225426e-06,
"loss": 6.1391,
"step": 93
},
{
"epoch": 0.0031389027762835646,
"grad_norm": 11.297175407409668,
"learning_rate": 4.062909567496723e-06,
"loss": 4.9299,
"step": 124
},
{
"epoch": 0.003923628470354455,
"grad_norm": 14.864474296569824,
"learning_rate": 5.078636959370905e-06,
"loss": 4.3205,
"step": 155
},
{
"epoch": 0.004708354164425346,
"grad_norm": 11.237608909606934,
"learning_rate": 6.094364351245085e-06,
"loss": 4.0,
"step": 186
},
{
"epoch": 0.005493079858496238,
"grad_norm": 23.79303550720215,
"learning_rate": 7.110091743119267e-06,
"loss": 3.7952,
"step": 217
},
{
"epoch": 0.006277805552567129,
"grad_norm": 15.1551513671875,
"learning_rate": 8.125819134993446e-06,
"loss": 3.689,
"step": 248
},
{
"epoch": 0.00706253124663802,
"grad_norm": 14.605571746826172,
"learning_rate": 9.141546526867629e-06,
"loss": 3.5147,
"step": 279
},
{
"epoch": 0.00784725694070891,
"grad_norm": 16.463390350341797,
"learning_rate": 1.015727391874181e-05,
"loss": 3.3901,
"step": 310
},
{
"epoch": 0.008631982634779801,
"grad_norm": 13.09945011138916,
"learning_rate": 1.117300131061599e-05,
"loss": 3.317,
"step": 341
},
{
"epoch": 0.009416708328850693,
"grad_norm": 11.993067741394043,
"learning_rate": 1.218872870249017e-05,
"loss": 3.2508,
"step": 372
},
{
"epoch": 0.010201434022921584,
"grad_norm": 10.388030052185059,
"learning_rate": 1.3204456094364351e-05,
"loss": 3.1239,
"step": 403
},
{
"epoch": 0.010986159716992476,
"grad_norm": 11.977804183959961,
"learning_rate": 1.4220183486238533e-05,
"loss": 3.0739,
"step": 434
},
{
"epoch": 0.011770885411063367,
"grad_norm": 8.925983428955078,
"learning_rate": 1.5235910878112714e-05,
"loss": 3.0169,
"step": 465
},
{
"epoch": 0.012555611105134258,
"grad_norm": 9.57411003112793,
"learning_rate": 1.6251638269986893e-05,
"loss": 2.959,
"step": 496
},
{
"epoch": 0.01334033679920515,
"grad_norm": 7.380288124084473,
"learning_rate": 1.7267365661861077e-05,
"loss": 2.8921,
"step": 527
},
{
"epoch": 0.01412506249327604,
"grad_norm": 8.812368392944336,
"learning_rate": 1.8283093053735257e-05,
"loss": 2.843,
"step": 558
},
{
"epoch": 0.014909788187346932,
"grad_norm": 8.870095252990723,
"learning_rate": 1.9298820445609438e-05,
"loss": 2.7895,
"step": 589
},
{
"epoch": 0.01569451388141782,
"grad_norm": 9.503872871398926,
"learning_rate": 2.031454783748362e-05,
"loss": 2.7757,
"step": 620
},
{
"epoch": 0.016479239575488712,
"grad_norm": 6.582827568054199,
"learning_rate": 2.13302752293578e-05,
"loss": 2.7099,
"step": 651
},
{
"epoch": 0.017263965269559603,
"grad_norm": 6.266632556915283,
"learning_rate": 2.234600262123198e-05,
"loss": 2.6729,
"step": 682
},
{
"epoch": 0.018048690963630494,
"grad_norm": 6.645415306091309,
"learning_rate": 2.336173001310616e-05,
"loss": 2.6616,
"step": 713
},
{
"epoch": 0.018833416657701385,
"grad_norm": 7.8323073387146,
"learning_rate": 2.437745740498034e-05,
"loss": 2.6291,
"step": 744
},
{
"epoch": 0.019618142351772276,
"grad_norm": 5.577521324157715,
"learning_rate": 2.5393184796854525e-05,
"loss": 2.6072,
"step": 775
},
{
"epoch": 0.020402868045843167,
"grad_norm": 5.603636264801025,
"learning_rate": 2.6408912188728702e-05,
"loss": 2.5787,
"step": 806
},
{
"epoch": 0.021187593739914058,
"grad_norm": 6.945438385009766,
"learning_rate": 2.7424639580602886e-05,
"loss": 2.5198,
"step": 837
},
{
"epoch": 0.021972319433984953,
"grad_norm": 5.6279826164245605,
"learning_rate": 2.8440366972477066e-05,
"loss": 2.5417,
"step": 868
},
{
"epoch": 0.022757045128055844,
"grad_norm": 5.517001628875732,
"learning_rate": 2.9456094364351244e-05,
"loss": 2.4849,
"step": 899
},
{
"epoch": 0.023541770822126735,
"grad_norm": 5.865486145019531,
"learning_rate": 3.0471821756225428e-05,
"loss": 2.5103,
"step": 930
},
{
"epoch": 0.024326496516197626,
"grad_norm": 4.949043273925781,
"learning_rate": 3.148754914809961e-05,
"loss": 2.4581,
"step": 961
},
{
"epoch": 0.025111222210268517,
"grad_norm": 4.701717853546143,
"learning_rate": 3.2503276539973785e-05,
"loss": 2.4315,
"step": 992
},
{
"epoch": 0.025895947904339408,
"grad_norm": 4.533145904541016,
"learning_rate": 3.351900393184797e-05,
"loss": 2.4056,
"step": 1023
},
{
"epoch": 0.0266806735984103,
"grad_norm": 4.724672794342041,
"learning_rate": 3.453473132372215e-05,
"loss": 2.3994,
"step": 1054
},
{
"epoch": 0.02746539929248119,
"grad_norm": 4.745669364929199,
"learning_rate": 3.555045871559633e-05,
"loss": 2.3546,
"step": 1085
},
{
"epoch": 0.02825012498655208,
"grad_norm": 4.4554948806762695,
"learning_rate": 3.6566186107470514e-05,
"loss": 2.3642,
"step": 1116
},
{
"epoch": 0.029034850680622972,
"grad_norm": 4.4792304039001465,
"learning_rate": 3.7581913499344695e-05,
"loss": 2.3296,
"step": 1147
},
{
"epoch": 0.029819576374693863,
"grad_norm": 3.9329679012298584,
"learning_rate": 3.8597640891218876e-05,
"loss": 2.3105,
"step": 1178
},
{
"epoch": 0.030604302068764754,
"grad_norm": 4.338287830352783,
"learning_rate": 3.9613368283093056e-05,
"loss": 2.2811,
"step": 1209
},
{
"epoch": 0.03138902776283564,
"grad_norm": 4.130499839782715,
"learning_rate": 4.062909567496724e-05,
"loss": 2.2898,
"step": 1240
},
{
"epoch": 0.03217375345690653,
"grad_norm": 3.5664470195770264,
"learning_rate": 4.164482306684142e-05,
"loss": 2.2786,
"step": 1271
},
{
"epoch": 0.032958479150977424,
"grad_norm": 3.642627716064453,
"learning_rate": 4.26605504587156e-05,
"loss": 2.2439,
"step": 1302
},
{
"epoch": 0.033743204845048315,
"grad_norm": 3.7562780380249023,
"learning_rate": 4.367627785058978e-05,
"loss": 2.2441,
"step": 1333
},
{
"epoch": 0.034527930539119206,
"grad_norm": 3.3117406368255615,
"learning_rate": 4.469200524246396e-05,
"loss": 2.2604,
"step": 1364
},
{
"epoch": 0.0353126562331901,
"grad_norm": 3.4313724040985107,
"learning_rate": 4.570773263433814e-05,
"loss": 2.2069,
"step": 1395
},
{
"epoch": 0.03609738192726099,
"grad_norm": 3.4720091819763184,
"learning_rate": 4.672346002621232e-05,
"loss": 2.2087,
"step": 1426
},
{
"epoch": 0.03688210762133188,
"grad_norm": 3.491856575012207,
"learning_rate": 4.77391874180865e-05,
"loss": 2.1808,
"step": 1457
},
{
"epoch": 0.03766683331540277,
"grad_norm": 3.3730666637420654,
"learning_rate": 4.875491480996068e-05,
"loss": 2.1907,
"step": 1488
},
{
"epoch": 0.03845155900947366,
"grad_norm": 2.894322395324707,
"learning_rate": 4.977064220183487e-05,
"loss": 2.1689,
"step": 1519
},
{
"epoch": 0.03923628470354455,
"grad_norm": 3.195884943008423,
"learning_rate": 4.9999915451558777e-05,
"loss": 2.194,
"step": 1550
},
{
"epoch": 0.04002101039761544,
"grad_norm": 3.154061794281006,
"learning_rate": 4.999955597496219e-05,
"loss": 2.1409,
"step": 1581
},
{
"epoch": 0.040805736091686334,
"grad_norm": 2.8204188346862793,
"learning_rate": 4.9998914381774255e-05,
"loss": 2.145,
"step": 1612
},
{
"epoch": 0.041590461785757225,
"grad_norm": 2.98260760307312,
"learning_rate": 4.999799067923527e-05,
"loss": 2.1523,
"step": 1643
},
{
"epoch": 0.042375187479828116,
"grad_norm": 2.917949914932251,
"learning_rate": 4.999678487776908e-05,
"loss": 2.1221,
"step": 1674
},
{
"epoch": 0.04315991317389901,
"grad_norm": 2.811469554901123,
"learning_rate": 4.9995296990983006e-05,
"loss": 2.1242,
"step": 1705
},
{
"epoch": 0.043944638867969905,
"grad_norm": 3.067636728286743,
"learning_rate": 4.999352703566763e-05,
"loss": 2.1092,
"step": 1736
},
{
"epoch": 0.044729364562040796,
"grad_norm": 2.6231868267059326,
"learning_rate": 4.999147503179668e-05,
"loss": 2.1018,
"step": 1767
},
{
"epoch": 0.04551409025611169,
"grad_norm": 2.8247616291046143,
"learning_rate": 4.998914100252672e-05,
"loss": 2.074,
"step": 1798
},
{
"epoch": 0.04629881595018258,
"grad_norm": 2.5960075855255127,
"learning_rate": 4.998652497419696e-05,
"loss": 2.0824,
"step": 1829
},
{
"epoch": 0.04708354164425347,
"grad_norm": 2.7796943187713623,
"learning_rate": 4.9983626976328927e-05,
"loss": 2.0998,
"step": 1860
},
{
"epoch": 0.04786826733832436,
"grad_norm": 2.49242901802063,
"learning_rate": 4.998044704162613e-05,
"loss": 2.0893,
"step": 1891
},
{
"epoch": 0.04865299303239525,
"grad_norm": 2.4294378757476807,
"learning_rate": 4.9976985205973705e-05,
"loss": 2.0617,
"step": 1922
},
{
"epoch": 0.04943771872646614,
"grad_norm": 2.553217649459839,
"learning_rate": 4.997324150843799e-05,
"loss": 2.0632,
"step": 1953
},
{
"epoch": 0.050222444420537034,
"grad_norm": 2.6711318492889404,
"learning_rate": 4.99692159912661e-05,
"loss": 2.0445,
"step": 1984
},
{
"epoch": 0.051007170114607925,
"grad_norm": 2.714432716369629,
"learning_rate": 4.996490869988546e-05,
"loss": 2.0185,
"step": 2015
},
{
"epoch": 0.051791895808678816,
"grad_norm": 2.6516053676605225,
"learning_rate": 4.996031968290326e-05,
"loss": 2.057,
"step": 2046
},
{
"epoch": 0.05257662150274971,
"grad_norm": 2.4798831939697266,
"learning_rate": 4.995544899210594e-05,
"loss": 2.0199,
"step": 2077
},
{
"epoch": 0.0533613471968206,
"grad_norm": 2.5150041580200195,
"learning_rate": 4.9950296682458583e-05,
"loss": 2.0264,
"step": 2108
},
{
"epoch": 0.05414607289089149,
"grad_norm": 2.637777805328369,
"learning_rate": 4.994486281210429e-05,
"loss": 2.0233,
"step": 2139
},
{
"epoch": 0.05493079858496238,
"grad_norm": 2.330376148223877,
"learning_rate": 4.9939147442363566e-05,
"loss": 2.0201,
"step": 2170
},
{
"epoch": 0.05571552427903327,
"grad_norm": 2.3436174392700195,
"learning_rate": 4.9933150637733574e-05,
"loss": 1.9865,
"step": 2201
},
{
"epoch": 0.05650024997310416,
"grad_norm": 2.7756845951080322,
"learning_rate": 4.992687246588743e-05,
"loss": 1.9983,
"step": 2232
},
{
"epoch": 0.05728497566717505,
"grad_norm": 2.1725504398345947,
"learning_rate": 4.992031299767347e-05,
"loss": 1.9689,
"step": 2263
},
{
"epoch": 0.058069701361245944,
"grad_norm": 2.2163312435150146,
"learning_rate": 4.9913472307114386e-05,
"loss": 1.9829,
"step": 2294
},
{
"epoch": 0.058854427055316835,
"grad_norm": 2.2829232215881348,
"learning_rate": 4.9906350471406446e-05,
"loss": 2.0142,
"step": 2325
},
{
"epoch": 0.059639152749387726,
"grad_norm": 2.239596366882324,
"learning_rate": 4.989894757091861e-05,
"loss": 1.9697,
"step": 2356
},
{
"epoch": 0.06042387844345862,
"grad_norm": 2.2926037311553955,
"learning_rate": 4.989126368919158e-05,
"loss": 1.9688,
"step": 2387
},
{
"epoch": 0.06120860413752951,
"grad_norm": 10.08767032623291,
"learning_rate": 4.988329891293693e-05,
"loss": 1.9845,
"step": 2418
},
{
"epoch": 0.0619933298316004,
"grad_norm": 2.2427194118499756,
"learning_rate": 4.987505333203608e-05,
"loss": 1.9744,
"step": 2449
},
{
"epoch": 0.06277805552567128,
"grad_norm": 2.5111870765686035,
"learning_rate": 4.9866527039539276e-05,
"loss": 1.9526,
"step": 2480
},
{
"epoch": 0.06356278121974218,
"grad_norm": 2.2100026607513428,
"learning_rate": 4.9857720131664594e-05,
"loss": 1.9826,
"step": 2511
},
{
"epoch": 0.06434750691381307,
"grad_norm": 2.2112088203430176,
"learning_rate": 4.9848632707796773e-05,
"loss": 1.9698,
"step": 2542
},
{
"epoch": 0.06513223260788396,
"grad_norm": 2.404014825820923,
"learning_rate": 4.9839264870486155e-05,
"loss": 1.9628,
"step": 2573
},
{
"epoch": 0.06591695830195485,
"grad_norm": 2.526423692703247,
"learning_rate": 4.9829616725447526e-05,
"loss": 1.9481,
"step": 2604
},
{
"epoch": 0.06670168399602575,
"grad_norm": 2.2506027221679688,
"learning_rate": 4.981968838155888e-05,
"loss": 1.9418,
"step": 2635
},
{
"epoch": 0.06748640969009663,
"grad_norm": 2.4334371089935303,
"learning_rate": 4.980947995086024e-05,
"loss": 1.9423,
"step": 2666
},
{
"epoch": 0.06827113538416753,
"grad_norm": 2.3028314113616943,
"learning_rate": 4.979899154855234e-05,
"loss": 1.9391,
"step": 2697
},
{
"epoch": 0.06905586107823841,
"grad_norm": 2.122143030166626,
"learning_rate": 4.9788223292995386e-05,
"loss": 1.933,
"step": 2728
},
{
"epoch": 0.06984058677230931,
"grad_norm": 2.1335129737854004,
"learning_rate": 4.977717530570768e-05,
"loss": 1.9212,
"step": 2759
},
{
"epoch": 0.0706253124663802,
"grad_norm": 2.198650598526001,
"learning_rate": 4.976584771136425e-05,
"loss": 1.9217,
"step": 2790
},
{
"epoch": 0.07141003816045109,
"grad_norm": 2.4985201358795166,
"learning_rate": 4.975424063779547e-05,
"loss": 1.9277,
"step": 2821
},
{
"epoch": 0.07219476385452198,
"grad_norm": 1.9877598285675049,
"learning_rate": 4.974235421598557e-05,
"loss": 1.9278,
"step": 2852
},
{
"epoch": 0.07297948954859287,
"grad_norm": 3.0082573890686035,
"learning_rate": 4.973018858007122e-05,
"loss": 1.9261,
"step": 2883
},
{
"epoch": 0.07376421524266376,
"grad_norm": 2.139742851257324,
"learning_rate": 4.9717743867339963e-05,
"loss": 1.9168,
"step": 2914
},
{
"epoch": 0.07454894093673466,
"grad_norm": 2.1748037338256836,
"learning_rate": 4.9705020218228695e-05,
"loss": 1.9132,
"step": 2945
},
{
"epoch": 0.07533366663080554,
"grad_norm": 2.0570950508117676,
"learning_rate": 4.969201777632205e-05,
"loss": 1.9177,
"step": 2976
},
{
"epoch": 0.07611839232487644,
"grad_norm": 1.9970216751098633,
"learning_rate": 4.9678736688350846e-05,
"loss": 1.9105,
"step": 3007
},
{
"epoch": 0.07690311801894732,
"grad_norm": 1.9640527963638306,
"learning_rate": 4.966517710419033e-05,
"loss": 1.9084,
"step": 3038
},
{
"epoch": 0.07768784371301822,
"grad_norm": 2.172874927520752,
"learning_rate": 4.965133917685858e-05,
"loss": 1.8995,
"step": 3069
},
{
"epoch": 0.0784725694070891,
"grad_norm": 2.1881916522979736,
"learning_rate": 4.9637223062514714e-05,
"loss": 1.9019,
"step": 3100
},
{
"epoch": 0.07925729510116,
"grad_norm": 1.975496530532837,
"learning_rate": 4.962282892045718e-05,
"loss": 1.8967,
"step": 3131
},
{
"epoch": 0.08004202079523089,
"grad_norm": 2.0970685482025146,
"learning_rate": 4.9608156913121904e-05,
"loss": 1.8867,
"step": 3162
},
{
"epoch": 0.08082674648930178,
"grad_norm": 2.096353769302368,
"learning_rate": 4.959320720608049e-05,
"loss": 1.8967,
"step": 3193
},
{
"epoch": 0.08161147218337267,
"grad_norm": 1.998336911201477,
"learning_rate": 4.9577979968038354e-05,
"loss": 1.8876,
"step": 3224
},
{
"epoch": 0.08239619787744357,
"grad_norm": 2.098055362701416,
"learning_rate": 4.956247537083282e-05,
"loss": 1.9,
"step": 3255
},
{
"epoch": 0.08318092357151445,
"grad_norm": 2.0739505290985107,
"learning_rate": 4.9546693589431145e-05,
"loss": 1.8902,
"step": 3286
},
{
"epoch": 0.08396564926558535,
"grad_norm": 1.9556243419647217,
"learning_rate": 4.9530634801928595e-05,
"loss": 1.888,
"step": 3317
},
{
"epoch": 0.08475037495965623,
"grad_norm": 2.096874952316284,
"learning_rate": 4.9514299189546395e-05,
"loss": 1.8785,
"step": 3348
},
{
"epoch": 0.08553510065372713,
"grad_norm": 1.9407072067260742,
"learning_rate": 4.949768693662973e-05,
"loss": 1.8646,
"step": 3379
},
{
"epoch": 0.08631982634779801,
"grad_norm": 1.9928467273712158,
"learning_rate": 4.948079823064559e-05,
"loss": 1.8751,
"step": 3410
},
{
"epoch": 0.08710455204186891,
"grad_norm": 1.9670037031173706,
"learning_rate": 4.946363326218074e-05,
"loss": 1.8831,
"step": 3441
},
{
"epoch": 0.08788927773593981,
"grad_norm": 1.999193787574768,
"learning_rate": 4.9446192224939525e-05,
"loss": 1.8605,
"step": 3472
},
{
"epoch": 0.0886740034300107,
"grad_norm": 1.9073724746704102,
"learning_rate": 4.942847531574167e-05,
"loss": 1.8576,
"step": 3503
},
{
"epoch": 0.08945872912408159,
"grad_norm": 2.179824113845825,
"learning_rate": 4.941048273452008e-05,
"loss": 1.8682,
"step": 3534
},
{
"epoch": 0.09024345481815248,
"grad_norm": 1.954990029335022,
"learning_rate": 4.9392214684318605e-05,
"loss": 1.8807,
"step": 3565
},
{
"epoch": 0.09102818051222338,
"grad_norm": 1.7695640325546265,
"learning_rate": 4.93736713712897e-05,
"loss": 1.879,
"step": 3596
},
{
"epoch": 0.09181290620629426,
"grad_norm": 1.7708550691604614,
"learning_rate": 4.9354853004692124e-05,
"loss": 1.8677,
"step": 3627
},
{
"epoch": 0.09259763190036516,
"grad_norm": 1.9683934450149536,
"learning_rate": 4.93357597968886e-05,
"loss": 1.8595,
"step": 3658
},
{
"epoch": 0.09338235759443604,
"grad_norm": 2.00441312789917,
"learning_rate": 4.931639196334338e-05,
"loss": 1.8462,
"step": 3689
},
{
"epoch": 0.09416708328850694,
"grad_norm": 1.875543475151062,
"learning_rate": 4.9296749722619826e-05,
"loss": 1.8502,
"step": 3720
},
{
"epoch": 0.09495180898257782,
"grad_norm": 1.932658314704895,
"learning_rate": 4.9276833296377966e-05,
"loss": 1.8457,
"step": 3751
},
{
"epoch": 0.09573653467664872,
"grad_norm": 1.9957045316696167,
"learning_rate": 4.925664290937196e-05,
"loss": 1.843,
"step": 3782
},
{
"epoch": 0.0965212603707196,
"grad_norm": 1.8579176664352417,
"learning_rate": 4.9236178789447576e-05,
"loss": 1.8504,
"step": 3813
},
{
"epoch": 0.0973059860647905,
"grad_norm": 1.9646131992340088,
"learning_rate": 4.921544116753962e-05,
"loss": 1.8512,
"step": 3844
},
{
"epoch": 0.09809071175886139,
"grad_norm": 1.8213136196136475,
"learning_rate": 4.919443027766935e-05,
"loss": 1.8618,
"step": 3875
},
{
"epoch": 0.09887543745293229,
"grad_norm": 2.017280101776123,
"learning_rate": 4.91731463569418e-05,
"loss": 1.863,
"step": 3906
},
{
"epoch": 0.09966016314700317,
"grad_norm": 1.9125665426254272,
"learning_rate": 4.915158964554312e-05,
"loss": 1.8259,
"step": 3937
},
{
"epoch": 0.10044488884107407,
"grad_norm": 2.0414695739746094,
"learning_rate": 4.912976038673786e-05,
"loss": 1.8347,
"step": 3968
},
{
"epoch": 0.10122961453514495,
"grad_norm": 1.7705485820770264,
"learning_rate": 4.9107658826866254e-05,
"loss": 1.8502,
"step": 3999
},
{
"epoch": 0.10201434022921585,
"grad_norm": 1.8961102962493896,
"learning_rate": 4.908528521534139e-05,
"loss": 1.84,
"step": 4030
},
{
"epoch": 0.10279906592328673,
"grad_norm": 1.784387230873108,
"learning_rate": 4.906263980464644e-05,
"loss": 1.842,
"step": 4061
},
{
"epoch": 0.10358379161735763,
"grad_norm": 11.229472160339355,
"learning_rate": 4.903972285033178e-05,
"loss": 1.8476,
"step": 4092
},
{
"epoch": 0.10436851731142852,
"grad_norm": 1.9657154083251953,
"learning_rate": 4.901653461101213e-05,
"loss": 1.8465,
"step": 4123
},
{
"epoch": 0.10515324300549941,
"grad_norm": 1.7702244520187378,
"learning_rate": 4.8993075348363626e-05,
"loss": 1.8249,
"step": 4154
},
{
"epoch": 0.1059379686995703,
"grad_norm": 1.8672112226486206,
"learning_rate": 4.896934532712084e-05,
"loss": 1.8232,
"step": 4185
},
{
"epoch": 0.1067226943936412,
"grad_norm": 1.7806147336959839,
"learning_rate": 4.8945344815073846e-05,
"loss": 1.8256,
"step": 4216
},
{
"epoch": 0.10750742008771208,
"grad_norm": 1.7830456495285034,
"learning_rate": 4.892107408306516e-05,
"loss": 1.8271,
"step": 4247
},
{
"epoch": 0.10829214578178298,
"grad_norm": 1.96640944480896,
"learning_rate": 4.889653340498669e-05,
"loss": 1.82,
"step": 4278
},
{
"epoch": 0.10907687147585386,
"grad_norm": 1.8224470615386963,
"learning_rate": 4.8871723057776664e-05,
"loss": 1.8216,
"step": 4309
},
{
"epoch": 0.10986159716992476,
"grad_norm": 2.5164501667022705,
"learning_rate": 4.8846643321416476e-05,
"loss": 1.8252,
"step": 4340
},
{
"epoch": 0.11064632286399564,
"grad_norm": 1.7248613834381104,
"learning_rate": 4.882129447892753e-05,
"loss": 1.8133,
"step": 4371
},
{
"epoch": 0.11143104855806654,
"grad_norm": 2.060304880142212,
"learning_rate": 4.8795676816368076e-05,
"loss": 1.8282,
"step": 4402
},
{
"epoch": 0.11221577425213743,
"grad_norm": 1.8709039688110352,
"learning_rate": 4.876979062282995e-05,
"loss": 1.8154,
"step": 4433
},
{
"epoch": 0.11300049994620832,
"grad_norm": 1.7444674968719482,
"learning_rate": 4.8743636190435325e-05,
"loss": 1.8173,
"step": 4464
},
{
"epoch": 0.11378522564027921,
"grad_norm": 1.7357319593429565,
"learning_rate": 4.871721381433344e-05,
"loss": 1.8351,
"step": 4495
},
{
"epoch": 0.1145699513343501,
"grad_norm": 1.728070855140686,
"learning_rate": 4.869052379269719e-05,
"loss": 1.8119,
"step": 4526
},
{
"epoch": 0.11535467702842099,
"grad_norm": 1.742035984992981,
"learning_rate": 4.866356642671985e-05,
"loss": 1.7967,
"step": 4557
},
{
"epoch": 0.11613940272249189,
"grad_norm": 1.7010915279388428,
"learning_rate": 4.8636342020611634e-05,
"loss": 1.8004,
"step": 4588
},
{
"epoch": 0.11692412841656277,
"grad_norm": 1.6775914430618286,
"learning_rate": 4.860885088159626e-05,
"loss": 1.8173,
"step": 4619
},
{
"epoch": 0.11770885411063367,
"grad_norm": 1.9107964038848877,
"learning_rate": 4.858109331990751e-05,
"loss": 1.7984,
"step": 4650
},
{
"epoch": 0.11849357980470455,
"grad_norm": 1.713429570198059,
"learning_rate": 4.855306964878567e-05,
"loss": 1.7967,
"step": 4681
},
{
"epoch": 0.11927830549877545,
"grad_norm": 1.9373931884765625,
"learning_rate": 4.8524780184474084e-05,
"loss": 1.8072,
"step": 4712
},
{
"epoch": 0.12006303119284634,
"grad_norm": 1.8975365161895752,
"learning_rate": 4.8496225246215496e-05,
"loss": 1.8121,
"step": 4743
},
{
"epoch": 0.12084775688691723,
"grad_norm": 5.285326957702637,
"learning_rate": 4.8467405156248505e-05,
"loss": 1.8189,
"step": 4774
},
{
"epoch": 0.12163248258098812,
"grad_norm": 1.7155263423919678,
"learning_rate": 4.843832023980392e-05,
"loss": 1.8093,
"step": 4805
},
{
"epoch": 0.12241720827505902,
"grad_norm": 1.726831316947937,
"learning_rate": 4.840897082510106e-05,
"loss": 1.7952,
"step": 4836
},
{
"epoch": 0.1232019339691299,
"grad_norm": 1.739639401435852,
"learning_rate": 4.8379357243344084e-05,
"loss": 1.8103,
"step": 4867
},
{
"epoch": 0.1239866596632008,
"grad_norm": 1.6978296041488647,
"learning_rate": 4.8349479828718236e-05,
"loss": 1.8006,
"step": 4898
},
{
"epoch": 0.12477138535727168,
"grad_norm": 1.7154194116592407,
"learning_rate": 4.8319338918386075e-05,
"loss": 1.7876,
"step": 4929
},
{
"epoch": 0.12555611105134257,
"grad_norm": 1.6323316097259521,
"learning_rate": 4.828893485248369e-05,
"loss": 1.8159,
"step": 4960
},
{
"epoch": 0.12634083674541347,
"grad_norm": 1.641784429550171,
"learning_rate": 4.825826797411682e-05,
"loss": 1.7959,
"step": 4991
},
{
"epoch": 0.12712556243948436,
"grad_norm": 1.6947154998779297,
"learning_rate": 4.822733862935702e-05,
"loss": 1.7895,
"step": 5022
},
{
"epoch": 0.12791028813355526,
"grad_norm": 1.6331220865249634,
"learning_rate": 4.819614716723775e-05,
"loss": 1.7707,
"step": 5053
},
{
"epoch": 0.12869501382762613,
"grad_norm": 1.8207937479019165,
"learning_rate": 4.8164693939750425e-05,
"loss": 1.8123,
"step": 5084
},
{
"epoch": 0.12947973952169703,
"grad_norm": 1.6664263010025024,
"learning_rate": 4.813297930184042e-05,
"loss": 1.8089,
"step": 5115
},
{
"epoch": 0.13026446521576793,
"grad_norm": 1.9931398630142212,
"learning_rate": 4.810100361140314e-05,
"loss": 1.7757,
"step": 5146
},
{
"epoch": 0.13104919090983883,
"grad_norm": 1.839200735092163,
"learning_rate": 4.8068767229279885e-05,
"loss": 1.7969,
"step": 5177
},
{
"epoch": 0.1318339166039097,
"grad_norm": 1.781187653541565,
"learning_rate": 4.8036270519253854e-05,
"loss": 1.7937,
"step": 5208
},
{
"epoch": 0.1326186422979806,
"grad_norm": 1.7144343852996826,
"learning_rate": 4.8003513848046e-05,
"loss": 1.7816,
"step": 5239
},
{
"epoch": 0.1334033679920515,
"grad_norm": 1.6819554567337036,
"learning_rate": 4.79704975853109e-05,
"loss": 1.7851,
"step": 5270
},
{
"epoch": 0.1341880936861224,
"grad_norm": 1.6748546361923218,
"learning_rate": 4.793722210363262e-05,
"loss": 1.7941,
"step": 5301
},
{
"epoch": 0.13497281938019326,
"grad_norm": 1.615569829940796,
"learning_rate": 4.7903687778520414e-05,
"loss": 1.7799,
"step": 5332
},
{
"epoch": 0.13575754507426416,
"grad_norm": 1.7959198951721191,
"learning_rate": 4.7869894988404593e-05,
"loss": 1.7802,
"step": 5363
},
{
"epoch": 0.13654227076833506,
"grad_norm": 1.598946452140808,
"learning_rate": 4.783584411463221e-05,
"loss": 1.7929,
"step": 5394
},
{
"epoch": 0.13732699646240595,
"grad_norm": 1.793511986732483,
"learning_rate": 4.780153554146274e-05,
"loss": 1.7591,
"step": 5425
},
{
"epoch": 0.13811172215647682,
"grad_norm": 1.718671202659607,
"learning_rate": 4.7766969656063766e-05,
"loss": 1.7807,
"step": 5456
},
{
"epoch": 0.13889644785054772,
"grad_norm": 1.6548669338226318,
"learning_rate": 4.773214684850662e-05,
"loss": 1.775,
"step": 5487
},
{
"epoch": 0.13968117354461862,
"grad_norm": 1.6727256774902344,
"learning_rate": 4.769706751176193e-05,
"loss": 1.7756,
"step": 5518
},
{
"epoch": 0.14046589923868952,
"grad_norm": 1.7169344425201416,
"learning_rate": 4.7661732041695264e-05,
"loss": 1.7887,
"step": 5549
},
{
"epoch": 0.1412506249327604,
"grad_norm": 1.6376421451568604,
"learning_rate": 4.762614083706258e-05,
"loss": 1.7939,
"step": 5580
},
{
"epoch": 0.14203535062683129,
"grad_norm": 1.7083207368850708,
"learning_rate": 4.759029429950581e-05,
"loss": 1.7705,
"step": 5611
},
{
"epoch": 0.14282007632090218,
"grad_norm": 1.6359349489212036,
"learning_rate": 4.7554192833548235e-05,
"loss": 1.7732,
"step": 5642
},
{
"epoch": 0.14360480201497308,
"grad_norm": 1.684005618095398,
"learning_rate": 4.751783684659e-05,
"loss": 1.7766,
"step": 5673
},
{
"epoch": 0.14438952770904395,
"grad_norm": 1.7531359195709229,
"learning_rate": 4.748122674890348e-05,
"loss": 1.7815,
"step": 5704
},
{
"epoch": 0.14517425340311485,
"grad_norm": 1.5898247957229614,
"learning_rate": 4.7444362953628654e-05,
"loss": 1.7837,
"step": 5735
},
{
"epoch": 0.14595897909718575,
"grad_norm": 1.6781623363494873,
"learning_rate": 4.7407245876768424e-05,
"loss": 1.7381,
"step": 5766
},
{
"epoch": 0.14674370479125665,
"grad_norm": 1.6126357316970825,
"learning_rate": 4.736987593718397e-05,
"loss": 1.7714,
"step": 5797
},
{
"epoch": 0.14752843048532752,
"grad_norm": 1.6623587608337402,
"learning_rate": 4.733225355658999e-05,
"loss": 1.7625,
"step": 5828
},
{
"epoch": 0.14831315617939841,
"grad_norm": 1.6715524196624756,
"learning_rate": 4.7294379159549926e-05,
"loss": 1.7631,
"step": 5859
},
{
"epoch": 0.1490978818734693,
"grad_norm": 1.6739026308059692,
"learning_rate": 4.725625317347119e-05,
"loss": 1.775,
"step": 5890
},
{
"epoch": 0.1498826075675402,
"grad_norm": 1.8141075372695923,
"learning_rate": 4.7217876028600374e-05,
"loss": 1.7881,
"step": 5921
},
{
"epoch": 0.15066733326161108,
"grad_norm": 1.6842069625854492,
"learning_rate": 4.717924815801832e-05,
"loss": 1.7707,
"step": 5952
},
{
"epoch": 0.15145205895568198,
"grad_norm": 1.7032698392868042,
"learning_rate": 4.714036999763532e-05,
"loss": 1.7631,
"step": 5983
},
{
"epoch": 0.15223678464975288,
"grad_norm": 1.7856013774871826,
"learning_rate": 4.7101241986186116e-05,
"loss": 1.7545,
"step": 6014
},
{
"epoch": 0.15302151034382377,
"grad_norm": 1.679623007774353,
"learning_rate": 4.7061864565225e-05,
"loss": 1.7676,
"step": 6045
},
{
"epoch": 0.15380623603789464,
"grad_norm": 1.626792073249817,
"learning_rate": 4.702223817912081e-05,
"loss": 1.7434,
"step": 6076
},
{
"epoch": 0.15459096173196554,
"grad_norm": 1.850042700767517,
"learning_rate": 4.698236327505195e-05,
"loss": 1.7805,
"step": 6107
},
{
"epoch": 0.15537568742603644,
"grad_norm": 1.6403062343597412,
"learning_rate": 4.694224030300127e-05,
"loss": 1.7495,
"step": 6138
},
{
"epoch": 0.15616041312010734,
"grad_norm": 1.5897477865219116,
"learning_rate": 4.690186971575107e-05,
"loss": 1.779,
"step": 6169
},
{
"epoch": 0.1569451388141782,
"grad_norm": 1.8173433542251587,
"learning_rate": 4.6861251968877916e-05,
"loss": 1.7705,
"step": 6200
},
{
"epoch": 0.1577298645082491,
"grad_norm": 1.788022756576538,
"learning_rate": 4.68203875207476e-05,
"loss": 1.7457,
"step": 6231
},
{
"epoch": 0.15851459020232,
"grad_norm": 1.6219838857650757,
"learning_rate": 4.677927683250983e-05,
"loss": 1.7758,
"step": 6262
},
{
"epoch": 0.1592993158963909,
"grad_norm": 1.678890347480774,
"learning_rate": 4.6737920368093156e-05,
"loss": 1.7394,
"step": 6293
},
{
"epoch": 0.16008404159046177,
"grad_norm": 1.5719743967056274,
"learning_rate": 4.669631859419965e-05,
"loss": 1.7549,
"step": 6324
},
{
"epoch": 0.16086876728453267,
"grad_norm": 1.6332769393920898,
"learning_rate": 4.6654471980299676e-05,
"loss": 1.7462,
"step": 6355
},
{
"epoch": 0.16165349297860357,
"grad_norm": 1.6942561864852905,
"learning_rate": 4.661238099862658e-05,
"loss": 1.7506,
"step": 6386
},
{
"epoch": 0.16243821867267447,
"grad_norm": 1.8173885345458984,
"learning_rate": 4.657004612417138e-05,
"loss": 1.7455,
"step": 6417
},
{
"epoch": 0.16322294436674534,
"grad_norm": 1.6209042072296143,
"learning_rate": 4.6527467834677374e-05,
"loss": 1.7413,
"step": 6448
},
{
"epoch": 0.16400767006081624,
"grad_norm": 1.5801094770431519,
"learning_rate": 4.648464661063478e-05,
"loss": 1.7491,
"step": 6479
},
{
"epoch": 0.16479239575488713,
"grad_norm": 1.5499264001846313,
"learning_rate": 4.6441582935275264e-05,
"loss": 1.7276,
"step": 6510
},
{
"epoch": 0.16557712144895803,
"grad_norm": 1.6154171228408813,
"learning_rate": 4.6398277294566586e-05,
"loss": 1.7816,
"step": 6541
},
{
"epoch": 0.1663618471430289,
"grad_norm": 1.5633410215377808,
"learning_rate": 4.6354730177207e-05,
"loss": 1.7447,
"step": 6572
},
{
"epoch": 0.1671465728370998,
"grad_norm": 1.7070655822753906,
"learning_rate": 4.6310942074619787e-05,
"loss": 1.7477,
"step": 6603
},
{
"epoch": 0.1679312985311707,
"grad_norm": 1.7502373456954956,
"learning_rate": 4.626691348094777e-05,
"loss": 1.74,
"step": 6634
},
{
"epoch": 0.1687160242252416,
"grad_norm": 1.9541263580322266,
"learning_rate": 4.622264489304762e-05,
"loss": 1.7389,
"step": 6665
},
{
"epoch": 0.16950074991931247,
"grad_norm": 1.64599609375,
"learning_rate": 4.617813681048434e-05,
"loss": 1.7445,
"step": 6696
},
{
"epoch": 0.17028547561338336,
"grad_norm": 1.9360859394073486,
"learning_rate": 4.61333897355256e-05,
"loss": 1.73,
"step": 6727
},
{
"epoch": 0.17107020130745426,
"grad_norm": 1.693892240524292,
"learning_rate": 4.608840417313604e-05,
"loss": 1.7229,
"step": 6758
},
{
"epoch": 0.17185492700152516,
"grad_norm": 1.6243150234222412,
"learning_rate": 4.6043180630971646e-05,
"loss": 1.7421,
"step": 6789
},
{
"epoch": 0.17263965269559603,
"grad_norm": 1.5926107168197632,
"learning_rate": 4.599771961937391e-05,
"loss": 1.7447,
"step": 6820
},
{
"epoch": 0.17342437838966693,
"grad_norm": 1.695167064666748,
"learning_rate": 4.5952021651364204e-05,
"loss": 1.7463,
"step": 6851
},
{
"epoch": 0.17420910408373783,
"grad_norm": 1.5915182828903198,
"learning_rate": 4.590608724263786e-05,
"loss": 1.7198,
"step": 6882
},
{
"epoch": 0.17499382977780872,
"grad_norm": 1.6135920286178589,
"learning_rate": 4.585991691155845e-05,
"loss": 1.7233,
"step": 6913
},
{
"epoch": 0.17577855547187962,
"grad_norm": 1.5855350494384766,
"learning_rate": 4.581351117915188e-05,
"loss": 1.7519,
"step": 6944
},
{
"epoch": 0.1765632811659505,
"grad_norm": 1.5782060623168945,
"learning_rate": 4.5766870569100534e-05,
"loss": 1.729,
"step": 6975
},
{
"epoch": 0.1773480068600214,
"grad_norm": 1.4931174516677856,
"learning_rate": 4.571999560773736e-05,
"loss": 1.7197,
"step": 7006
},
{
"epoch": 0.1781327325540923,
"grad_norm": 1.809645414352417,
"learning_rate": 4.5672886824039915e-05,
"loss": 1.7409,
"step": 7037
},
{
"epoch": 0.17891745824816319,
"grad_norm": 1.544233798980713,
"learning_rate": 4.5625544749624435e-05,
"loss": 1.7331,
"step": 7068
},
{
"epoch": 0.17970218394223406,
"grad_norm": 1.5316941738128662,
"learning_rate": 4.5577969918739794e-05,
"loss": 1.7245,
"step": 7099
},
{
"epoch": 0.18048690963630495,
"grad_norm": 1.4646427631378174,
"learning_rate": 4.5530162868261486e-05,
"loss": 1.7341,
"step": 7130
},
{
"epoch": 0.18127163533037585,
"grad_norm": 1.6266372203826904,
"learning_rate": 4.548212413768558e-05,
"loss": 1.7311,
"step": 7161
},
{
"epoch": 0.18205636102444675,
"grad_norm": 1.6372709274291992,
"learning_rate": 4.543385426912261e-05,
"loss": 1.7344,
"step": 7192
},
{
"epoch": 0.18284108671851762,
"grad_norm": 1.642005443572998,
"learning_rate": 4.53853538072915e-05,
"loss": 1.7472,
"step": 7223
},
{
"epoch": 0.18362581241258852,
"grad_norm": 1.7344322204589844,
"learning_rate": 4.533662329951336e-05,
"loss": 1.7379,
"step": 7254
},
{
"epoch": 0.18441053810665942,
"grad_norm": 1.6593672037124634,
"learning_rate": 4.528766329570536e-05,
"loss": 1.7363,
"step": 7285
},
{
"epoch": 0.18519526380073031,
"grad_norm": 1.590846300125122,
"learning_rate": 4.523847434837447e-05,
"loss": 1.7432,
"step": 7316
},
{
"epoch": 0.18597998949480118,
"grad_norm": 1.6701788902282715,
"learning_rate": 4.518905701261128e-05,
"loss": 1.7287,
"step": 7347
},
{
"epoch": 0.18676471518887208,
"grad_norm": 1.6129958629608154,
"learning_rate": 4.5139411846083715e-05,
"loss": 1.7252,
"step": 7378
},
{
"epoch": 0.18754944088294298,
"grad_norm": 1.5602383613586426,
"learning_rate": 4.508953940903073e-05,
"loss": 1.7365,
"step": 7409
},
{
"epoch": 0.18833416657701388,
"grad_norm": 1.60308039188385,
"learning_rate": 4.5039440264255994e-05,
"loss": 1.7361,
"step": 7440
},
{
"epoch": 0.18911889227108475,
"grad_norm": 1.588299036026001,
"learning_rate": 4.498911497712155e-05,
"loss": 1.7574,
"step": 7471
},
{
"epoch": 0.18990361796515565,
"grad_norm": 1.5599571466445923,
"learning_rate": 4.493856411554142e-05,
"loss": 1.738,
"step": 7502
},
{
"epoch": 0.19068834365922654,
"grad_norm": 1.5749436616897583,
"learning_rate": 4.4887788249975206e-05,
"loss": 1.7272,
"step": 7533
},
{
"epoch": 0.19147306935329744,
"grad_norm": 1.5536047220230103,
"learning_rate": 4.4836787953421656e-05,
"loss": 1.7249,
"step": 7564
},
{
"epoch": 0.1922577950473683,
"grad_norm": 1.5227411985397339,
"learning_rate": 4.478556380141218e-05,
"loss": 1.7137,
"step": 7595
},
{
"epoch": 0.1930425207414392,
"grad_norm": 1.5771219730377197,
"learning_rate": 4.4734116372004375e-05,
"loss": 1.7094,
"step": 7626
},
{
"epoch": 0.1938272464355101,
"grad_norm": 1.4533522129058838,
"learning_rate": 4.4682446245775477e-05,
"loss": 1.7493,
"step": 7657
},
{
"epoch": 0.194611972129581,
"grad_norm": 1.5640264749526978,
"learning_rate": 4.463055400581586e-05,
"loss": 1.7228,
"step": 7688
},
{
"epoch": 0.19539669782365188,
"grad_norm": 1.4606215953826904,
"learning_rate": 4.4578440237722374e-05,
"loss": 1.7414,
"step": 7719
},
{
"epoch": 0.19618142351772277,
"grad_norm": 1.5216374397277832,
"learning_rate": 4.452610552959183e-05,
"loss": 1.7155,
"step": 7750
},
{
"epoch": 0.19696614921179367,
"grad_norm": 1.683119535446167,
"learning_rate": 4.447355047201428e-05,
"loss": 1.7346,
"step": 7781
},
{
"epoch": 0.19775087490586457,
"grad_norm": 1.6055350303649902,
"learning_rate": 4.4420775658066414e-05,
"loss": 1.7112,
"step": 7812
},
{
"epoch": 0.19853560059993544,
"grad_norm": 1.514739751815796,
"learning_rate": 4.436778168330484e-05,
"loss": 1.7274,
"step": 7843
},
{
"epoch": 0.19932032629400634,
"grad_norm": 2.131218433380127,
"learning_rate": 4.4314569145759353e-05,
"loss": 1.7127,
"step": 7874
},
{
"epoch": 0.20010505198807724,
"grad_norm": 1.4867665767669678,
"learning_rate": 4.42611386459262e-05,
"loss": 1.7245,
"step": 7905
},
{
"epoch": 0.20088977768214814,
"grad_norm": 1.6395418643951416,
"learning_rate": 4.420749078676133e-05,
"loss": 1.7146,
"step": 7936
},
{
"epoch": 0.201674503376219,
"grad_norm": 1.629939079284668,
"learning_rate": 4.4153626173673516e-05,
"loss": 1.7153,
"step": 7967
},
{
"epoch": 0.2024592290702899,
"grad_norm": 1.5973584651947021,
"learning_rate": 4.409954541451762e-05,
"loss": 1.7102,
"step": 7998
},
{
"epoch": 0.2032439547643608,
"grad_norm": 1.4822708368301392,
"learning_rate": 4.404524911958764e-05,
"loss": 1.7046,
"step": 8029
},
{
"epoch": 0.2040286804584317,
"grad_norm": 1.4706634283065796,
"learning_rate": 4.399073790160989e-05,
"loss": 1.7022,
"step": 8060
},
{
"epoch": 0.20481340615250257,
"grad_norm": 1.5917459726333618,
"learning_rate": 4.393601237573607e-05,
"loss": 1.6983,
"step": 8091
},
{
"epoch": 0.20559813184657347,
"grad_norm": 1.7328417301177979,
"learning_rate": 4.388107315953628e-05,
"loss": 1.7164,
"step": 8122
},
{
"epoch": 0.20638285754064437,
"grad_norm": 1.6152797937393188,
"learning_rate": 4.382592087299212e-05,
"loss": 1.7302,
"step": 8153
},
{
"epoch": 0.20716758323471526,
"grad_norm": 1.7153429985046387,
"learning_rate": 4.377055613848964e-05,
"loss": 1.7278,
"step": 8184
},
{
"epoch": 0.20795230892878613,
"grad_norm": 1.7167855501174927,
"learning_rate": 4.3714979580812355e-05,
"loss": 1.7021,
"step": 8215
},
{
"epoch": 0.20873703462285703,
"grad_norm": 1.458811640739441,
"learning_rate": 4.365919182713416e-05,
"loss": 1.7099,
"step": 8246
},
{
"epoch": 0.20952176031692793,
"grad_norm": 5.516291618347168,
"learning_rate": 4.360319350701226e-05,
"loss": 1.7069,
"step": 8277
},
{
"epoch": 0.21030648601099883,
"grad_norm": 1.5669766664505005,
"learning_rate": 4.3546985252380115e-05,
"loss": 1.6983,
"step": 8308
},
{
"epoch": 0.2110912117050697,
"grad_norm": 1.4598067998886108,
"learning_rate": 4.349056769754021e-05,
"loss": 1.7265,
"step": 8339
},
{
"epoch": 0.2118759373991406,
"grad_norm": 1.5436547994613647,
"learning_rate": 4.3433941479156994e-05,
"loss": 1.7128,
"step": 8370
},
{
"epoch": 0.2126606630932115,
"grad_norm": 1.6275660991668701,
"learning_rate": 4.3377107236249647e-05,
"loss": 1.7229,
"step": 8401
},
{
"epoch": 0.2134453887872824,
"grad_norm": 1.6207513809204102,
"learning_rate": 4.332006561018488e-05,
"loss": 1.702,
"step": 8432
},
{
"epoch": 0.21423011448135326,
"grad_norm": 1.6795597076416016,
"learning_rate": 4.3262817244669683e-05,
"loss": 1.6808,
"step": 8463
},
{
"epoch": 0.21501484017542416,
"grad_norm": 1.660192608833313,
"learning_rate": 4.3205362785744083e-05,
"loss": 1.7071,
"step": 8494
},
{
"epoch": 0.21579956586949506,
"grad_norm": 1.6086353063583374,
"learning_rate": 4.314770288177384e-05,
"loss": 1.7083,
"step": 8525
},
{
"epoch": 0.21658429156356596,
"grad_norm": 1.475216269493103,
"learning_rate": 4.308983818344313e-05,
"loss": 1.7234,
"step": 8556
},
{
"epoch": 0.21736901725763683,
"grad_norm": 1.7111340761184692,
"learning_rate": 4.3031769343747206e-05,
"loss": 1.6872,
"step": 8587
},
{
"epoch": 0.21815374295170772,
"grad_norm": 1.4544799327850342,
"learning_rate": 4.297349701798505e-05,
"loss": 1.692,
"step": 8618
},
{
"epoch": 0.21893846864577862,
"grad_norm": 1.6593588590621948,
"learning_rate": 4.2915021863751916e-05,
"loss": 1.6886,
"step": 8649
},
{
"epoch": 0.21972319433984952,
"grad_norm": 1.641408085823059,
"learning_rate": 4.285634454093198e-05,
"loss": 1.6872,
"step": 8680
},
{
"epoch": 0.2205079200339204,
"grad_norm": 1.6036972999572754,
"learning_rate": 4.279746571169086e-05,
"loss": 1.7055,
"step": 8711
},
{
"epoch": 0.2212926457279913,
"grad_norm": 1.4984327554702759,
"learning_rate": 4.2738386040468136e-05,
"loss": 1.6997,
"step": 8742
},
{
"epoch": 0.2220773714220622,
"grad_norm": 1.471111536026001,
"learning_rate": 4.2679106193969866e-05,
"loss": 1.6926,
"step": 8773
},
{
"epoch": 0.22286209711613308,
"grad_norm": 1.521364688873291,
"learning_rate": 4.261962684116106e-05,
"loss": 1.6851,
"step": 8804
},
{
"epoch": 0.22364682281020395,
"grad_norm": 1.6068321466445923,
"learning_rate": 4.2559948653258145e-05,
"loss": 1.7113,
"step": 8835
},
{
"epoch": 0.22443154850427485,
"grad_norm": 1.453379511833191,
"learning_rate": 4.250007230372134e-05,
"loss": 1.7025,
"step": 8866
},
{
"epoch": 0.22521627419834575,
"grad_norm": 1.5845959186553955,
"learning_rate": 4.2439998468247126e-05,
"loss": 1.6978,
"step": 8897
},
{
"epoch": 0.22600099989241665,
"grad_norm": 1.5308622121810913,
"learning_rate": 4.2379727824760566e-05,
"loss": 1.6956,
"step": 8928
},
{
"epoch": 0.22678572558648752,
"grad_norm": 1.6339962482452393,
"learning_rate": 4.231926105340768e-05,
"loss": 1.6831,
"step": 8959
},
{
"epoch": 0.22757045128055842,
"grad_norm": 1.4533487558364868,
"learning_rate": 4.225859883654776e-05,
"loss": 1.7025,
"step": 8990
},
{
"epoch": 0.22835517697462931,
"grad_norm": 3.971897840499878,
"learning_rate": 4.219774185874569e-05,
"loss": 1.689,
"step": 9021
},
{
"epoch": 0.2291399026687002,
"grad_norm": 1.4394114017486572,
"learning_rate": 4.213669080676418e-05,
"loss": 1.6841,
"step": 9052
},
{
"epoch": 0.22992462836277108,
"grad_norm": 1.821142315864563,
"learning_rate": 4.2075446369556056e-05,
"loss": 1.6883,
"step": 9083
},
{
"epoch": 0.23070935405684198,
"grad_norm": 1.6653649806976318,
"learning_rate": 4.201400923825648e-05,
"loss": 1.7011,
"step": 9114
},
{
"epoch": 0.23149407975091288,
"grad_norm": 1.5895901918411255,
"learning_rate": 4.195238010617511e-05,
"loss": 1.7004,
"step": 9145
},
{
"epoch": 0.23227880544498378,
"grad_norm": 1.4648844003677368,
"learning_rate": 4.1890559668788344e-05,
"loss": 1.6872,
"step": 9176
},
{
"epoch": 0.23306353113905465,
"grad_norm": 1.5886753797531128,
"learning_rate": 4.1828548623731405e-05,
"loss": 1.6851,
"step": 9207
},
{
"epoch": 0.23384825683312555,
"grad_norm": 1.4713412523269653,
"learning_rate": 4.1766347670790506e-05,
"loss": 1.6818,
"step": 9238
},
{
"epoch": 0.23463298252719644,
"grad_norm": 1.5660710334777832,
"learning_rate": 4.170395751189495e-05,
"loss": 1.6844,
"step": 9269
},
{
"epoch": 0.23541770822126734,
"grad_norm": 1.7024312019348145,
"learning_rate": 4.164137885110921e-05,
"loss": 1.6839,
"step": 9300
},
{
"epoch": 0.2362024339153382,
"grad_norm": 1.5936214923858643,
"learning_rate": 4.157861239462495e-05,
"loss": 1.6953,
"step": 9331
},
{
"epoch": 0.2369871596094091,
"grad_norm": 1.4709779024124146,
"learning_rate": 4.1515658850753114e-05,
"loss": 1.6806,
"step": 9362
},
{
"epoch": 0.23777188530348,
"grad_norm": 1.4303510189056396,
"learning_rate": 4.145251892991588e-05,
"loss": 1.6792,
"step": 9393
},
{
"epoch": 0.2385566109975509,
"grad_norm": 1.5452120304107666,
"learning_rate": 4.138919334463868e-05,
"loss": 1.6712,
"step": 9424
},
{
"epoch": 0.23934133669162178,
"grad_norm": 1.4944697618484497,
"learning_rate": 4.1325682809542124e-05,
"loss": 1.6777,
"step": 9455
},
{
"epoch": 0.24012606238569267,
"grad_norm": 1.6359312534332275,
"learning_rate": 4.126198804133398e-05,
"loss": 1.6782,
"step": 9486
},
{
"epoch": 0.24091078807976357,
"grad_norm": 1.3874454498291016,
"learning_rate": 4.1198109758801055e-05,
"loss": 1.6805,
"step": 9517
},
{
"epoch": 0.24169551377383447,
"grad_norm": 1.4747340679168701,
"learning_rate": 4.113404868280107e-05,
"loss": 1.6704,
"step": 9548
},
{
"epoch": 0.24248023946790534,
"grad_norm": 1.95576012134552,
"learning_rate": 4.106980553625457e-05,
"loss": 1.7008,
"step": 9579
},
{
"epoch": 0.24326496516197624,
"grad_norm": 1.454005479812622,
"learning_rate": 4.100538104413674e-05,
"loss": 1.6771,
"step": 9610
},
{
"epoch": 0.24404969085604714,
"grad_norm": 1.5640463829040527,
"learning_rate": 4.09407759334692e-05,
"loss": 1.6763,
"step": 9641
},
{
"epoch": 0.24483441655011803,
"grad_norm": 1.5076780319213867,
"learning_rate": 4.087599093331186e-05,
"loss": 1.6977,
"step": 9672
},
{
"epoch": 0.2456191422441889,
"grad_norm": 1.5072520971298218,
"learning_rate": 4.081102677475462e-05,
"loss": 1.6749,
"step": 9703
},
{
"epoch": 0.2464038679382598,
"grad_norm": 1.6311815977096558,
"learning_rate": 4.0745884190909194e-05,
"loss": 1.684,
"step": 9734
},
{
"epoch": 0.2471885936323307,
"grad_norm": 1.5691202878952026,
"learning_rate": 4.0680563916900796e-05,
"loss": 1.6804,
"step": 9765
},
{
"epoch": 0.2479733193264016,
"grad_norm": 1.4325530529022217,
"learning_rate": 4.0615066689859815e-05,
"loss": 1.719,
"step": 9796
},
{
"epoch": 0.24875804502047247,
"grad_norm": 1.439177393913269,
"learning_rate": 4.0549393248913584e-05,
"loss": 1.6873,
"step": 9827
},
{
"epoch": 0.24954277071454337,
"grad_norm": 1.4155471324920654,
"learning_rate": 4.048354433517794e-05,
"loss": 1.692,
"step": 9858
},
{
"epoch": 0.25032749640861424,
"grad_norm": 1.5917115211486816,
"learning_rate": 4.0417520691748916e-05,
"loss": 1.6752,
"step": 9889
},
{
"epoch": 0.25111222210268513,
"grad_norm": 1.649154543876648,
"learning_rate": 4.035132306369438e-05,
"loss": 1.6603,
"step": 9920
},
{
"epoch": 0.25189694779675603,
"grad_norm": 1.5114792585372925,
"learning_rate": 4.028495219804555e-05,
"loss": 1.7005,
"step": 9951
},
{
"epoch": 0.25268167349082693,
"grad_norm": 16.910812377929688,
"learning_rate": 4.021840884378864e-05,
"loss": 1.6846,
"step": 9982
},
{
"epoch": 0.25346639918489783,
"grad_norm": 1.4342628717422485,
"learning_rate": 4.015169375185633e-05,
"loss": 1.6678,
"step": 10013
},
{
"epoch": 0.2542511248789687,
"grad_norm": 1.4815376996994019,
"learning_rate": 4.0084807675119396e-05,
"loss": 1.671,
"step": 10044
},
{
"epoch": 0.2550358505730396,
"grad_norm": 1.4633368253707886,
"learning_rate": 4.0017751368378106e-05,
"loss": 1.6824,
"step": 10075
},
{
"epoch": 0.2558205762671105,
"grad_norm": 1.3904149532318115,
"learning_rate": 3.995052558835377e-05,
"loss": 1.6775,
"step": 10106
},
{
"epoch": 0.25660530196118136,
"grad_norm": 1.5234646797180176,
"learning_rate": 3.988313109368017e-05,
"loss": 1.6854,
"step": 10137
},
{
"epoch": 0.25739002765525226,
"grad_norm": 1.4530494213104248,
"learning_rate": 3.981556864489504e-05,
"loss": 1.6727,
"step": 10168
},
{
"epoch": 0.25817475334932316,
"grad_norm": 1.5600273609161377,
"learning_rate": 3.974783900443142e-05,
"loss": 1.6645,
"step": 10199
},
{
"epoch": 0.25895947904339406,
"grad_norm": 1.4213160276412964,
"learning_rate": 3.9679942936609095e-05,
"loss": 1.6898,
"step": 10230
},
{
"epoch": 0.25974420473746496,
"grad_norm": 1.5741041898727417,
"learning_rate": 3.961188120762596e-05,
"loss": 1.693,
"step": 10261
},
{
"epoch": 0.26052893043153585,
"grad_norm": 1.564493179321289,
"learning_rate": 3.954365458554938e-05,
"loss": 1.6836,
"step": 10292
},
{
"epoch": 0.26131365612560675,
"grad_norm": 1.5584787130355835,
"learning_rate": 3.947526384030751e-05,
"loss": 1.6852,
"step": 10323
},
{
"epoch": 0.26209838181967765,
"grad_norm": 1.4936350584030151,
"learning_rate": 3.9406709743680624e-05,
"loss": 1.6777,
"step": 10354
},
{
"epoch": 0.26288310751374855,
"grad_norm": 1.504725694656372,
"learning_rate": 3.9337993069292366e-05,
"loss": 1.6765,
"step": 10385
},
{
"epoch": 0.2636678332078194,
"grad_norm": 1.4809914827346802,
"learning_rate": 3.926911459260109e-05,
"loss": 1.6578,
"step": 10416
},
{
"epoch": 0.2644525589018903,
"grad_norm": 1.529976725578308,
"learning_rate": 3.920007509089102e-05,
"loss": 1.6709,
"step": 10447
},
{
"epoch": 0.2652372845959612,
"grad_norm": 1.483694076538086,
"learning_rate": 3.913087534326357e-05,
"loss": 1.6713,
"step": 10478
},
{
"epoch": 0.2660220102900321,
"grad_norm": 1.4282972812652588,
"learning_rate": 3.9061516130628475e-05,
"loss": 1.6784,
"step": 10509
},
{
"epoch": 0.266806735984103,
"grad_norm": 1.5122032165527344,
"learning_rate": 3.8991998235695025e-05,
"loss": 1.6603,
"step": 10540
},
{
"epoch": 0.2675914616781739,
"grad_norm": 1.5154742002487183,
"learning_rate": 3.8922322442963224e-05,
"loss": 1.6831,
"step": 10571
},
{
"epoch": 0.2683761873722448,
"grad_norm": 1.4630860090255737,
"learning_rate": 3.885248953871491e-05,
"loss": 1.6715,
"step": 10602
},
{
"epoch": 0.2691609130663157,
"grad_norm": 1.4164702892303467,
"learning_rate": 3.8782500311004915e-05,
"loss": 1.6654,
"step": 10633
},
{
"epoch": 0.2699456387603865,
"grad_norm": 1.5865578651428223,
"learning_rate": 3.871235554965218e-05,
"loss": 1.6829,
"step": 10664
},
{
"epoch": 0.2707303644544574,
"grad_norm": 1.4984766244888306,
"learning_rate": 3.864205604623078e-05,
"loss": 1.673,
"step": 10695
},
{
"epoch": 0.2715150901485283,
"grad_norm": 1.5477566719055176,
"learning_rate": 3.857160259406107e-05,
"loss": 1.6711,
"step": 10726
},
{
"epoch": 0.2722998158425992,
"grad_norm": 1.5356842279434204,
"learning_rate": 3.8500995988200674e-05,
"loss": 1.6556,
"step": 10757
},
{
"epoch": 0.2730845415366701,
"grad_norm": 1.413104772567749,
"learning_rate": 3.843023702543556e-05,
"loss": 1.658,
"step": 10788
},
{
"epoch": 0.273869267230741,
"grad_norm": 1.5174081325531006,
"learning_rate": 3.8359326504270984e-05,
"loss": 1.6672,
"step": 10819
},
{
"epoch": 0.2746539929248119,
"grad_norm": 1.4649910926818848,
"learning_rate": 3.828826522492255e-05,
"loss": 1.6625,
"step": 10850
},
{
"epoch": 0.2754387186188828,
"grad_norm": 1.5240408182144165,
"learning_rate": 3.821705398930713e-05,
"loss": 1.6619,
"step": 10881
},
{
"epoch": 0.27622344431295365,
"grad_norm": 1.4349104166030884,
"learning_rate": 3.814569360103385e-05,
"loss": 1.6595,
"step": 10912
},
{
"epoch": 0.27700817000702455,
"grad_norm": 1.4311225414276123,
"learning_rate": 3.807418486539499e-05,
"loss": 1.6557,
"step": 10943
},
{
"epoch": 0.27779289570109544,
"grad_norm": 1.5817755460739136,
"learning_rate": 3.80025285893569e-05,
"loss": 1.6882,
"step": 10974
},
{
"epoch": 0.27857762139516634,
"grad_norm": 1.5182181596755981,
"learning_rate": 3.793072558155093e-05,
"loss": 1.6697,
"step": 11005
},
{
"epoch": 0.27936234708923724,
"grad_norm": 1.4836517572402954,
"learning_rate": 3.785877665226426e-05,
"loss": 1.6576,
"step": 11036
},
{
"epoch": 0.28014707278330814,
"grad_norm": 1.460788607597351,
"learning_rate": 3.778668261343079e-05,
"loss": 1.6607,
"step": 11067
},
{
"epoch": 0.28093179847737904,
"grad_norm": 1.4307125806808472,
"learning_rate": 3.771444427862192e-05,
"loss": 1.662,
"step": 11098
},
{
"epoch": 0.28171652417144993,
"grad_norm": 1.4999738931655884,
"learning_rate": 3.7642062463037465e-05,
"loss": 1.6406,
"step": 11129
},
{
"epoch": 0.2825012498655208,
"grad_norm": 1.4646129608154297,
"learning_rate": 3.7569537983496373e-05,
"loss": 1.6653,
"step": 11160
},
{
"epoch": 0.2832859755595917,
"grad_norm": 1.4709292650222778,
"learning_rate": 3.749687165842753e-05,
"loss": 1.6704,
"step": 11191
},
{
"epoch": 0.28407070125366257,
"grad_norm": 1.494458556175232,
"learning_rate": 3.7424064307860536e-05,
"loss": 1.6534,
"step": 11222
},
{
"epoch": 0.28485542694773347,
"grad_norm": 1.4409736394882202,
"learning_rate": 3.735111675341645e-05,
"loss": 1.6645,
"step": 11253
},
{
"epoch": 0.28564015264180437,
"grad_norm": 1.4628338813781738,
"learning_rate": 3.7278029818298524e-05,
"loss": 1.6611,
"step": 11284
},
{
"epoch": 0.28642487833587527,
"grad_norm": 1.3659113645553589,
"learning_rate": 3.720480432728287e-05,
"loss": 1.6435,
"step": 11315
},
{
"epoch": 0.28720960402994616,
"grad_norm": 1.3704752922058105,
"learning_rate": 3.71314411067092e-05,
"loss": 1.6507,
"step": 11346
},
{
"epoch": 0.28799432972401706,
"grad_norm": 1.579837441444397,
"learning_rate": 3.70579409844715e-05,
"loss": 1.6716,
"step": 11377
},
{
"epoch": 0.2887790554180879,
"grad_norm": 1.5566996335983276,
"learning_rate": 3.698430479000865e-05,
"loss": 1.6439,
"step": 11408
},
{
"epoch": 0.2895637811121588,
"grad_norm": 1.4722687005996704,
"learning_rate": 3.691053335429509e-05,
"loss": 1.683,
"step": 11439
},
{
"epoch": 0.2903485068062297,
"grad_norm": 1.491283893585205,
"learning_rate": 3.683662750983147e-05,
"loss": 1.6606,
"step": 11470
},
{
"epoch": 0.2911332325003006,
"grad_norm": 1.402040719985962,
"learning_rate": 3.676258809063518e-05,
"loss": 1.6582,
"step": 11501
},
{
"epoch": 0.2919179581943715,
"grad_norm": 1.4377038478851318,
"learning_rate": 3.6688415932231004e-05,
"loss": 1.6398,
"step": 11532
},
{
"epoch": 0.2927026838884424,
"grad_norm": 1.4151259660720825,
"learning_rate": 3.661411187164166e-05,
"loss": 1.6645,
"step": 11563
},
{
"epoch": 0.2934874095825133,
"grad_norm": 1.5219615697860718,
"learning_rate": 3.65396767473784e-05,
"loss": 1.6705,
"step": 11594
},
{
"epoch": 0.2942721352765842,
"grad_norm": 1.533252239227295,
"learning_rate": 3.6465111399431465e-05,
"loss": 1.6714,
"step": 11625
},
{
"epoch": 0.29505686097065503,
"grad_norm": 1.410959243774414,
"learning_rate": 3.6390416669260674e-05,
"loss": 1.6533,
"step": 11656
},
{
"epoch": 0.29584158666472593,
"grad_norm": 1.5377541780471802,
"learning_rate": 3.63155933997859e-05,
"loss": 1.6505,
"step": 11687
},
{
"epoch": 0.29662631235879683,
"grad_norm": 1.4504135847091675,
"learning_rate": 3.624064243537758e-05,
"loss": 1.6287,
"step": 11718
},
{
"epoch": 0.2974110380528677,
"grad_norm": 1.4606986045837402,
"learning_rate": 3.616556462184716e-05,
"loss": 1.6592,
"step": 11749
},
{
"epoch": 0.2981957637469386,
"grad_norm": 1.4440289735794067,
"learning_rate": 3.609036080643755e-05,
"loss": 1.6598,
"step": 11780
},
{
"epoch": 0.2989804894410095,
"grad_norm": 1.5399249792099,
"learning_rate": 3.60150318378136e-05,
"loss": 1.6852,
"step": 11811
},
{
"epoch": 0.2997652151350804,
"grad_norm": 1.4778543710708618,
"learning_rate": 3.5939578566052465e-05,
"loss": 1.6462,
"step": 11842
},
{
"epoch": 0.3005499408291513,
"grad_norm": 1.4979726076126099,
"learning_rate": 3.586400184263408e-05,
"loss": 1.6576,
"step": 11873
},
{
"epoch": 0.30133466652322216,
"grad_norm": 1.4904232025146484,
"learning_rate": 3.578830252043148e-05,
"loss": 1.6476,
"step": 11904
},
{
"epoch": 0.30211939221729306,
"grad_norm": 1.5472886562347412,
"learning_rate": 3.571248145370125e-05,
"loss": 1.6721,
"step": 11935
},
{
"epoch": 0.30290411791136396,
"grad_norm": 1.4954209327697754,
"learning_rate": 3.5636539498073794e-05,
"loss": 1.6483,
"step": 11966
},
{
"epoch": 0.30368884360543486,
"grad_norm": 1.4504363536834717,
"learning_rate": 3.556047751054378e-05,
"loss": 1.657,
"step": 11997
},
{
"epoch": 0.30447356929950575,
"grad_norm": 1.3581033945083618,
"learning_rate": 3.548429634946039e-05,
"loss": 1.6579,
"step": 12028
},
{
"epoch": 0.30525829499357665,
"grad_norm": 1.4421014785766602,
"learning_rate": 3.540799687451768e-05,
"loss": 1.6496,
"step": 12059
},
{
"epoch": 0.30604302068764755,
"grad_norm": 1.523169994354248,
"learning_rate": 3.533157994674485e-05,
"loss": 1.6714,
"step": 12090
},
{
"epoch": 0.30682774638171845,
"grad_norm": 1.455269455909729,
"learning_rate": 3.5255046428496546e-05,
"loss": 1.6695,
"step": 12121
},
{
"epoch": 0.3076124720757893,
"grad_norm": 1.4330891370773315,
"learning_rate": 3.517839718344311e-05,
"loss": 1.6519,
"step": 12152
},
{
"epoch": 0.3083971977698602,
"grad_norm": 1.3913158178329468,
"learning_rate": 3.510163307656086e-05,
"loss": 1.6329,
"step": 12183
},
{
"epoch": 0.3091819234639311,
"grad_norm": 1.355193018913269,
"learning_rate": 3.5024754974122324e-05,
"loss": 1.624,
"step": 12214
},
{
"epoch": 0.309966649158002,
"grad_norm": 1.4055231809616089,
"learning_rate": 3.494776374368643e-05,
"loss": 1.6491,
"step": 12245
},
{
"epoch": 0.3107513748520729,
"grad_norm": 1.4227032661437988,
"learning_rate": 3.4870660254088724e-05,
"loss": 1.6274,
"step": 12276
},
{
"epoch": 0.3115361005461438,
"grad_norm": 1.4558427333831787,
"learning_rate": 3.479344537543164e-05,
"loss": 1.6419,
"step": 12307
},
{
"epoch": 0.3123208262402147,
"grad_norm": 1.5154629945755005,
"learning_rate": 3.4716119979074565e-05,
"loss": 1.6443,
"step": 12338
},
{
"epoch": 0.3131055519342856,
"grad_norm": 1.4458774328231812,
"learning_rate": 3.463868493762412e-05,
"loss": 1.6615,
"step": 12369
},
{
"epoch": 0.3138902776283564,
"grad_norm": 1.4116544723510742,
"learning_rate": 3.456114112492418e-05,
"loss": 1.6481,
"step": 12400
},
{
"epoch": 0.3146750033224273,
"grad_norm": 1.8497071266174316,
"learning_rate": 3.4483489416046164e-05,
"loss": 1.6262,
"step": 12431
},
{
"epoch": 0.3154597290164982,
"grad_norm": 1.3854331970214844,
"learning_rate": 3.440573068727905e-05,
"loss": 1.6387,
"step": 12462
},
{
"epoch": 0.3162444547105691,
"grad_norm": 1.509178876876831,
"learning_rate": 3.4327865816119495e-05,
"loss": 1.6566,
"step": 12493
},
{
"epoch": 0.31702918040464,
"grad_norm": 1.3977612257003784,
"learning_rate": 3.4249895681262025e-05,
"loss": 1.6676,
"step": 12524
},
{
"epoch": 0.3178139060987109,
"grad_norm": 1.3736423254013062,
"learning_rate": 3.417182116258899e-05,
"loss": 1.6238,
"step": 12555
},
{
"epoch": 0.3185986317927818,
"grad_norm": 1.4226630926132202,
"learning_rate": 3.409364314116074e-05,
"loss": 1.6513,
"step": 12586
},
{
"epoch": 0.3193833574868527,
"grad_norm": 1.4804571866989136,
"learning_rate": 3.401536249920559e-05,
"loss": 1.6383,
"step": 12617
},
{
"epoch": 0.32016808318092355,
"grad_norm": 1.456168532371521,
"learning_rate": 3.393698012010998e-05,
"loss": 1.6621,
"step": 12648
},
{
"epoch": 0.32095280887499444,
"grad_norm": 1.3990952968597412,
"learning_rate": 3.385849688840839e-05,
"loss": 1.6376,
"step": 12679
},
{
"epoch": 0.32173753456906534,
"grad_norm": 1.3588812351226807,
"learning_rate": 3.3779913689773414e-05,
"loss": 1.656,
"step": 12710
},
{
"epoch": 0.32252226026313624,
"grad_norm": 1.4718931913375854,
"learning_rate": 3.370123141100578e-05,
"loss": 1.6255,
"step": 12741
},
{
"epoch": 0.32330698595720714,
"grad_norm": 1.3603503704071045,
"learning_rate": 3.3622450940024305e-05,
"loss": 1.6517,
"step": 12772
},
{
"epoch": 0.32409171165127804,
"grad_norm": 1.4493441581726074,
"learning_rate": 3.35435731658559e-05,
"loss": 1.643,
"step": 12803
},
{
"epoch": 0.32487643734534893,
"grad_norm": 1.3813337087631226,
"learning_rate": 3.346459897862552e-05,
"loss": 1.6449,
"step": 12834
},
{
"epoch": 0.32566116303941983,
"grad_norm": 1.5027899742126465,
"learning_rate": 3.338552926954613e-05,
"loss": 1.6497,
"step": 12865
},
{
"epoch": 0.3264458887334907,
"grad_norm": 1.3805309534072876,
"learning_rate": 3.330636493090868e-05,
"loss": 1.6449,
"step": 12896
},
{
"epoch": 0.3272306144275616,
"grad_norm": 1.642248511314392,
"learning_rate": 3.322710685607193e-05,
"loss": 1.6261,
"step": 12927
},
{
"epoch": 0.32801534012163247,
"grad_norm": 1.4579522609710693,
"learning_rate": 3.314775593945251e-05,
"loss": 1.6648,
"step": 12958
},
{
"epoch": 0.32880006581570337,
"grad_norm": 1.3579092025756836,
"learning_rate": 3.3068313076514714e-05,
"loss": 1.6468,
"step": 12989
},
{
"epoch": 0.32958479150977427,
"grad_norm": 1.406051754951477,
"learning_rate": 3.298877916376047e-05,
"loss": 1.6249,
"step": 13020
},
{
"epoch": 0.33036951720384516,
"grad_norm": 1.457335114479065,
"learning_rate": 3.290915509871915e-05,
"loss": 1.6353,
"step": 13051
},
{
"epoch": 0.33115424289791606,
"grad_norm": 1.4548041820526123,
"learning_rate": 3.282944177993753e-05,
"loss": 1.6272,
"step": 13082
},
{
"epoch": 0.33193896859198696,
"grad_norm": 1.4140032529830933,
"learning_rate": 3.274964010696957e-05,
"loss": 1.6479,
"step": 13113
},
{
"epoch": 0.3327236942860578,
"grad_norm": 1.3436623811721802,
"learning_rate": 3.266975098036629e-05,
"loss": 1.6452,
"step": 13144
},
{
"epoch": 0.3335084199801287,
"grad_norm": 1.4224274158477783,
"learning_rate": 3.258977530166562e-05,
"loss": 1.6242,
"step": 13175
},
{
"epoch": 0.3342931456741996,
"grad_norm": 1.5661940574645996,
"learning_rate": 3.250971397338227e-05,
"loss": 1.6404,
"step": 13206
},
{
"epoch": 0.3350778713682705,
"grad_norm": 1.4696576595306396,
"learning_rate": 3.2429567898997404e-05,
"loss": 1.6436,
"step": 13237
},
{
"epoch": 0.3358625970623414,
"grad_norm": 1.4438591003417969,
"learning_rate": 3.234933798294859e-05,
"loss": 1.6404,
"step": 13268
},
{
"epoch": 0.3366473227564123,
"grad_norm": 1.4548406600952148,
"learning_rate": 3.2269025130619535e-05,
"loss": 1.6461,
"step": 13299
},
{
"epoch": 0.3374320484504832,
"grad_norm": 1.4180691242218018,
"learning_rate": 3.218863024832985e-05,
"loss": 1.6377,
"step": 13330
},
{
"epoch": 0.3382167741445541,
"grad_norm": 1.4060105085372925,
"learning_rate": 3.2108154243324864e-05,
"loss": 1.6045,
"step": 13361
},
{
"epoch": 0.33900149983862493,
"grad_norm": 1.4134920835494995,
"learning_rate": 3.2027598023765345e-05,
"loss": 1.6264,
"step": 13392
},
{
"epoch": 0.33978622553269583,
"grad_norm": 1.4582122564315796,
"learning_rate": 3.194696249871729e-05,
"loss": 1.623,
"step": 13423
},
{
"epoch": 0.3405709512267667,
"grad_norm": 1.4027389287948608,
"learning_rate": 3.186624857814164e-05,
"loss": 1.6337,
"step": 13454
},
{
"epoch": 0.3413556769208376,
"grad_norm": 1.3397070169448853,
"learning_rate": 3.178545717288401e-05,
"loss": 1.6334,
"step": 13485
},
{
"epoch": 0.3421404026149085,
"grad_norm": 1.5358332395553589,
"learning_rate": 3.170458919466444e-05,
"loss": 1.6393,
"step": 13516
},
{
"epoch": 0.3429251283089794,
"grad_norm": 1.5479260683059692,
"learning_rate": 3.1623645556067063e-05,
"loss": 1.6357,
"step": 13547
},
{
"epoch": 0.3437098540030503,
"grad_norm": 1.3949965238571167,
"learning_rate": 3.154262717052985e-05,
"loss": 1.6325,
"step": 13578
},
{
"epoch": 0.3444945796971212,
"grad_norm": 1.392903208732605,
"learning_rate": 3.146153495233426e-05,
"loss": 1.6071,
"step": 13609
},
{
"epoch": 0.34527930539119206,
"grad_norm": 1.4290788173675537,
"learning_rate": 3.1380369816594944e-05,
"loss": 1.6266,
"step": 13640
},
{
"epoch": 0.34606403108526296,
"grad_norm": 1.4005228281021118,
"learning_rate": 3.129913267924946e-05,
"loss": 1.6391,
"step": 13671
},
{
"epoch": 0.34684875677933386,
"grad_norm": 1.378369927406311,
"learning_rate": 3.121782445704782e-05,
"loss": 1.6495,
"step": 13702
},
{
"epoch": 0.34763348247340475,
"grad_norm": 1.4202784299850464,
"learning_rate": 3.11364460675423e-05,
"loss": 1.637,
"step": 13733
},
{
"epoch": 0.34841820816747565,
"grad_norm": 1.3670291900634766,
"learning_rate": 3.1054998429076934e-05,
"loss": 1.5941,
"step": 13764
},
{
"epoch": 0.34920293386154655,
"grad_norm": 1.3714202642440796,
"learning_rate": 3.097348246077728e-05,
"loss": 1.6096,
"step": 13795
},
{
"epoch": 0.34998765955561745,
"grad_norm": 1.4889552593231201,
"learning_rate": 3.0891899082539924e-05,
"loss": 1.6245,
"step": 13826
},
{
"epoch": 0.35077238524968835,
"grad_norm": 1.4640086889266968,
"learning_rate": 3.0810249215022233e-05,
"loss": 1.6197,
"step": 13857
},
{
"epoch": 0.35155711094375924,
"grad_norm": 1.385380506515503,
"learning_rate": 3.0728533779631865e-05,
"loss": 1.61,
"step": 13888
},
{
"epoch": 0.3523418366378301,
"grad_norm": 1.3958945274353027,
"learning_rate": 3.064675369851637e-05,
"loss": 1.6139,
"step": 13919
},
{
"epoch": 0.353126562331901,
"grad_norm": 1.3746731281280518,
"learning_rate": 3.056490989455289e-05,
"loss": 1.6307,
"step": 13950
},
{
"epoch": 0.3539112880259719,
"grad_norm": 1.4196429252624512,
"learning_rate": 3.0483003291337596e-05,
"loss": 1.6192,
"step": 13981
},
{
"epoch": 0.3546960137200428,
"grad_norm": 1.3648637533187866,
"learning_rate": 3.040103481317539e-05,
"loss": 1.6124,
"step": 14012
},
{
"epoch": 0.3554807394141137,
"grad_norm": 1.422004222869873,
"learning_rate": 3.03190053850694e-05,
"loss": 1.6288,
"step": 14043
},
{
"epoch": 0.3562654651081846,
"grad_norm": 1.4687801599502563,
"learning_rate": 3.0236915932710573e-05,
"loss": 1.6118,
"step": 14074
},
{
"epoch": 0.3570501908022555,
"grad_norm": 1.30635404586792,
"learning_rate": 3.0154767382467232e-05,
"loss": 1.6341,
"step": 14105
},
{
"epoch": 0.35783491649632637,
"grad_norm": 1.4216945171356201,
"learning_rate": 3.0072560661374582e-05,
"loss": 1.6385,
"step": 14136
},
{
"epoch": 0.3586196421903972,
"grad_norm": 1.4296518564224243,
"learning_rate": 2.999029669712431e-05,
"loss": 1.6262,
"step": 14167
},
{
"epoch": 0.3594043678844681,
"grad_norm": 1.4529691934585571,
"learning_rate": 2.990797641805408e-05,
"loss": 1.6136,
"step": 14198
},
{
"epoch": 0.360189093578539,
"grad_norm": 1.389478325843811,
"learning_rate": 2.982560075313704e-05,
"loss": 1.6263,
"step": 14229
},
{
"epoch": 0.3609738192726099,
"grad_norm": 1.3917667865753174,
"learning_rate": 2.9743170631971368e-05,
"loss": 1.6456,
"step": 14260
},
{
"epoch": 0.3617585449666808,
"grad_norm": 1.3452563285827637,
"learning_rate": 2.9660686984769792e-05,
"loss": 1.6284,
"step": 14291
},
{
"epoch": 0.3625432706607517,
"grad_norm": 1.421159029006958,
"learning_rate": 2.9578150742349047e-05,
"loss": 1.6232,
"step": 14322
},
{
"epoch": 0.3633279963548226,
"grad_norm": 1.4312077760696411,
"learning_rate": 2.949556283611942e-05,
"loss": 1.6006,
"step": 14353
},
{
"epoch": 0.3641127220488935,
"grad_norm": 1.4271692037582397,
"learning_rate": 2.9412924198074206e-05,
"loss": 1.6177,
"step": 14384
},
{
"epoch": 0.36489744774296434,
"grad_norm": 1.3584555387496948,
"learning_rate": 2.9330235760779208e-05,
"loss": 1.6148,
"step": 14415
},
{
"epoch": 0.36568217343703524,
"grad_norm": 1.3882123231887817,
"learning_rate": 2.9247498457362188e-05,
"loss": 1.6327,
"step": 14446
},
{
"epoch": 0.36646689913110614,
"grad_norm": 1.540114402770996,
"learning_rate": 2.9164713221502373e-05,
"loss": 1.6052,
"step": 14477
},
{
"epoch": 0.36725162482517704,
"grad_norm": 1.3554641008377075,
"learning_rate": 2.9081880987419912e-05,
"loss": 1.6091,
"step": 14508
},
{
"epoch": 0.36803635051924793,
"grad_norm": 1.3693712949752808,
"learning_rate": 2.8999002689865296e-05,
"loss": 1.5936,
"step": 14539
},
{
"epoch": 0.36882107621331883,
"grad_norm": 1.354278564453125,
"learning_rate": 2.8916079264108852e-05,
"loss": 1.612,
"step": 14570
},
{
"epoch": 0.36960580190738973,
"grad_norm": 1.3731021881103516,
"learning_rate": 2.883311164593017e-05,
"loss": 1.6064,
"step": 14601
},
{
"epoch": 0.37039052760146063,
"grad_norm": 1.3914356231689453,
"learning_rate": 2.875010077160754e-05,
"loss": 1.6036,
"step": 14632
},
{
"epoch": 0.37117525329553147,
"grad_norm": 1.4811164140701294,
"learning_rate": 2.866704757790741e-05,
"loss": 1.6195,
"step": 14663
},
{
"epoch": 0.37195997898960237,
"grad_norm": 1.4619332551956177,
"learning_rate": 2.858395300207376e-05,
"loss": 1.6315,
"step": 14694
},
{
"epoch": 0.37274470468367327,
"grad_norm": 1.456950306892395,
"learning_rate": 2.8500817981817607e-05,
"loss": 1.6276,
"step": 14725
},
{
"epoch": 0.37352943037774416,
"grad_norm": 5.129410266876221,
"learning_rate": 2.8417643455306336e-05,
"loss": 1.6234,
"step": 14756
},
{
"epoch": 0.37431415607181506,
"grad_norm": 1.3831191062927246,
"learning_rate": 2.8334430361153185e-05,
"loss": 1.6163,
"step": 14787
},
{
"epoch": 0.37509888176588596,
"grad_norm": 1.3817623853683472,
"learning_rate": 2.8251179638406612e-05,
"loss": 1.6206,
"step": 14818
},
{
"epoch": 0.37588360745995686,
"grad_norm": 1.5285260677337646,
"learning_rate": 2.8167892226539704e-05,
"loss": 1.6117,
"step": 14849
},
{
"epoch": 0.37666833315402776,
"grad_norm": 1.403324007987976,
"learning_rate": 2.8084569065439588e-05,
"loss": 1.5962,
"step": 14880
},
{
"epoch": 0.3774530588480986,
"grad_norm": 1.3314014673233032,
"learning_rate": 2.8001211095396807e-05,
"loss": 1.6116,
"step": 14911
},
{
"epoch": 0.3782377845421695,
"grad_norm": 1.4300462007522583,
"learning_rate": 2.791781925709473e-05,
"loss": 1.6234,
"step": 14942
},
{
"epoch": 0.3790225102362404,
"grad_norm": 1.424811601638794,
"learning_rate": 2.7834394491598908e-05,
"loss": 1.5986,
"step": 14973
},
{
"epoch": 0.3798072359303113,
"grad_norm": 1.3818182945251465,
"learning_rate": 2.7750937740346485e-05,
"loss": 1.6012,
"step": 15004
},
{
"epoch": 0.3805919616243822,
"grad_norm": 1.4053683280944824,
"learning_rate": 2.7667449945135564e-05,
"loss": 1.6018,
"step": 15035
},
{
"epoch": 0.3813766873184531,
"grad_norm": 1.5093421936035156,
"learning_rate": 2.7583932048114557e-05,
"loss": 1.61,
"step": 15066
},
{
"epoch": 0.382161413012524,
"grad_norm": 1.412494421005249,
"learning_rate": 2.7500384991771587e-05,
"loss": 1.613,
"step": 15097
},
{
"epoch": 0.3829461387065949,
"grad_norm": 1.335167646408081,
"learning_rate": 2.7416809718923825e-05,
"loss": 1.6197,
"step": 15128
},
{
"epoch": 0.3837308644006657,
"grad_norm": 1.334786295890808,
"learning_rate": 2.7333207172706864e-05,
"loss": 1.6284,
"step": 15159
},
{
"epoch": 0.3845155900947366,
"grad_norm": 1.4039522409439087,
"learning_rate": 2.7249578296564088e-05,
"loss": 1.5889,
"step": 15190
},
{
"epoch": 0.3853003157888075,
"grad_norm": 1.4196487665176392,
"learning_rate": 2.7165924034235973e-05,
"loss": 1.6132,
"step": 15221
},
{
"epoch": 0.3860850414828784,
"grad_norm": 1.4701744318008423,
"learning_rate": 2.708224532974953e-05,
"loss": 1.6009,
"step": 15252
},
{
"epoch": 0.3868697671769493,
"grad_norm": 1.319935917854309,
"learning_rate": 2.6998543127407538e-05,
"loss": 1.6333,
"step": 15283
},
{
"epoch": 0.3876544928710202,
"grad_norm": 1.3962234258651733,
"learning_rate": 2.6914818371777988e-05,
"loss": 1.6175,
"step": 15314
},
{
"epoch": 0.3884392185650911,
"grad_norm": 1.4284230470657349,
"learning_rate": 2.6831072007683373e-05,
"loss": 1.6007,
"step": 15345
},
{
"epoch": 0.389223944259162,
"grad_norm": 1.298251748085022,
"learning_rate": 2.6747304980190018e-05,
"loss": 1.605,
"step": 15376
},
{
"epoch": 0.39000866995323286,
"grad_norm": 1.294994831085205,
"learning_rate": 2.6663518234597453e-05,
"loss": 1.6025,
"step": 15407
},
{
"epoch": 0.39079339564730375,
"grad_norm": 1.440958023071289,
"learning_rate": 2.6579712716427696e-05,
"loss": 1.6002,
"step": 15438
},
{
"epoch": 0.39157812134137465,
"grad_norm": 1.439590573310852,
"learning_rate": 2.6495889371414652e-05,
"loss": 1.6025,
"step": 15469
},
{
"epoch": 0.39236284703544555,
"grad_norm": 1.4235502481460571,
"learning_rate": 2.6412049145493367e-05,
"loss": 1.5993,
"step": 15500
},
{
"epoch": 0.39314757272951645,
"grad_norm": 1.4449518918991089,
"learning_rate": 2.632819298478939e-05,
"loss": 1.63,
"step": 15531
},
{
"epoch": 0.39393229842358735,
"grad_norm": 1.4422321319580078,
"learning_rate": 2.6244321835608105e-05,
"loss": 1.6193,
"step": 15562
},
{
"epoch": 0.39471702411765824,
"grad_norm": 1.4232275485992432,
"learning_rate": 2.6160436644424024e-05,
"loss": 1.6193,
"step": 15593
},
{
"epoch": 0.39550174981172914,
"grad_norm": 1.5187265872955322,
"learning_rate": 2.6076538357870133e-05,
"loss": 1.618,
"step": 15624
},
{
"epoch": 0.3962864755058,
"grad_norm": 1.4493205547332764,
"learning_rate": 2.5992627922727196e-05,
"loss": 1.6082,
"step": 15655
},
{
"epoch": 0.3970712011998709,
"grad_norm": 1.5100423097610474,
"learning_rate": 2.5908706285913066e-05,
"loss": 1.6081,
"step": 15686
},
{
"epoch": 0.3978559268939418,
"grad_norm": 1.465114712715149,
"learning_rate": 2.5824774394472008e-05,
"loss": 1.6125,
"step": 15717
},
{
"epoch": 0.3986406525880127,
"grad_norm": 1.4160761833190918,
"learning_rate": 2.5740833195563996e-05,
"loss": 1.5951,
"step": 15748
},
{
"epoch": 0.3994253782820836,
"grad_norm": 1.381658673286438,
"learning_rate": 2.5656883636454067e-05,
"loss": 1.6051,
"step": 15779
},
{
"epoch": 0.4002101039761545,
"grad_norm": 1.3883142471313477,
"learning_rate": 2.557292666450159e-05,
"loss": 1.6039,
"step": 15810
},
{
"epoch": 0.4009948296702254,
"grad_norm": 1.506911039352417,
"learning_rate": 2.5488963227149566e-05,
"loss": 1.5761,
"step": 15841
},
{
"epoch": 0.40177955536429627,
"grad_norm": 1.4450113773345947,
"learning_rate": 2.5404994271913983e-05,
"loss": 1.5734,
"step": 15872
},
{
"epoch": 0.4025642810583671,
"grad_norm": 1.3970619440078735,
"learning_rate": 2.5321020746373085e-05,
"loss": 1.6094,
"step": 15903
},
{
"epoch": 0.403349006752438,
"grad_norm": 1.4761073589324951,
"learning_rate": 2.52370435981567e-05,
"loss": 1.6075,
"step": 15934
},
{
"epoch": 0.4041337324465089,
"grad_norm": 1.3969392776489258,
"learning_rate": 2.5153063774935533e-05,
"loss": 1.5788,
"step": 15965
},
{
"epoch": 0.4049184581405798,
"grad_norm": 1.3772737979888916,
"learning_rate": 2.506908222441045e-05,
"loss": 1.61,
"step": 15996
},
{
"epoch": 0.4057031838346507,
"grad_norm": 1.3969396352767944,
"learning_rate": 2.498509989430187e-05,
"loss": 1.5943,
"step": 16027
},
{
"epoch": 0.4064879095287216,
"grad_norm": 1.3052096366882324,
"learning_rate": 2.4901117732338958e-05,
"loss": 1.61,
"step": 16058
},
{
"epoch": 0.4072726352227925,
"grad_norm": 1.394612193107605,
"learning_rate": 2.481713668624899e-05,
"loss": 1.6018,
"step": 16089
},
{
"epoch": 0.4080573609168634,
"grad_norm": 1.3575886487960815,
"learning_rate": 2.4733157703746663e-05,
"loss": 1.5883,
"step": 16120
},
{
"epoch": 0.40884208661093424,
"grad_norm": 1.3952176570892334,
"learning_rate": 2.4649181732523392e-05,
"loss": 1.6152,
"step": 16151
},
{
"epoch": 0.40962681230500514,
"grad_norm": 1.5711455345153809,
"learning_rate": 2.4565209720236582e-05,
"loss": 1.61,
"step": 16182
},
{
"epoch": 0.41041153799907604,
"grad_norm": 1.5258722305297852,
"learning_rate": 2.4481242614498975e-05,
"loss": 1.628,
"step": 16213
},
{
"epoch": 0.41119626369314694,
"grad_norm": 1.425764799118042,
"learning_rate": 2.439728136286796e-05,
"loss": 1.5872,
"step": 16244
},
{
"epoch": 0.41198098938721783,
"grad_norm": 1.3165446519851685,
"learning_rate": 2.4313326912834852e-05,
"loss": 1.6008,
"step": 16275
},
{
"epoch": 0.41276571508128873,
"grad_norm": 1.386579155921936,
"learning_rate": 2.4229380211814206e-05,
"loss": 1.5783,
"step": 16306
},
{
"epoch": 0.41355044077535963,
"grad_norm": 1.464693307876587,
"learning_rate": 2.4145442207133124e-05,
"loss": 1.5947,
"step": 16337
},
{
"epoch": 0.4143351664694305,
"grad_norm": 1.334782600402832,
"learning_rate": 2.406151384602059e-05,
"loss": 1.5886,
"step": 16368
},
{
"epoch": 0.41511989216350137,
"grad_norm": 1.4115489721298218,
"learning_rate": 2.3977596075596747e-05,
"loss": 1.5821,
"step": 16399
},
{
"epoch": 0.41590461785757227,
"grad_norm": 1.391065001487732,
"learning_rate": 2.3893689842862223e-05,
"loss": 1.6141,
"step": 16430
},
{
"epoch": 0.41668934355164317,
"grad_norm": 1.4244657754898071,
"learning_rate": 2.3809796094687475e-05,
"loss": 1.6008,
"step": 16461
},
{
"epoch": 0.41747406924571406,
"grad_norm": 1.3113791942596436,
"learning_rate": 2.372591577780202e-05,
"loss": 1.608,
"step": 16492
},
{
"epoch": 0.41825879493978496,
"grad_norm": 1.4262186288833618,
"learning_rate": 2.3642049838783838e-05,
"loss": 1.5801,
"step": 16523
},
{
"epoch": 0.41904352063385586,
"grad_norm": 1.4219175577163696,
"learning_rate": 2.3558199224048666e-05,
"loss": 1.592,
"step": 16554
},
{
"epoch": 0.41982824632792676,
"grad_norm": 1.4542045593261719,
"learning_rate": 2.347436487983929e-05,
"loss": 1.6062,
"step": 16585
},
{
"epoch": 0.42061297202199766,
"grad_norm": 1.4484211206436157,
"learning_rate": 2.3390547752214888e-05,
"loss": 1.6042,
"step": 16616
},
{
"epoch": 0.4213976977160685,
"grad_norm": 1.4561681747436523,
"learning_rate": 2.330674878704035e-05,
"loss": 1.617,
"step": 16647
},
{
"epoch": 0.4221824234101394,
"grad_norm": 1.4250808954238892,
"learning_rate": 2.322296892997561e-05,
"loss": 1.5947,
"step": 16678
},
{
"epoch": 0.4229671491042103,
"grad_norm": 1.3762766122817993,
"learning_rate": 2.313920912646497e-05,
"loss": 1.5962,
"step": 16709
},
{
"epoch": 0.4237518747982812,
"grad_norm": 1.3508645296096802,
"learning_rate": 2.305547032172643e-05,
"loss": 1.5969,
"step": 16740
},
{
"epoch": 0.4245366004923521,
"grad_norm": 1.4839844703674316,
"learning_rate": 2.2971753460741014e-05,
"loss": 1.5697,
"step": 16771
},
{
"epoch": 0.425321326186423,
"grad_norm": 1.4027475118637085,
"learning_rate": 2.288805948824212e-05,
"loss": 1.5758,
"step": 16802
},
{
"epoch": 0.4261060518804939,
"grad_norm": 1.3288599252700806,
"learning_rate": 2.2804389348704858e-05,
"loss": 1.5817,
"step": 16833
},
{
"epoch": 0.4268907775745648,
"grad_norm": 1.411028265953064,
"learning_rate": 2.2720743986335374e-05,
"loss": 1.6059,
"step": 16864
},
{
"epoch": 0.4276755032686356,
"grad_norm": 1.4803740978240967,
"learning_rate": 2.2637124345060233e-05,
"loss": 1.6061,
"step": 16895
},
{
"epoch": 0.4284602289627065,
"grad_norm": 1.6195276975631714,
"learning_rate": 2.2553531368515695e-05,
"loss": 1.5948,
"step": 16926
},
{
"epoch": 0.4292449546567774,
"grad_norm": 1.368160605430603,
"learning_rate": 2.2469966000037144e-05,
"loss": 1.5884,
"step": 16957
},
{
"epoch": 0.4300296803508483,
"grad_norm": 2.9462714195251465,
"learning_rate": 2.2386429182648417e-05,
"loss": 1.5834,
"step": 16988
},
{
"epoch": 0.4308144060449192,
"grad_norm": 1.319602370262146,
"learning_rate": 2.230292185905114e-05,
"loss": 1.571,
"step": 17019
},
{
"epoch": 0.4315991317389901,
"grad_norm": 1.412001371383667,
"learning_rate": 2.2219444971614116e-05,
"loss": 1.6091,
"step": 17050
},
{
"epoch": 0.432383857433061,
"grad_norm": 1.4459586143493652,
"learning_rate": 2.2135999462362655e-05,
"loss": 1.5803,
"step": 17081
},
{
"epoch": 0.4331685831271319,
"grad_norm": 1.3342795372009277,
"learning_rate": 2.2052586272968003e-05,
"loss": 1.5809,
"step": 17112
},
{
"epoch": 0.43395330882120275,
"grad_norm": 1.3263877630233765,
"learning_rate": 2.196920634473666e-05,
"loss": 1.5742,
"step": 17143
},
{
"epoch": 0.43473803451527365,
"grad_norm": 1.3818809986114502,
"learning_rate": 2.1885860618599787e-05,
"loss": 1.5701,
"step": 17174
},
{
"epoch": 0.43552276020934455,
"grad_norm": 1.4324009418487549,
"learning_rate": 2.1802550035102577e-05,
"loss": 1.5622,
"step": 17205
},
{
"epoch": 0.43630748590341545,
"grad_norm": 1.3489223718643188,
"learning_rate": 2.171927553439363e-05,
"loss": 1.5737,
"step": 17236
},
{
"epoch": 0.43709221159748635,
"grad_norm": 1.6844401359558105,
"learning_rate": 2.1636038056214376e-05,
"loss": 1.5916,
"step": 17267
},
{
"epoch": 0.43787693729155724,
"grad_norm": 1.3632712364196777,
"learning_rate": 2.155283853988844e-05,
"loss": 1.6055,
"step": 17298
},
{
"epoch": 0.43866166298562814,
"grad_norm": 1.4866870641708374,
"learning_rate": 2.146967792431106e-05,
"loss": 1.5858,
"step": 17329
},
{
"epoch": 0.43944638867969904,
"grad_norm": 1.5456846952438354,
"learning_rate": 2.138655714793849e-05,
"loss": 1.6098,
"step": 17360
},
{
"epoch": 0.44023111437376994,
"grad_norm": 1.4177597761154175,
"learning_rate": 2.1303477148777367e-05,
"loss": 1.5833,
"step": 17391
},
{
"epoch": 0.4410158400678408,
"grad_norm": 1.4126933813095093,
"learning_rate": 2.122043886437421e-05,
"loss": 1.599,
"step": 17422
},
{
"epoch": 0.4418005657619117,
"grad_norm": 1.4183374643325806,
"learning_rate": 2.1137443231804765e-05,
"loss": 1.5941,
"step": 17453
},
{
"epoch": 0.4425852914559826,
"grad_norm": 1.4230761528015137,
"learning_rate": 2.105449118766347e-05,
"loss": 1.5743,
"step": 17484
},
{
"epoch": 0.4433700171500535,
"grad_norm": 1.6844847202301025,
"learning_rate": 2.097158366805287e-05,
"loss": 1.5672,
"step": 17515
},
{
"epoch": 0.4441547428441244,
"grad_norm": 1.410435438156128,
"learning_rate": 2.0888721608573047e-05,
"loss": 1.5896,
"step": 17546
},
{
"epoch": 0.44493946853819527,
"grad_norm": 1.3948931694030762,
"learning_rate": 2.0805905944311087e-05,
"loss": 1.5899,
"step": 17577
},
{
"epoch": 0.44572419423226617,
"grad_norm": 1.3747113943099976,
"learning_rate": 2.0723137609830497e-05,
"loss": 1.5576,
"step": 17608
},
{
"epoch": 0.44650891992633707,
"grad_norm": 1.477161169052124,
"learning_rate": 2.0640417539160686e-05,
"loss": 1.5576,
"step": 17639
},
{
"epoch": 0.4472936456204079,
"grad_norm": 1.372091293334961,
"learning_rate": 2.0557746665786427e-05,
"loss": 1.5958,
"step": 17670
},
{
"epoch": 0.4480783713144788,
"grad_norm": 1.361820936203003,
"learning_rate": 2.0475125922637256e-05,
"loss": 1.5917,
"step": 17701
},
{
"epoch": 0.4488630970085497,
"grad_norm": 1.367297887802124,
"learning_rate": 2.0392556242077047e-05,
"loss": 1.5965,
"step": 17732
},
{
"epoch": 0.4496478227026206,
"grad_norm": 1.538565754890442,
"learning_rate": 2.031003855589343e-05,
"loss": 1.5814,
"step": 17763
},
{
"epoch": 0.4504325483966915,
"grad_norm": 1.4618374109268188,
"learning_rate": 2.022757379528727e-05,
"loss": 1.5852,
"step": 17794
},
{
"epoch": 0.4512172740907624,
"grad_norm": 1.3954309225082397,
"learning_rate": 2.0145162890862184e-05,
"loss": 1.5576,
"step": 17825
},
{
"epoch": 0.4520019997848333,
"grad_norm": 1.33854079246521,
"learning_rate": 2.0062806772614022e-05,
"loss": 1.5793,
"step": 17856
},
{
"epoch": 0.4527867254789042,
"grad_norm": 1.4751428365707397,
"learning_rate": 1.9980506369920392e-05,
"loss": 1.5831,
"step": 17887
},
{
"epoch": 0.45357145117297504,
"grad_norm": 1.3836451768875122,
"learning_rate": 1.989826261153015e-05,
"loss": 1.5967,
"step": 17918
},
{
"epoch": 0.45435617686704594,
"grad_norm": 1.4987123012542725,
"learning_rate": 1.9816076425552923e-05,
"loss": 1.5953,
"step": 17949
},
{
"epoch": 0.45514090256111683,
"grad_norm": 1.3838002681732178,
"learning_rate": 1.9733948739448676e-05,
"loss": 1.5614,
"step": 17980
},
{
"epoch": 0.45592562825518773,
"grad_norm": 1.358023762702942,
"learning_rate": 1.9651880480017155e-05,
"loss": 1.5737,
"step": 18011
},
{
"epoch": 0.45671035394925863,
"grad_norm": 1.3181227445602417,
"learning_rate": 1.9569872573387516e-05,
"loss": 1.5806,
"step": 18042
},
{
"epoch": 0.4574950796433295,
"grad_norm": 1.3574905395507812,
"learning_rate": 1.9487925945007854e-05,
"loss": 1.5779,
"step": 18073
},
{
"epoch": 0.4582798053374004,
"grad_norm": 1.3550188541412354,
"learning_rate": 1.9406041519634726e-05,
"loss": 1.5723,
"step": 18104
},
{
"epoch": 0.4590645310314713,
"grad_norm": 1.3672763109207153,
"learning_rate": 1.932422022132275e-05,
"loss": 1.5869,
"step": 18135
},
{
"epoch": 0.45984925672554217,
"grad_norm": 1.428689956665039,
"learning_rate": 1.924246297341414e-05,
"loss": 1.5743,
"step": 18166
},
{
"epoch": 0.46063398241961306,
"grad_norm": 1.3313350677490234,
"learning_rate": 1.9160770698528338e-05,
"loss": 1.5836,
"step": 18197
},
{
"epoch": 0.46141870811368396,
"grad_norm": 1.3049378395080566,
"learning_rate": 1.907914431855156e-05,
"loss": 1.5753,
"step": 18228
},
{
"epoch": 0.46220343380775486,
"grad_norm": 1.3737244606018066,
"learning_rate": 1.8997584754626412e-05,
"loss": 1.589,
"step": 18259
},
{
"epoch": 0.46298815950182576,
"grad_norm": 1.4522390365600586,
"learning_rate": 1.8916092927141486e-05,
"loss": 1.5898,
"step": 18290
},
{
"epoch": 0.46377288519589666,
"grad_norm": 1.3189274072647095,
"learning_rate": 1.883466975572098e-05,
"loss": 1.5721,
"step": 18321
},
{
"epoch": 0.46455761088996755,
"grad_norm": 1.3040895462036133,
"learning_rate": 1.8753316159214312e-05,
"loss": 1.58,
"step": 18352
},
{
"epoch": 0.46534233658403845,
"grad_norm": 1.3528228998184204,
"learning_rate": 1.8672033055685766e-05,
"loss": 1.5812,
"step": 18383
},
{
"epoch": 0.4661270622781093,
"grad_norm": 1.3759435415267944,
"learning_rate": 1.8590821362404116e-05,
"loss": 1.5905,
"step": 18414
},
{
"epoch": 0.4669117879721802,
"grad_norm": 1.374550223350525,
"learning_rate": 1.8509681995832294e-05,
"loss": 1.5737,
"step": 18445
},
{
"epoch": 0.4676965136662511,
"grad_norm": 1.4290833473205566,
"learning_rate": 1.8428615871617004e-05,
"loss": 1.577,
"step": 18476
},
{
"epoch": 0.468481239360322,
"grad_norm": 1.287758231163025,
"learning_rate": 1.8347623904578448e-05,
"loss": 1.5652,
"step": 18507
},
{
"epoch": 0.4692659650543929,
"grad_norm": 1.3034193515777588,
"learning_rate": 1.8266707008699975e-05,
"loss": 1.5708,
"step": 18538
},
{
"epoch": 0.4700506907484638,
"grad_norm": 1.3413418531417847,
"learning_rate": 1.818586609711774e-05,
"loss": 1.5629,
"step": 18569
},
{
"epoch": 0.4708354164425347,
"grad_norm": 1.3434704542160034,
"learning_rate": 1.8105102082110462e-05,
"loss": 1.5726,
"step": 18600
},
{
"epoch": 0.4716201421366056,
"grad_norm": 1.3321512937545776,
"learning_rate": 1.8024415875089058e-05,
"loss": 1.5767,
"step": 18631
},
{
"epoch": 0.4724048678306764,
"grad_norm": 1.3440663814544678,
"learning_rate": 1.7943808386586407e-05,
"loss": 1.5971,
"step": 18662
},
{
"epoch": 0.4731895935247473,
"grad_norm": 1.356490135192871,
"learning_rate": 1.7863280526247073e-05,
"loss": 1.5511,
"step": 18693
},
{
"epoch": 0.4739743192188182,
"grad_norm": 1.5594719648361206,
"learning_rate": 1.7782833202817003e-05,
"loss": 1.5807,
"step": 18724
},
{
"epoch": 0.4747590449128891,
"grad_norm": 1.3007055521011353,
"learning_rate": 1.7702467324133327e-05,
"loss": 1.5864,
"step": 18755
},
{
"epoch": 0.47554377060696,
"grad_norm": 1.3085851669311523,
"learning_rate": 1.7622183797114042e-05,
"loss": 1.5624,
"step": 18786
},
{
"epoch": 0.4763284963010309,
"grad_norm": 1.4323654174804688,
"learning_rate": 1.7541983527747838e-05,
"loss": 1.5759,
"step": 18817
},
{
"epoch": 0.4771132219951018,
"grad_norm": 1.6249394416809082,
"learning_rate": 1.746186742108387e-05,
"loss": 1.5853,
"step": 18848
},
{
"epoch": 0.4778979476891727,
"grad_norm": 1.4717755317687988,
"learning_rate": 1.73818363812215e-05,
"loss": 1.5627,
"step": 18879
},
{
"epoch": 0.47868267338324355,
"grad_norm": 1.4533812999725342,
"learning_rate": 1.7301891311300153e-05,
"loss": 1.5582,
"step": 18910
},
{
"epoch": 0.47946739907731445,
"grad_norm": 1.4233548641204834,
"learning_rate": 1.7222033113489055e-05,
"loss": 1.5829,
"step": 18941
},
{
"epoch": 0.48025212477138535,
"grad_norm": 1.4943761825561523,
"learning_rate": 1.7142262688977127e-05,
"loss": 1.563,
"step": 18972
},
{
"epoch": 0.48103685046545624,
"grad_norm": 1.4122124910354614,
"learning_rate": 1.7062580937962764e-05,
"loss": 1.5723,
"step": 19003
},
{
"epoch": 0.48182157615952714,
"grad_norm": 1.3874859809875488,
"learning_rate": 1.698298875964369e-05,
"loss": 1.5606,
"step": 19034
},
{
"epoch": 0.48260630185359804,
"grad_norm": 1.3442684412002563,
"learning_rate": 1.690348705220684e-05,
"loss": 1.5794,
"step": 19065
},
{
"epoch": 0.48339102754766894,
"grad_norm": 1.5870423316955566,
"learning_rate": 1.6824076712818156e-05,
"loss": 1.5782,
"step": 19096
},
{
"epoch": 0.48417575324173984,
"grad_norm": 1.3558776378631592,
"learning_rate": 1.6744758637612533e-05,
"loss": 1.5642,
"step": 19127
},
{
"epoch": 0.4849604789358107,
"grad_norm": 1.4363101720809937,
"learning_rate": 1.6665533721683664e-05,
"loss": 1.5698,
"step": 19158
},
{
"epoch": 0.4857452046298816,
"grad_norm": 1.423425555229187,
"learning_rate": 1.6586402859073974e-05,
"loss": 1.5712,
"step": 19189
},
{
"epoch": 0.4865299303239525,
"grad_norm": 1.3792959451675415,
"learning_rate": 1.6507366942764463e-05,
"loss": 1.567,
"step": 19220
},
{
"epoch": 0.4873146560180234,
"grad_norm": 1.4269790649414062,
"learning_rate": 1.6428426864664732e-05,
"loss": 1.5616,
"step": 19251
},
{
"epoch": 0.48809938171209427,
"grad_norm": 1.4407951831817627,
"learning_rate": 1.6349583515602816e-05,
"loss": 1.5786,
"step": 19282
},
{
"epoch": 0.48888410740616517,
"grad_norm": 1.4874082803726196,
"learning_rate": 1.6270837785315208e-05,
"loss": 1.5907,
"step": 19313
},
{
"epoch": 0.48966883310023607,
"grad_norm": 1.382135272026062,
"learning_rate": 1.619219056243676e-05,
"loss": 1.5673,
"step": 19344
},
{
"epoch": 0.49045355879430697,
"grad_norm": 1.3598939180374146,
"learning_rate": 1.6113642734490698e-05,
"loss": 1.5548,
"step": 19375
},
{
"epoch": 0.4912382844883778,
"grad_norm": 1.4186638593673706,
"learning_rate": 1.6035195187878577e-05,
"loss": 1.5834,
"step": 19406
},
{
"epoch": 0.4920230101824487,
"grad_norm": 1.3320554494857788,
"learning_rate": 1.5956848807870305e-05,
"loss": 1.5435,
"step": 19437
},
{
"epoch": 0.4928077358765196,
"grad_norm": 1.3170437812805176,
"learning_rate": 1.587860447859413e-05,
"loss": 1.5538,
"step": 19468
},
{
"epoch": 0.4935924615705905,
"grad_norm": 1.463334321975708,
"learning_rate": 1.5800463083026686e-05,
"loss": 1.5603,
"step": 19499
},
{
"epoch": 0.4943771872646614,
"grad_norm": 1.4043060541152954,
"learning_rate": 1.572242550298298e-05,
"loss": 1.5778,
"step": 19530
},
{
"epoch": 0.4951619129587323,
"grad_norm": 1.3377630710601807,
"learning_rate": 1.56444926191065e-05,
"loss": 1.5836,
"step": 19561
},
{
"epoch": 0.4959466386528032,
"grad_norm": 1.4007608890533447,
"learning_rate": 1.5566665310859257e-05,
"loss": 1.5691,
"step": 19592
},
{
"epoch": 0.4967313643468741,
"grad_norm": 1.3231667280197144,
"learning_rate": 1.5488944456511846e-05,
"loss": 1.5517,
"step": 19623
},
{
"epoch": 0.49751609004094494,
"grad_norm": 1.4343535900115967,
"learning_rate": 1.5411330933133546e-05,
"loss": 1.5753,
"step": 19654
},
{
"epoch": 0.49830081573501583,
"grad_norm": 1.2943058013916016,
"learning_rate": 1.533382561658241e-05,
"loss": 1.5571,
"step": 19685
},
{
"epoch": 0.49908554142908673,
"grad_norm": 1.2815899848937988,
"learning_rate": 1.525642938149541e-05,
"loss": 1.5796,
"step": 19716
},
{
"epoch": 0.49987026712315763,
"grad_norm": 1.4025834798812866,
"learning_rate": 1.5179143101278536e-05,
"loss": 1.5672,
"step": 19747
},
{
"epoch": 0.5006549928172285,
"grad_norm": 1.4670218229293823,
"learning_rate": 1.5101967648096955e-05,
"loss": 1.5702,
"step": 19778
},
{
"epoch": 0.5014397185112994,
"grad_norm": 1.4222999811172485,
"learning_rate": 1.5024903892865172e-05,
"loss": 1.5842,
"step": 19809
},
{
"epoch": 0.5022244442053703,
"grad_norm": 1.4714964628219604,
"learning_rate": 1.4947952705237184e-05,
"loss": 1.5552,
"step": 19840
},
{
"epoch": 0.5030091698994412,
"grad_norm": 1.3124053478240967,
"learning_rate": 1.4871114953596682e-05,
"loss": 1.567,
"step": 19871
},
{
"epoch": 0.5037938955935121,
"grad_norm": 1.343239188194275,
"learning_rate": 1.4794391505047256e-05,
"loss": 1.5829,
"step": 19902
},
{
"epoch": 0.504578621287583,
"grad_norm": 1.4160040616989136,
"learning_rate": 1.4717783225402596e-05,
"loss": 1.5479,
"step": 19933
},
{
"epoch": 0.5053633469816539,
"grad_norm": 1.3658647537231445,
"learning_rate": 1.4641290979176735e-05,
"loss": 1.558,
"step": 19964
},
{
"epoch": 0.5061480726757248,
"grad_norm": 1.2913247346878052,
"learning_rate": 1.4564915629574246e-05,
"loss": 1.5795,
"step": 19995
},
{
"epoch": 0.5069327983697957,
"grad_norm": 1.3975298404693604,
"learning_rate": 1.4488658038480601e-05,
"loss": 1.5557,
"step": 20026
},
{
"epoch": 0.5077175240638665,
"grad_norm": 1.342119812965393,
"learning_rate": 1.4412519066452323e-05,
"loss": 1.5727,
"step": 20057
},
{
"epoch": 0.5085022497579375,
"grad_norm": 1.3325005769729614,
"learning_rate": 1.4336499572707373e-05,
"loss": 1.5573,
"step": 20088
},
{
"epoch": 0.5092869754520083,
"grad_norm": 1.3986520767211914,
"learning_rate": 1.4260600415115433e-05,
"loss": 1.5537,
"step": 20119
},
{
"epoch": 0.5100717011460792,
"grad_norm": 1.3560576438903809,
"learning_rate": 1.4184822450188137e-05,
"loss": 1.5529,
"step": 20150
},
{
"epoch": 0.5108564268401501,
"grad_norm": 1.4381458759307861,
"learning_rate": 1.410916653306954e-05,
"loss": 1.5845,
"step": 20181
},
{
"epoch": 0.511641152534221,
"grad_norm": 1.6817706823349,
"learning_rate": 1.403363351752639e-05,
"loss": 1.569,
"step": 20212
},
{
"epoch": 0.5124258782282919,
"grad_norm": 1.3956488370895386,
"learning_rate": 1.3958224255938485e-05,
"loss": 1.5561,
"step": 20243
},
{
"epoch": 0.5132106039223627,
"grad_norm": 1.3474819660186768,
"learning_rate": 1.388293959928911e-05,
"loss": 1.5608,
"step": 20274
},
{
"epoch": 0.5139953296164337,
"grad_norm": 1.286340594291687,
"learning_rate": 1.3807780397155379e-05,
"loss": 1.5661,
"step": 20305
},
{
"epoch": 0.5147800553105045,
"grad_norm": 1.3667712211608887,
"learning_rate": 1.3732747497698655e-05,
"loss": 1.5778,
"step": 20336
},
{
"epoch": 0.5155647810045755,
"grad_norm": 1.4048058986663818,
"learning_rate": 1.3657841747655038e-05,
"loss": 1.5444,
"step": 20367
},
{
"epoch": 0.5163495066986463,
"grad_norm": 1.5085017681121826,
"learning_rate": 1.3583063992325706e-05,
"loss": 1.5657,
"step": 20398
},
{
"epoch": 0.5171342323927173,
"grad_norm": 1.3968846797943115,
"learning_rate": 1.3508415075567496e-05,
"loss": 1.5641,
"step": 20429
},
{
"epoch": 0.5179189580867881,
"grad_norm": 1.403813123703003,
"learning_rate": 1.343389583978327e-05,
"loss": 1.5768,
"step": 20460
},
{
"epoch": 0.5187036837808591,
"grad_norm": 1.3661153316497803,
"learning_rate": 1.3359507125912468e-05,
"loss": 1.5511,
"step": 20491
},
{
"epoch": 0.5194884094749299,
"grad_norm": 1.4918231964111328,
"learning_rate": 1.3285249773421627e-05,
"loss": 1.5552,
"step": 20522
},
{
"epoch": 0.5202731351690008,
"grad_norm": 1.366255521774292,
"learning_rate": 1.3211124620294884e-05,
"loss": 1.5573,
"step": 20553
},
{
"epoch": 0.5210578608630717,
"grad_norm": 1.360115885734558,
"learning_rate": 1.313713250302451e-05,
"loss": 1.5743,
"step": 20584
},
{
"epoch": 0.5218425865571426,
"grad_norm": 1.396219253540039,
"learning_rate": 1.3063274256601479e-05,
"loss": 1.5313,
"step": 20615
},
{
"epoch": 0.5226273122512135,
"grad_norm": 1.3751533031463623,
"learning_rate": 1.2989550714506086e-05,
"loss": 1.554,
"step": 20646
},
{
"epoch": 0.5234120379452843,
"grad_norm": 1.3931307792663574,
"learning_rate": 1.291596270869846e-05,
"loss": 1.572,
"step": 20677
},
{
"epoch": 0.5241967636393553,
"grad_norm": 1.3172565698623657,
"learning_rate": 1.284251106960927e-05,
"loss": 1.556,
"step": 20708
},
{
"epoch": 0.5249814893334261,
"grad_norm": 1.4660224914550781,
"learning_rate": 1.2769196626130263e-05,
"loss": 1.563,
"step": 20739
},
{
"epoch": 0.5257662150274971,
"grad_norm": 1.3981261253356934,
"learning_rate": 1.2696020205604969e-05,
"loss": 1.536,
"step": 20770
},
{
"epoch": 0.5265509407215679,
"grad_norm": 1.3775140047073364,
"learning_rate": 1.2622982633819359e-05,
"loss": 1.5538,
"step": 20801
},
{
"epoch": 0.5273356664156388,
"grad_norm": 1.3806031942367554,
"learning_rate": 1.2550084734992484e-05,
"loss": 1.5717,
"step": 20832
},
{
"epoch": 0.5281203921097097,
"grad_norm": 1.663273572921753,
"learning_rate": 1.247732733176724e-05,
"loss": 1.5474,
"step": 20863
},
{
"epoch": 0.5289051178037806,
"grad_norm": 1.4349000453948975,
"learning_rate": 1.2404711245201044e-05,
"loss": 1.563,
"step": 20894
},
{
"epoch": 0.5296898434978515,
"grad_norm": 1.4207381010055542,
"learning_rate": 1.2332237294756535e-05,
"loss": 1.5769,
"step": 20925
},
{
"epoch": 0.5304745691919224,
"grad_norm": 1.3234254121780396,
"learning_rate": 1.225990629829241e-05,
"loss": 1.5419,
"step": 20956
},
{
"epoch": 0.5312592948859933,
"grad_norm": 1.3426439762115479,
"learning_rate": 1.2187719072054136e-05,
"loss": 1.5479,
"step": 20987
},
{
"epoch": 0.5320440205800642,
"grad_norm": 1.3690837621688843,
"learning_rate": 1.2115676430664735e-05,
"loss": 1.5668,
"step": 21018
},
{
"epoch": 0.532828746274135,
"grad_norm": 1.4441026449203491,
"learning_rate": 1.2043779187115647e-05,
"loss": 1.5663,
"step": 21049
},
{
"epoch": 0.533613471968206,
"grad_norm": 1.379137396812439,
"learning_rate": 1.1972028152757476e-05,
"loss": 1.5704,
"step": 21080
},
{
"epoch": 0.5343981976622768,
"grad_norm": 1.3750004768371582,
"learning_rate": 1.1900424137290889e-05,
"loss": 1.5518,
"step": 21111
},
{
"epoch": 0.5351829233563478,
"grad_norm": 1.465265154838562,
"learning_rate": 1.1828967948757482e-05,
"loss": 1.5539,
"step": 21142
},
{
"epoch": 0.5359676490504186,
"grad_norm": 1.3172025680541992,
"learning_rate": 1.175766039353062e-05,
"loss": 1.5544,
"step": 21173
},
{
"epoch": 0.5367523747444896,
"grad_norm": 1.4065696001052856,
"learning_rate": 1.1686502276306382e-05,
"loss": 1.5586,
"step": 21204
},
{
"epoch": 0.5375371004385604,
"grad_norm": 1.45732581615448,
"learning_rate": 1.1615494400094445e-05,
"loss": 1.5728,
"step": 21235
},
{
"epoch": 0.5383218261326314,
"grad_norm": 1.3364806175231934,
"learning_rate": 1.1544637566209029e-05,
"loss": 1.5569,
"step": 21266
},
{
"epoch": 0.5391065518267022,
"grad_norm": 1.3799667358398438,
"learning_rate": 1.1473932574259886e-05,
"loss": 1.5344,
"step": 21297
},
{
"epoch": 0.539891277520773,
"grad_norm": 1.4128960371017456,
"learning_rate": 1.1403380222143247e-05,
"loss": 1.5546,
"step": 21328
},
{
"epoch": 0.540676003214844,
"grad_norm": 1.5169612169265747,
"learning_rate": 1.1332981306032808e-05,
"loss": 1.5471,
"step": 21359
},
{
"epoch": 0.5414607289089148,
"grad_norm": 1.4209131002426147,
"learning_rate": 1.1262736620370762e-05,
"loss": 1.5654,
"step": 21390
},
{
"epoch": 0.5422454546029858,
"grad_norm": 1.3103234767913818,
"learning_rate": 1.1192646957858854e-05,
"loss": 1.5492,
"step": 21421
},
{
"epoch": 0.5430301802970566,
"grad_norm": 1.7383350133895874,
"learning_rate": 1.1122713109449381e-05,
"loss": 1.5502,
"step": 21452
},
{
"epoch": 0.5438149059911276,
"grad_norm": 1.3104016780853271,
"learning_rate": 1.105293586433634e-05,
"loss": 1.5564,
"step": 21483
},
{
"epoch": 0.5445996316851984,
"grad_norm": 1.3233284950256348,
"learning_rate": 1.0983316009946446e-05,
"loss": 1.5274,
"step": 21514
},
{
"epoch": 0.5453843573792693,
"grad_norm": 1.4942415952682495,
"learning_rate": 1.0913854331930282e-05,
"loss": 1.5643,
"step": 21545
},
{
"epoch": 0.5461690830733402,
"grad_norm": 1.3964463472366333,
"learning_rate": 1.0844551614153456e-05,
"loss": 1.5575,
"step": 21576
},
{
"epoch": 0.5469538087674111,
"grad_norm": 1.4472683668136597,
"learning_rate": 1.0775408638687725e-05,
"loss": 1.5459,
"step": 21607
},
{
"epoch": 0.547738534461482,
"grad_norm": 1.3240516185760498,
"learning_rate": 1.0706426185802165e-05,
"loss": 1.5703,
"step": 21638
},
{
"epoch": 0.5485232601555529,
"grad_norm": 1.3561683893203735,
"learning_rate": 1.0637605033954371e-05,
"loss": 1.5429,
"step": 21669
},
{
"epoch": 0.5493079858496238,
"grad_norm": 1.3770638704299927,
"learning_rate": 1.05689459597817e-05,
"loss": 1.5575,
"step": 21700
},
{
"epoch": 0.5500927115436947,
"grad_norm": 1.4219211339950562,
"learning_rate": 1.050044973809246e-05,
"loss": 1.5392,
"step": 21731
},
{
"epoch": 0.5508774372377656,
"grad_norm": 1.3968154191970825,
"learning_rate": 1.043211714185722e-05,
"loss": 1.559,
"step": 21762
},
{
"epoch": 0.5516621629318365,
"grad_norm": 1.3730138540267944,
"learning_rate": 1.036394894220003e-05,
"loss": 1.5452,
"step": 21793
},
{
"epoch": 0.5524468886259073,
"grad_norm": 1.407535433769226,
"learning_rate": 1.0295945908389751e-05,
"loss": 1.5477,
"step": 21824
},
{
"epoch": 0.5532316143199782,
"grad_norm": 1.440319299697876,
"learning_rate": 1.0228108807831393e-05,
"loss": 1.5483,
"step": 21855
},
{
"epoch": 0.5540163400140491,
"grad_norm": 1.38417649269104,
"learning_rate": 1.01604384060574e-05,
"loss": 1.569,
"step": 21886
},
{
"epoch": 0.55480106570812,
"grad_norm": 1.51227867603302,
"learning_rate": 1.009293546671907e-05,
"loss": 1.5441,
"step": 21917
},
{
"epoch": 0.5555857914021909,
"grad_norm": 1.3792462348937988,
"learning_rate": 1.002560075157791e-05,
"loss": 1.5537,
"step": 21948
},
{
"epoch": 0.5563705170962618,
"grad_norm": 1.3728954792022705,
"learning_rate": 9.958435020496995e-06,
"loss": 1.5463,
"step": 21979
},
{
"epoch": 0.5571552427903327,
"grad_norm": 1.4337445497512817,
"learning_rate": 9.89143903143249e-06,
"loss": 1.5409,
"step": 22010
},
{
"epoch": 0.5579399684844035,
"grad_norm": 1.317431092262268,
"learning_rate": 9.824613540425038e-06,
"loss": 1.5541,
"step": 22041
},
{
"epoch": 0.5587246941784745,
"grad_norm": 1.3596452474594116,
"learning_rate": 9.757959301591197e-06,
"loss": 1.5465,
"step": 22072
},
{
"epoch": 0.5595094198725453,
"grad_norm": 1.4173970222473145,
"learning_rate": 9.691477067115017e-06,
"loss": 1.5534,
"step": 22103
},
{
"epoch": 0.5602941455666163,
"grad_norm": 2.4860451221466064,
"learning_rate": 9.625167587239467e-06,
"loss": 1.5458,
"step": 22134
},
{
"epoch": 0.5610788712606871,
"grad_norm": 1.440307378768921,
"learning_rate": 9.559031610258007e-06,
"loss": 1.5581,
"step": 22165
},
{
"epoch": 0.5618635969547581,
"grad_norm": 1.5789539813995361,
"learning_rate": 9.493069882506164e-06,
"loss": 1.5589,
"step": 22196
},
{
"epoch": 0.5626483226488289,
"grad_norm": 1.3445873260498047,
"learning_rate": 9.427283148353056e-06,
"loss": 1.5533,
"step": 22227
},
{
"epoch": 0.5634330483428999,
"grad_norm": 1.3744895458221436,
"learning_rate": 9.361672150193052e-06,
"loss": 1.5497,
"step": 22258
},
{
"epoch": 0.5642177740369707,
"grad_norm": 1.4480764865875244,
"learning_rate": 9.29623762843734e-06,
"loss": 1.5521,
"step": 22289
},
{
"epoch": 0.5650024997310416,
"grad_norm": 1.3482125997543335,
"learning_rate": 9.230980321505594e-06,
"loss": 1.5514,
"step": 22320
},
{
"epoch": 0.5657872254251125,
"grad_norm": 1.4724624156951904,
"learning_rate": 9.165900965817668e-06,
"loss": 1.558,
"step": 22351
},
{
"epoch": 0.5665719511191833,
"grad_norm": 1.4756817817687988,
"learning_rate": 9.101000295785245e-06,
"loss": 1.5519,
"step": 22382
},
{
"epoch": 0.5673566768132543,
"grad_norm": 1.4908230304718018,
"learning_rate": 9.036279043803565e-06,
"loss": 1.5649,
"step": 22413
},
{
"epoch": 0.5681414025073251,
"grad_norm": 1.2823692560195923,
"learning_rate": 8.971737940243147e-06,
"loss": 1.5561,
"step": 22444
},
{
"epoch": 0.5689261282013961,
"grad_norm": 1.3445894718170166,
"learning_rate": 8.907377713441592e-06,
"loss": 1.5296,
"step": 22475
},
{
"epoch": 0.5697108538954669,
"grad_norm": 1.3359887599945068,
"learning_rate": 8.843199089695293e-06,
"loss": 1.5299,
"step": 22506
},
{
"epoch": 0.5704955795895378,
"grad_norm": 1.4024282693862915,
"learning_rate": 8.779202793251311e-06,
"loss": 1.555,
"step": 22537
},
{
"epoch": 0.5712803052836087,
"grad_norm": 1.402908444404602,
"learning_rate": 8.715389546299149e-06,
"loss": 1.5442,
"step": 22568
},
{
"epoch": 0.5720650309776796,
"grad_norm": 1.3054429292678833,
"learning_rate": 8.651760068962617e-06,
"loss": 1.5491,
"step": 22599
},
{
"epoch": 0.5728497566717505,
"grad_norm": 1.314642071723938,
"learning_rate": 8.588315079291733e-06,
"loss": 1.531,
"step": 22630
},
{
"epoch": 0.5736344823658214,
"grad_norm": 1.2906594276428223,
"learning_rate": 8.52505529325457e-06,
"loss": 1.525,
"step": 22661
},
{
"epoch": 0.5744192080598923,
"grad_norm": 1.391607403755188,
"learning_rate": 8.461981424729216e-06,
"loss": 1.5578,
"step": 22692
},
{
"epoch": 0.5752039337539632,
"grad_norm": 1.5275055170059204,
"learning_rate": 8.399094185495725e-06,
"loss": 1.5468,
"step": 22723
},
{
"epoch": 0.5759886594480341,
"grad_norm": 1.4094804525375366,
"learning_rate": 8.336394285228017e-06,
"loss": 1.5336,
"step": 22754
},
{
"epoch": 0.576773385142105,
"grad_norm": 1.4096417427062988,
"learning_rate": 8.273882431485952e-06,
"loss": 1.5386,
"step": 22785
},
{
"epoch": 0.5775581108361758,
"grad_norm": 1.4015659093856812,
"learning_rate": 8.211559329707316e-06,
"loss": 1.5514,
"step": 22816
},
{
"epoch": 0.5783428365302468,
"grad_norm": 1.4353171586990356,
"learning_rate": 8.149425683199823e-06,
"loss": 1.5432,
"step": 22847
},
{
"epoch": 0.5791275622243176,
"grad_norm": 1.3493109941482544,
"learning_rate": 8.08748219313325e-06,
"loss": 1.5387,
"step": 22878
},
{
"epoch": 0.5799122879183886,
"grad_norm": 1.376868486404419,
"learning_rate": 8.025729558531453e-06,
"loss": 1.5397,
"step": 22909
},
{
"epoch": 0.5806970136124594,
"grad_norm": 1.4415427446365356,
"learning_rate": 7.964168476264508e-06,
"loss": 1.5556,
"step": 22940
},
{
"epoch": 0.5814817393065304,
"grad_norm": 1.4281046390533447,
"learning_rate": 7.902799641040884e-06,
"loss": 1.5312,
"step": 22971
},
{
"epoch": 0.5822664650006012,
"grad_norm": 1.372336983680725,
"learning_rate": 7.841623745399523e-06,
"loss": 1.5437,
"step": 23002
},
{
"epoch": 0.583051190694672,
"grad_norm": 1.3720817565917969,
"learning_rate": 7.780641479702114e-06,
"loss": 1.5599,
"step": 23033
},
{
"epoch": 0.583835916388743,
"grad_norm": 1.3714765310287476,
"learning_rate": 7.719853532125227e-06,
"loss": 1.5256,
"step": 23064
},
{
"epoch": 0.5846206420828138,
"grad_norm": 1.3198277950286865,
"learning_rate": 7.65926058865258e-06,
"loss": 1.5609,
"step": 23095
},
{
"epoch": 0.5854053677768848,
"grad_norm": 1.3970394134521484,
"learning_rate": 7.598863333067313e-06,
"loss": 1.552,
"step": 23126
},
{
"epoch": 0.5861900934709556,
"grad_norm": 1.3451225757598877,
"learning_rate": 7.538662446944253e-06,
"loss": 1.5407,
"step": 23157
},
{
"epoch": 0.5869748191650266,
"grad_norm": 1.3626407384872437,
"learning_rate": 7.478658609642211e-06,
"loss": 1.528,
"step": 23188
},
{
"epoch": 0.5877595448590974,
"grad_norm": 1.295155644416809,
"learning_rate": 7.418852498296327e-06,
"loss": 1.5396,
"step": 23219
},
{
"epoch": 0.5885442705531684,
"grad_norm": 1.4162577390670776,
"learning_rate": 7.359244787810457e-06,
"loss": 1.5442,
"step": 23250
},
{
"epoch": 0.5893289962472392,
"grad_norm": 1.4795522689819336,
"learning_rate": 7.299836150849493e-06,
"loss": 1.5724,
"step": 23281
},
{
"epoch": 0.5901137219413101,
"grad_norm": 1.4080073833465576,
"learning_rate": 7.240627257831847e-06,
"loss": 1.5673,
"step": 23312
},
{
"epoch": 0.590898447635381,
"grad_norm": 1.2865021228790283,
"learning_rate": 7.1816187769218195e-06,
"loss": 1.5529,
"step": 23343
},
{
"epoch": 0.5916831733294519,
"grad_norm": 2.568460464477539,
"learning_rate": 7.1228113740220895e-06,
"loss": 1.5379,
"step": 23374
},
{
"epoch": 0.5924678990235228,
"grad_norm": 1.4487184286117554,
"learning_rate": 7.064205712766226e-06,
"loss": 1.5417,
"step": 23405
},
{
"epoch": 0.5932526247175937,
"grad_norm": 1.3384840488433838,
"learning_rate": 7.005802454511129e-06,
"loss": 1.5481,
"step": 23436
},
{
"epoch": 0.5940373504116646,
"grad_norm": 1.3432554006576538,
"learning_rate": 6.947602258329639e-06,
"loss": 1.521,
"step": 23467
},
{
"epoch": 0.5948220761057355,
"grad_norm": 1.3277153968811035,
"learning_rate": 6.889605781003078e-06,
"loss": 1.5348,
"step": 23498
},
{
"epoch": 0.5956068017998063,
"grad_norm": 1.4018425941467285,
"learning_rate": 6.831813677013776e-06,
"loss": 1.5319,
"step": 23529
},
{
"epoch": 0.5963915274938772,
"grad_norm": 1.44899582862854,
"learning_rate": 6.774226598537792e-06,
"loss": 1.5624,
"step": 23560
},
{
"epoch": 0.5971762531879481,
"grad_norm": 1.4060876369476318,
"learning_rate": 6.716845195437482e-06,
"loss": 1.5487,
"step": 23591
},
{
"epoch": 0.597960978882019,
"grad_norm": 1.4121522903442383,
"learning_rate": 6.659670115254168e-06,
"loss": 1.5332,
"step": 23622
},
{
"epoch": 0.5987457045760899,
"grad_norm": 1.3269188404083252,
"learning_rate": 6.602702003200872e-06,
"loss": 1.5276,
"step": 23653
},
{
"epoch": 0.5995304302701608,
"grad_norm": 1.3662550449371338,
"learning_rate": 6.545941502154992e-06,
"loss": 1.5629,
"step": 23684
},
{
"epoch": 0.6003151559642317,
"grad_norm": 1.4438221454620361,
"learning_rate": 6.489389252651057e-06,
"loss": 1.5496,
"step": 23715
},
{
"epoch": 0.6010998816583026,
"grad_norm": 1.422269344329834,
"learning_rate": 6.4330458928735325e-06,
"loss": 1.533,
"step": 23746
},
{
"epoch": 0.6018846073523735,
"grad_norm": 1.3922473192214966,
"learning_rate": 6.376912058649559e-06,
"loss": 1.5198,
"step": 23777
},
{
"epoch": 0.6026693330464443,
"grad_norm": 1.4476711750030518,
"learning_rate": 6.320988383441845e-06,
"loss": 1.55,
"step": 23808
},
{
"epoch": 0.6034540587405153,
"grad_norm": 1.3881078958511353,
"learning_rate": 6.265275498341452e-06,
"loss": 1.524,
"step": 23839
},
{
"epoch": 0.6042387844345861,
"grad_norm": 1.4356231689453125,
"learning_rate": 6.209774032060714e-06,
"loss": 1.5334,
"step": 23870
},
{
"epoch": 0.6050235101286571,
"grad_norm": 1.34247624874115,
"learning_rate": 6.1544846109261365e-06,
"loss": 1.5309,
"step": 23901
},
{
"epoch": 0.6058082358227279,
"grad_norm": 1.3616281747817993,
"learning_rate": 6.099407858871342e-06,
"loss": 1.5202,
"step": 23932
},
{
"epoch": 0.6065929615167989,
"grad_norm": 1.4779770374298096,
"learning_rate": 6.044544397429958e-06,
"loss": 1.5266,
"step": 23963
},
{
"epoch": 0.6073776872108697,
"grad_norm": 1.3740448951721191,
"learning_rate": 5.989894845728708e-06,
"loss": 1.5251,
"step": 23994
},
{
"epoch": 0.6081624129049406,
"grad_norm": 1.3835887908935547,
"learning_rate": 5.9354598204803605e-06,
"loss": 1.5349,
"step": 24025
},
{
"epoch": 0.6089471385990115,
"grad_norm": 1.419488549232483,
"learning_rate": 5.881239935976762e-06,
"loss": 1.5236,
"step": 24056
},
{
"epoch": 0.6097318642930823,
"grad_norm": 1.3918389081954956,
"learning_rate": 5.827235804081954e-06,
"loss": 1.5534,
"step": 24087
},
{
"epoch": 0.6105165899871533,
"grad_norm": 1.4750800132751465,
"learning_rate": 5.773448034225221e-06,
"loss": 1.5322,
"step": 24118
},
{
"epoch": 0.6113013156812241,
"grad_norm": 1.4278340339660645,
"learning_rate": 5.719877233394228e-06,
"loss": 1.5626,
"step": 24149
},
{
"epoch": 0.6120860413752951,
"grad_norm": 1.43100106716156,
"learning_rate": 5.666524006128191e-06,
"loss": 1.5411,
"step": 24180
},
{
"epoch": 0.6128707670693659,
"grad_norm": 1.397022008895874,
"learning_rate": 5.613388954511015e-06,
"loss": 1.5233,
"step": 24211
},
{
"epoch": 0.6136554927634369,
"grad_norm": 1.2984530925750732,
"learning_rate": 5.560472678164552e-06,
"loss": 1.5487,
"step": 24242
},
{
"epoch": 0.6144402184575077,
"grad_norm": 1.318934679031372,
"learning_rate": 5.507775774241775e-06,
"loss": 1.5627,
"step": 24273
},
{
"epoch": 0.6152249441515786,
"grad_norm": 1.4760456085205078,
"learning_rate": 5.4552988374200945e-06,
"loss": 1.5222,
"step": 24304
},
{
"epoch": 0.6160096698456495,
"grad_norm": 1.350392460823059,
"learning_rate": 5.403042459894597e-06,
"loss": 1.535,
"step": 24335
},
{
"epoch": 0.6167943955397204,
"grad_norm": 1.3857702016830444,
"learning_rate": 5.3510072313714135e-06,
"loss": 1.5483,
"step": 24366
},
{
"epoch": 0.6175791212337913,
"grad_norm": 1.4854798316955566,
"learning_rate": 5.2991937390610205e-06,
"loss": 1.5381,
"step": 24397
},
{
"epoch": 0.6183638469278622,
"grad_norm": 1.3600910902023315,
"learning_rate": 5.247602567671625e-06,
"loss": 1.5277,
"step": 24428
},
{
"epoch": 0.6191485726219331,
"grad_norm": 1.3631632328033447,
"learning_rate": 5.196234299402603e-06,
"loss": 1.5583,
"step": 24459
},
{
"epoch": 0.619933298316004,
"grad_norm": 1.4225085973739624,
"learning_rate": 5.145089513937865e-06,
"loss": 1.5346,
"step": 24490
},
{
"epoch": 0.6207180240100749,
"grad_norm": 1.3548002243041992,
"learning_rate": 5.094168788439369e-06,
"loss": 1.546,
"step": 24521
},
{
"epoch": 0.6215027497041458,
"grad_norm": 1.4630082845687866,
"learning_rate": 5.043472697540594e-06,
"loss": 1.549,
"step": 24552
},
{
"epoch": 0.6222874753982166,
"grad_norm": 1.4638261795043945,
"learning_rate": 4.993001813340012e-06,
"loss": 1.5224,
"step": 24583
},
{
"epoch": 0.6230722010922876,
"grad_norm": 1.3274465799331665,
"learning_rate": 4.942756705394702e-06,
"loss": 1.538,
"step": 24614
},
{
"epoch": 0.6238569267863584,
"grad_norm": 1.4302935600280762,
"learning_rate": 4.892737940713884e-06,
"loss": 1.545,
"step": 24645
},
{
"epoch": 0.6246416524804294,
"grad_norm": 1.4292621612548828,
"learning_rate": 4.842946083752511e-06,
"loss": 1.5275,
"step": 24676
},
{
"epoch": 0.6254263781745002,
"grad_norm": 1.3631361722946167,
"learning_rate": 4.79338169640493e-06,
"loss": 1.5552,
"step": 24707
},
{
"epoch": 0.6262111038685712,
"grad_norm": 1.4284039735794067,
"learning_rate": 4.74404533799851e-06,
"loss": 1.5298,
"step": 24738
},
{
"epoch": 0.626995829562642,
"grad_norm": 1.4611119031906128,
"learning_rate": 4.694937565287344e-06,
"loss": 1.5414,
"step": 24769
},
{
"epoch": 0.6277805552567128,
"grad_norm": 1.37677800655365,
"learning_rate": 4.646058932445985e-06,
"loss": 1.5392,
"step": 24800
},
{
"epoch": 0.6285652809507838,
"grad_norm": 1.4582575559616089,
"learning_rate": 4.597409991063148e-06,
"loss": 1.5317,
"step": 24831
},
{
"epoch": 0.6293500066448546,
"grad_norm": 1.3665950298309326,
"learning_rate": 4.5489912901355375e-06,
"loss": 1.5514,
"step": 24862
},
{
"epoch": 0.6301347323389256,
"grad_norm": 1.3817001581192017,
"learning_rate": 4.500803376061608e-06,
"loss": 1.5343,
"step": 24893
},
{
"epoch": 0.6309194580329964,
"grad_norm": 1.4217463731765747,
"learning_rate": 4.45284679263541e-06,
"loss": 1.5247,
"step": 24924
},
{
"epoch": 0.6317041837270674,
"grad_norm": 1.3985430002212524,
"learning_rate": 4.4051220810404775e-06,
"loss": 1.5348,
"step": 24955
},
{
"epoch": 0.6324889094211382,
"grad_norm": 1.4616161584854126,
"learning_rate": 4.3576297798437025e-06,
"loss": 1.5563,
"step": 24986
},
{
"epoch": 0.6332736351152092,
"grad_norm": 1.3955610990524292,
"learning_rate": 4.3103704249892436e-06,
"loss": 1.5204,
"step": 25017
},
{
"epoch": 0.63405836080928,
"grad_norm": 1.3720837831497192,
"learning_rate": 4.263344549792487e-06,
"loss": 1.5379,
"step": 25048
},
{
"epoch": 0.6348430865033509,
"grad_norm": 1.347891092300415,
"learning_rate": 4.216552684934056e-06,
"loss": 1.5285,
"step": 25079
},
{
"epoch": 0.6356278121974218,
"grad_norm": 1.5957375764846802,
"learning_rate": 4.169995358453777e-06,
"loss": 1.5163,
"step": 25110
},
{
"epoch": 0.6364125378914927,
"grad_norm": 1.3431944847106934,
"learning_rate": 4.123673095744757e-06,
"loss": 1.5378,
"step": 25141
},
{
"epoch": 0.6371972635855636,
"grad_norm": 1.4405794143676758,
"learning_rate": 4.077586419547435e-06,
"loss": 1.5563,
"step": 25172
},
{
"epoch": 0.6379819892796345,
"grad_norm": 1.3969746828079224,
"learning_rate": 4.03173584994368e-06,
"loss": 1.5441,
"step": 25203
},
{
"epoch": 0.6387667149737054,
"grad_norm": 1.542013168334961,
"learning_rate": 3.986121904350948e-06,
"loss": 1.5249,
"step": 25234
},
{
"epoch": 0.6395514406677762,
"grad_norm": 1.4267256259918213,
"learning_rate": 3.940745097516407e-06,
"loss": 1.5184,
"step": 25265
},
{
"epoch": 0.6403361663618471,
"grad_norm": 1.331272840499878,
"learning_rate": 3.89560594151116e-06,
"loss": 1.5437,
"step": 25296
},
{
"epoch": 0.641120892055918,
"grad_norm": 1.368691086769104,
"learning_rate": 3.850704945724456e-06,
"loss": 1.5265,
"step": 25327
},
{
"epoch": 0.6419056177499889,
"grad_norm": 1.3770484924316406,
"learning_rate": 3.8060426168579077e-06,
"loss": 1.5291,
"step": 25358
},
{
"epoch": 0.6426903434440598,
"grad_norm": 1.4727221727371216,
"learning_rate": 3.7616194589198407e-06,
"loss": 1.5326,
"step": 25389
},
{
"epoch": 0.6434750691381307,
"grad_norm": 1.3571360111236572,
"learning_rate": 3.7174359732195574e-06,
"loss": 1.5278,
"step": 25420
},
{
"epoch": 0.6442597948322016,
"grad_norm": 1.4054335355758667,
"learning_rate": 3.673492658361677e-06,
"loss": 1.5405,
"step": 25451
},
{
"epoch": 0.6450445205262725,
"grad_norm": 1.4510763883590698,
"learning_rate": 3.6297900102405467e-06,
"loss": 1.5409,
"step": 25482
},
{
"epoch": 0.6458292462203434,
"grad_norm": 1.5653456449508667,
"learning_rate": 3.586328522034607e-06,
"loss": 1.5224,
"step": 25513
},
{
"epoch": 0.6466139719144143,
"grad_norm": 1.4818406105041504,
"learning_rate": 3.543108684200838e-06,
"loss": 1.5251,
"step": 25544
},
{
"epoch": 0.6473986976084851,
"grad_norm": 1.4254684448242188,
"learning_rate": 3.5001309844692464e-06,
"loss": 1.5219,
"step": 25575
},
{
"epoch": 0.6481834233025561,
"grad_norm": 1.348809838294983,
"learning_rate": 3.4573959078373215e-06,
"loss": 1.5285,
"step": 25606
},
{
"epoch": 0.6489681489966269,
"grad_norm": 1.4553576707839966,
"learning_rate": 3.4149039365646063e-06,
"loss": 1.5419,
"step": 25637
},
{
"epoch": 0.6497528746906979,
"grad_norm": 1.412490963935852,
"learning_rate": 3.3726555501672143e-06,
"loss": 1.5186,
"step": 25668
},
{
"epoch": 0.6505376003847687,
"grad_norm": 1.4104843139648438,
"learning_rate": 3.33065122541244e-06,
"loss": 1.5254,
"step": 25699
},
{
"epoch": 0.6513223260788397,
"grad_norm": 1.3806548118591309,
"learning_rate": 3.288891436313385e-06,
"loss": 1.5272,
"step": 25730
},
{
"epoch": 0.6521070517729105,
"grad_norm": 1.4207285642623901,
"learning_rate": 3.2473766541235963e-06,
"loss": 1.536,
"step": 25761
},
{
"epoch": 0.6528917774669813,
"grad_norm": 1.3559178113937378,
"learning_rate": 3.2061073473317466e-06,
"loss": 1.5394,
"step": 25792
},
{
"epoch": 0.6536765031610523,
"grad_norm": 1.3517690896987915,
"learning_rate": 3.1650839816563444e-06,
"loss": 1.5488,
"step": 25823
},
{
"epoch": 0.6544612288551231,
"grad_norm": 1.3978461027145386,
"learning_rate": 3.1243070200405093e-06,
"loss": 1.5261,
"step": 25854
},
{
"epoch": 0.6552459545491941,
"grad_norm": 1.3550540208816528,
"learning_rate": 3.0837769226467e-06,
"loss": 1.5254,
"step": 25885
},
{
"epoch": 0.6560306802432649,
"grad_norm": 1.3790268898010254,
"learning_rate": 3.0434941468515666e-06,
"loss": 1.5224,
"step": 25916
},
{
"epoch": 0.6568154059373359,
"grad_norm": 1.3558413982391357,
"learning_rate": 3.003459147240753e-06,
"loss": 1.5179,
"step": 25947
},
{
"epoch": 0.6576001316314067,
"grad_norm": 1.3683024644851685,
"learning_rate": 2.9636723756037875e-06,
"loss": 1.5191,
"step": 25978
},
{
"epoch": 0.6583848573254777,
"grad_norm": 1.4349849224090576,
"learning_rate": 2.9241342809289833e-06,
"loss": 1.5417,
"step": 26009
},
{
"epoch": 0.6591695830195485,
"grad_norm": 1.3950988054275513,
"learning_rate": 2.8848453093983594e-06,
"loss": 1.5267,
"step": 26040
},
{
"epoch": 0.6599543087136194,
"grad_norm": 1.3628458976745605,
"learning_rate": 2.8458059043826257e-06,
"loss": 1.5294,
"step": 26071
},
{
"epoch": 0.6607390344076903,
"grad_norm": 1.3483256101608276,
"learning_rate": 2.807016506436172e-06,
"loss": 1.5498,
"step": 26102
},
{
"epoch": 0.6615237601017612,
"grad_norm": 1.3618528842926025,
"learning_rate": 2.7684775532920566e-06,
"loss": 1.5271,
"step": 26133
},
{
"epoch": 0.6623084857958321,
"grad_norm": 1.49851393699646,
"learning_rate": 2.7301894798571425e-06,
"loss": 1.526,
"step": 26164
},
{
"epoch": 0.663093211489903,
"grad_norm": 1.5132079124450684,
"learning_rate": 2.6921527182071386e-06,
"loss": 1.5418,
"step": 26195
},
{
"epoch": 0.6638779371839739,
"grad_norm": 1.4265996217727661,
"learning_rate": 2.654367697581725e-06,
"loss": 1.5455,
"step": 26226
},
{
"epoch": 0.6646626628780448,
"grad_norm": 1.506589412689209,
"learning_rate": 2.6168348443797175e-06,
"loss": 1.5209,
"step": 26257
},
{
"epoch": 0.6654473885721156,
"grad_norm": 1.3662431240081787,
"learning_rate": 2.5795545821542757e-06,
"loss": 1.5169,
"step": 26288
},
{
"epoch": 0.6662321142661866,
"grad_norm": 1.4398752450942993,
"learning_rate": 2.54252733160808e-06,
"loss": 1.5491,
"step": 26319
},
{
"epoch": 0.6670168399602574,
"grad_norm": 1.4776362180709839,
"learning_rate": 2.5057535105886294e-06,
"loss": 1.5192,
"step": 26350
},
{
"epoch": 0.6678015656543284,
"grad_norm": 1.3796826601028442,
"learning_rate": 2.4692335340834953e-06,
"loss": 1.5245,
"step": 26381
},
{
"epoch": 0.6685862913483992,
"grad_norm": 1.3923054933547974,
"learning_rate": 2.432967814215639e-06,
"loss": 1.5252,
"step": 26412
},
{
"epoch": 0.6693710170424702,
"grad_norm": 1.3372383117675781,
"learning_rate": 2.396956760238794e-06,
"loss": 1.5227,
"step": 26443
},
{
"epoch": 0.670155742736541,
"grad_norm": 1.3287001848220825,
"learning_rate": 2.361200778532796e-06,
"loss": 1.5335,
"step": 26474
},
{
"epoch": 0.670940468430612,
"grad_norm": 1.3403995037078857,
"learning_rate": 2.325700272599049e-06,
"loss": 1.5304,
"step": 26505
},
{
"epoch": 0.6717251941246828,
"grad_norm": 1.3469324111938477,
"learning_rate": 2.2904556430559415e-06,
"loss": 1.5329,
"step": 26536
},
{
"epoch": 0.6725099198187536,
"grad_norm": 1.4993536472320557,
"learning_rate": 2.2554672876343106e-06,
"loss": 1.5228,
"step": 26567
},
{
"epoch": 0.6732946455128246,
"grad_norm": 1.3785438537597656,
"learning_rate": 2.220735601173002e-06,
"loss": 1.516,
"step": 26598
},
{
"epoch": 0.6740793712068954,
"grad_norm": 1.3642317056655884,
"learning_rate": 2.186260975614382e-06,
"loss": 1.5467,
"step": 26629
},
{
"epoch": 0.6748640969009664,
"grad_norm": 1.3815925121307373,
"learning_rate": 2.1520437999999034e-06,
"loss": 1.5449,
"step": 26660
},
{
"epoch": 0.6756488225950372,
"grad_norm": 1.3854280710220337,
"learning_rate": 2.1180844604657526e-06,
"loss": 1.5177,
"step": 26691
},
{
"epoch": 0.6764335482891082,
"grad_norm": 1.4565620422363281,
"learning_rate": 2.084383340238455e-06,
"loss": 1.5119,
"step": 26722
},
{
"epoch": 0.677218273983179,
"grad_norm": 1.35818612575531,
"learning_rate": 2.0509408196305704e-06,
"loss": 1.5084,
"step": 26753
},
{
"epoch": 0.6780029996772499,
"grad_norm": 1.4125559329986572,
"learning_rate": 2.017757276036403e-06,
"loss": 1.5101,
"step": 26784
},
{
"epoch": 0.6787877253713208,
"grad_norm": 1.43025803565979,
"learning_rate": 1.984833083927726e-06,
"loss": 1.5318,
"step": 26815
},
{
"epoch": 0.6795724510653917,
"grad_norm": 1.3963549137115479,
"learning_rate": 1.952168614849581e-06,
"loss": 1.5248,
"step": 26846
},
{
"epoch": 0.6803571767594626,
"grad_norm": 1.4896256923675537,
"learning_rate": 1.919764237416058e-06,
"loss": 1.5409,
"step": 26877
},
{
"epoch": 0.6811419024535335,
"grad_norm": 1.3385494947433472,
"learning_rate": 1.8876203173061463e-06,
"loss": 1.5371,
"step": 26908
},
{
"epoch": 0.6819266281476044,
"grad_norm": 1.3572068214416504,
"learning_rate": 1.8557372172596206e-06,
"loss": 1.5394,
"step": 26939
},
{
"epoch": 0.6827113538416753,
"grad_norm": 1.455278992652893,
"learning_rate": 1.8241152970729341e-06,
"loss": 1.5345,
"step": 26970
},
{
"epoch": 0.6834960795357462,
"grad_norm": 1.4417409896850586,
"learning_rate": 1.7927549135951572e-06,
"loss": 1.5252,
"step": 27001
},
{
"epoch": 0.684280805229817,
"grad_norm": 1.4233667850494385,
"learning_rate": 1.7616564207239477e-06,
"loss": 1.5221,
"step": 27032
},
{
"epoch": 0.6850655309238879,
"grad_norm": 1.4328643083572388,
"learning_rate": 1.730820169401584e-06,
"loss": 1.508,
"step": 27063
},
{
"epoch": 0.6858502566179588,
"grad_norm": 1.3445032835006714,
"learning_rate": 1.7002465076109558e-06,
"loss": 1.5209,
"step": 27094
},
{
"epoch": 0.6866349823120297,
"grad_norm": 1.4214242696762085,
"learning_rate": 1.6699357803716898e-06,
"loss": 1.5297,
"step": 27125
},
{
"epoch": 0.6874197080061006,
"grad_norm": 1.3590694665908813,
"learning_rate": 1.6398883297362305e-06,
"loss": 1.5351,
"step": 27156
},
{
"epoch": 0.6882044337001715,
"grad_norm": 1.4039976596832275,
"learning_rate": 1.6101044947859606e-06,
"loss": 1.5529,
"step": 27187
},
{
"epoch": 0.6889891593942424,
"grad_norm": 1.3939241170883179,
"learning_rate": 1.5805846116274114e-06,
"loss": 1.509,
"step": 27218
},
{
"epoch": 0.6897738850883133,
"grad_norm": 1.4963489770889282,
"learning_rate": 1.5513290133884611e-06,
"loss": 1.5526,
"step": 27249
},
{
"epoch": 0.6905586107823841,
"grad_norm": 1.413089632987976,
"learning_rate": 1.5223380302145512e-06,
"loss": 1.5271,
"step": 27280
},
{
"epoch": 0.6913433364764551,
"grad_norm": 1.4136161804199219,
"learning_rate": 1.4936119892649925e-06,
"loss": 1.5365,
"step": 27311
},
{
"epoch": 0.6921280621705259,
"grad_norm": 1.4144634008407593,
"learning_rate": 1.4651512147092482e-06,
"loss": 1.5255,
"step": 27342
},
{
"epoch": 0.6929127878645969,
"grad_norm": 1.3424650430679321,
"learning_rate": 1.4369560277232908e-06,
"loss": 1.5275,
"step": 27373
},
{
"epoch": 0.6936975135586677,
"grad_norm": 1.4057984352111816,
"learning_rate": 1.409026746485978e-06,
"loss": 1.5273,
"step": 27404
},
{
"epoch": 0.6944822392527387,
"grad_norm": 1.4132764339447021,
"learning_rate": 1.3813636861754464e-06,
"loss": 1.5219,
"step": 27435
},
{
"epoch": 0.6952669649468095,
"grad_norm": 1.541971206665039,
"learning_rate": 1.3539671589655773e-06,
"loss": 1.5413,
"step": 27466
},
{
"epoch": 0.6960516906408805,
"grad_norm": 1.4268949031829834,
"learning_rate": 1.3268374740224548e-06,
"loss": 1.5298,
"step": 27497
},
{
"epoch": 0.6968364163349513,
"grad_norm": 1.427729606628418,
"learning_rate": 1.2999749375008807e-06,
"loss": 1.5239,
"step": 27528
},
{
"epoch": 0.6976211420290221,
"grad_norm": 1.4411410093307495,
"learning_rate": 1.2733798525409346e-06,
"loss": 1.5215,
"step": 27559
},
{
"epoch": 0.6984058677230931,
"grad_norm": 1.4318063259124756,
"learning_rate": 1.2470525192645383e-06,
"loss": 1.5238,
"step": 27590
},
{
"epoch": 0.6991905934171639,
"grad_norm": 1.4129235744476318,
"learning_rate": 1.2209932347720666e-06,
"loss": 1.5239,
"step": 27621
},
{
"epoch": 0.6999753191112349,
"grad_norm": 1.355039358139038,
"learning_rate": 1.1952022931389972e-06,
"loss": 1.5205,
"step": 27652
},
{
"epoch": 0.7007600448053057,
"grad_norm": 1.6766573190689087,
"learning_rate": 1.1696799854126083e-06,
"loss": 1.5369,
"step": 27683
},
{
"epoch": 0.7015447704993767,
"grad_norm": 1.380895733833313,
"learning_rate": 1.1444265996086694e-06,
"loss": 1.5116,
"step": 27714
},
{
"epoch": 0.7023294961934475,
"grad_norm": 1.3622218370437622,
"learning_rate": 1.119442420708211e-06,
"loss": 1.5265,
"step": 27745
},
{
"epoch": 0.7031142218875185,
"grad_norm": 1.4054621458053589,
"learning_rate": 1.0947277306542964e-06,
"loss": 1.5249,
"step": 27776
},
{
"epoch": 0.7038989475815893,
"grad_norm": 1.27810537815094,
"learning_rate": 1.0702828083488353e-06,
"loss": 1.5321,
"step": 27807
},
{
"epoch": 0.7046836732756602,
"grad_norm": 1.380940318107605,
"learning_rate": 1.0461079296494647e-06,
"loss": 1.5381,
"step": 27838
},
{
"epoch": 0.7054683989697311,
"grad_norm": 1.3913681507110596,
"learning_rate": 1.0222033673663978e-06,
"loss": 1.5334,
"step": 27869
},
{
"epoch": 0.706253124663802,
"grad_norm": 1.3444581031799316,
"learning_rate": 9.985693912593713e-07,
"loss": 1.5329,
"step": 27900
},
{
"epoch": 0.7070378503578729,
"grad_norm": 1.3935922384262085,
"learning_rate": 9.752062680346035e-07,
"loss": 1.5152,
"step": 27931
},
{
"epoch": 0.7078225760519438,
"grad_norm": 1.354137897491455,
"learning_rate": 9.521142613417494e-07,
"loss": 1.5419,
"step": 27962
},
{
"epoch": 0.7086073017460147,
"grad_norm": 1.417913556098938,
"learning_rate": 9.292936317709722e-07,
"loss": 1.5379,
"step": 27993
},
{
"epoch": 0.7093920274400856,
"grad_norm": 1.3628367185592651,
"learning_rate": 9.067446368499793e-07,
"loss": 1.551,
"step": 28024
},
{
"epoch": 0.7101767531341564,
"grad_norm": 1.3860423564910889,
"learning_rate": 8.844675310411055e-07,
"loss": 1.5221,
"step": 28055
},
{
"epoch": 0.7109614788282274,
"grad_norm": 1.374284267425537,
"learning_rate": 8.6246256573847e-07,
"loss": 1.4989,
"step": 28086
},
{
"epoch": 0.7117462045222982,
"grad_norm": 1.3612192869186401,
"learning_rate": 8.407299892651127e-07,
"loss": 1.5131,
"step": 28117
},
{
"epoch": 0.7125309302163692,
"grad_norm": 1.4345417022705078,
"learning_rate": 8.19270046870202e-07,
"loss": 1.5248,
"step": 28148
},
{
"epoch": 0.71331565591044,
"grad_norm": 1.388961911201477,
"learning_rate": 7.980829807262752e-07,
"loss": 1.5283,
"step": 28179
},
{
"epoch": 0.714100381604511,
"grad_norm": 1.5089038610458374,
"learning_rate": 7.771690299264889e-07,
"loss": 1.528,
"step": 28210
},
{
"epoch": 0.7148851072985818,
"grad_norm": 1.334241509437561,
"learning_rate": 7.565284304819426e-07,
"loss": 1.5018,
"step": 28241
},
{
"epoch": 0.7156698329926527,
"grad_norm": 1.3419960737228394,
"learning_rate": 7.361614153189922e-07,
"loss": 1.5168,
"step": 28272
},
{
"epoch": 0.7164545586867236,
"grad_norm": 1.3039295673370361,
"learning_rate": 7.160682142766328e-07,
"loss": 1.547,
"step": 28303
},
{
"epoch": 0.7172392843807944,
"grad_norm": 1.5037273168563843,
"learning_rate": 6.962490541039091e-07,
"loss": 1.5523,
"step": 28334
},
{
"epoch": 0.7180240100748654,
"grad_norm": 1.3340791463851929,
"learning_rate": 6.767041584573531e-07,
"loss": 1.5431,
"step": 28365
},
{
"epoch": 0.7188087357689362,
"grad_norm": 1.8582775592803955,
"learning_rate": 6.574337478984532e-07,
"loss": 1.5477,
"step": 28396
},
{
"epoch": 0.7195934614630072,
"grad_norm": 1.4675524234771729,
"learning_rate": 6.384380398911732e-07,
"loss": 1.5331,
"step": 28427
},
{
"epoch": 0.720378187157078,
"grad_norm": 1.338765025138855,
"learning_rate": 6.197172487994951e-07,
"loss": 1.5164,
"step": 28458
},
{
"epoch": 0.721162912851149,
"grad_norm": 1.3680214881896973,
"learning_rate": 6.012715858850021e-07,
"loss": 1.5406,
"step": 28489
},
{
"epoch": 0.7219476385452198,
"grad_norm": 1.3477078676223755,
"learning_rate": 5.831012593044971e-07,
"loss": 1.5144,
"step": 28520
},
{
"epoch": 0.7227323642392907,
"grad_norm": 1.4228640794754028,
"learning_rate": 5.652064741076435e-07,
"loss": 1.5467,
"step": 28551
},
{
"epoch": 0.7235170899333616,
"grad_norm": 1.4127750396728516,
"learning_rate": 5.475874322346558e-07,
"loss": 1.5395,
"step": 28582
},
{
"epoch": 0.7243018156274325,
"grad_norm": 1.3759944438934326,
"learning_rate": 5.30244332514035e-07,
"loss": 1.5379,
"step": 28613
},
{
"epoch": 0.7250865413215034,
"grad_norm": 1.3767083883285522,
"learning_rate": 5.131773706602977e-07,
"loss": 1.5401,
"step": 28644
},
{
"epoch": 0.7258712670155743,
"grad_norm": 1.3337562084197998,
"learning_rate": 4.963867392717897e-07,
"loss": 1.5305,
"step": 28675
},
{
"epoch": 0.7266559927096452,
"grad_norm": 1.407812476158142,
"learning_rate": 4.798726278285093e-07,
"loss": 1.5161,
"step": 28706
},
{
"epoch": 0.727440718403716,
"grad_norm": 1.4337633848190308,
"learning_rate": 4.6363522268995097e-07,
"loss": 1.5357,
"step": 28737
},
{
"epoch": 0.728225444097787,
"grad_norm": 1.4671465158462524,
"learning_rate": 4.4767470709302927e-07,
"loss": 1.5153,
"step": 28768
},
{
"epoch": 0.7290101697918578,
"grad_norm": 1.3277357816696167,
"learning_rate": 4.319912611499971e-07,
"loss": 1.519,
"step": 28799
},
{
"epoch": 0.7297948954859287,
"grad_norm": 1.4174885749816895,
"learning_rate": 4.1658506184640564e-07,
"loss": 1.5265,
"step": 28830
},
{
"epoch": 0.7305796211799996,
"grad_norm": 1.4684560298919678,
"learning_rate": 4.0145628303911996e-07,
"loss": 1.5182,
"step": 28861
},
{
"epoch": 0.7313643468740705,
"grad_norm": 1.3946303129196167,
"learning_rate": 3.866050954543565e-07,
"loss": 1.5254,
"step": 28892
},
{
"epoch": 0.7321490725681414,
"grad_norm": 1.441939353942871,
"learning_rate": 3.720316666857432e-07,
"loss": 1.5315,
"step": 28923
},
{
"epoch": 0.7329337982622123,
"grad_norm": 1.3914129734039307,
"learning_rate": 3.5773616119244845e-07,
"loss": 1.5241,
"step": 28954
},
{
"epoch": 0.7337185239562832,
"grad_norm": 1.4593554735183716,
"learning_rate": 3.437187402973052e-07,
"loss": 1.5354,
"step": 28985
},
{
"epoch": 0.7345032496503541,
"grad_norm": 1.524565577507019,
"learning_rate": 3.2997956218500104e-07,
"loss": 1.5286,
"step": 29016
},
{
"epoch": 0.7352879753444249,
"grad_norm": 1.4271135330200195,
"learning_rate": 3.165187819003018e-07,
"loss": 1.5184,
"step": 29047
},
{
"epoch": 0.7360727010384959,
"grad_norm": 1.4928288459777832,
"learning_rate": 3.033365513462755e-07,
"loss": 1.5114,
"step": 29078
},
{
"epoch": 0.7368574267325667,
"grad_norm": 1.414491057395935,
"learning_rate": 2.9043301928260437e-07,
"loss": 1.5574,
"step": 29109
},
{
"epoch": 0.7376421524266377,
"grad_norm": 1.392284631729126,
"learning_rate": 2.7780833132389773e-07,
"loss": 1.5316,
"step": 29140
},
{
"epoch": 0.7384268781207085,
"grad_norm": 1.416320562362671,
"learning_rate": 2.6546262993803473e-07,
"loss": 1.539,
"step": 29171
},
{
"epoch": 0.7392116038147795,
"grad_norm": 1.418097734451294,
"learning_rate": 2.533960544445879e-07,
"loss": 1.5296,
"step": 29202
},
{
"epoch": 0.7399963295088503,
"grad_norm": 1.3826491832733154,
"learning_rate": 2.416087410132134e-07,
"loss": 1.5418,
"step": 29233
},
{
"epoch": 0.7407810552029213,
"grad_norm": 1.431630253791809,
"learning_rate": 2.301008226621465e-07,
"loss": 1.537,
"step": 29264
},
{
"epoch": 0.7415657808969921,
"grad_norm": 1.351166009902954,
"learning_rate": 2.1887242925668073e-07,
"loss": 1.5006,
"step": 29295
},
{
"epoch": 0.7423505065910629,
"grad_norm": 1.377264380455017,
"learning_rate": 2.0792368750770785e-07,
"loss": 1.5,
"step": 29326
},
{
"epoch": 0.7431352322851339,
"grad_norm": 1.4020991325378418,
"learning_rate": 1.9725472097028851e-07,
"loss": 1.5495,
"step": 29357
},
{
"epoch": 0.7439199579792047,
"grad_norm": 1.395375370979309,
"learning_rate": 1.8686565004226718e-07,
"loss": 1.535,
"step": 29388
},
{
"epoch": 0.7447046836732757,
"grad_norm": 1.424237608909607,
"learning_rate": 1.7675659196288995e-07,
"loss": 1.5265,
"step": 29419
},
{
"epoch": 0.7454894093673465,
"grad_norm": 1.432045817375183,
"learning_rate": 1.6692766081150556e-07,
"loss": 1.5005,
"step": 29450
},
{
"epoch": 0.7462741350614175,
"grad_norm": 1.3229856491088867,
"learning_rate": 1.5737896750626647e-07,
"loss": 1.5464,
"step": 29481
},
{
"epoch": 0.7470588607554883,
"grad_norm": 1.4807835817337036,
"learning_rate": 1.4811061980287976e-07,
"loss": 1.5113,
"step": 29512
},
{
"epoch": 0.7478435864495592,
"grad_norm": 1.3511358499526978,
"learning_rate": 1.3912272229338886e-07,
"loss": 1.546,
"step": 29543
},
{
"epoch": 0.7486283121436301,
"grad_norm": 1.330914855003357,
"learning_rate": 1.3041537640499645e-07,
"loss": 1.5271,
"step": 29574
},
{
"epoch": 0.749413037837701,
"grad_norm": 1.6085385084152222,
"learning_rate": 1.2198868039891564e-07,
"loss": 1.5346,
"step": 29605
},
{
"epoch": 0.7501977635317719,
"grad_norm": 1.3291810750961304,
"learning_rate": 1.138427293692651e-07,
"loss": 1.5132,
"step": 29636
},
{
"epoch": 0.7509824892258428,
"grad_norm": 1.367587685585022,
"learning_rate": 1.0597761524199778e-07,
"loss": 1.5226,
"step": 29667
},
{
"epoch": 0.7517672149199137,
"grad_norm": 1.4591524600982666,
"learning_rate": 9.839342677385455e-08,
"loss": 1.5447,
"step": 29698
},
{
"epoch": 0.7525519406139846,
"grad_norm": 1.3880685567855835,
"learning_rate": 9.109024955137325e-08,
"loss": 1.5304,
"step": 29729
},
{
"epoch": 0.7533366663080555,
"grad_norm": 1.3544681072235107,
"learning_rate": 8.406816598991729e-08,
"loss": 1.5362,
"step": 29760
},
{
"epoch": 0.7541213920021264,
"grad_norm": 1.398155927658081,
"learning_rate": 7.73272553327431e-08,
"loss": 1.5075,
"step": 29791
},
{
"epoch": 0.7549061176961972,
"grad_norm": 1.4514081478118896,
"learning_rate": 7.086759365011186e-08,
"loss": 1.518,
"step": 29822
},
{
"epoch": 0.7556908433902682,
"grad_norm": 1.4363720417022705,
"learning_rate": 6.468925383842639e-08,
"loss": 1.5194,
"step": 29853
},
{
"epoch": 0.756475569084339,
"grad_norm": 1.4076640605926514,
"learning_rate": 5.8792305619415067e-08,
"loss": 1.5053,
"step": 29884
},
{
"epoch": 0.75726029477841,
"grad_norm": 1.3342225551605225,
"learning_rate": 5.317681553933529e-08,
"loss": 1.52,
"step": 29915
},
{
"epoch": 0.7580450204724808,
"grad_norm": 1.3737679719924927,
"learning_rate": 4.78428469682296e-08,
"loss": 1.5417,
"step": 29946
},
{
"epoch": 0.7588297461665517,
"grad_norm": 1.4676426649093628,
"learning_rate": 4.2790460099206844e-08,
"loss": 1.538,
"step": 29977
},
{
"epoch": 0.7596144718606226,
"grad_norm": 1.693217396736145,
"learning_rate": 3.801971194777043e-08,
"loss": 1.5206,
"step": 30008
},
{
"epoch": 0.7603991975546934,
"grad_norm": 1.4130475521087646,
"learning_rate": 3.353065635115782e-08,
"loss": 1.5305,
"step": 30039
},
{
"epoch": 0.7611839232487644,
"grad_norm": 1.4824076890945435,
"learning_rate": 2.93233439677576e-08,
"loss": 1.5118,
"step": 30070
},
{
"epoch": 0.7619686489428352,
"grad_norm": 1.3690931797027588,
"learning_rate": 2.539782227651555e-08,
"loss": 1.5165,
"step": 30101
},
{
"epoch": 0.7627533746369062,
"grad_norm": 1.366620421409607,
"learning_rate": 2.175413557641004e-08,
"loss": 1.5418,
"step": 30132
},
{
"epoch": 0.763538100330977,
"grad_norm": 1.4603701829910278,
"learning_rate": 1.839232498594967e-08,
"loss": 1.5311,
"step": 30163
},
{
"epoch": 0.764322826025048,
"grad_norm": 1.339460015296936,
"learning_rate": 1.5312428442712522e-08,
"loss": 1.5446,
"step": 30194
},
{
"epoch": 0.7651075517191188,
"grad_norm": 1.4318183660507202,
"learning_rate": 1.2514480702913168e-08,
"loss": 1.5058,
"step": 30225
},
{
"epoch": 0.7658922774131898,
"grad_norm": 1.366489291191101,
"learning_rate": 9.998513341005766e-09,
"loss": 1.5299,
"step": 30256
},
{
"epoch": 0.7666770031072606,
"grad_norm": 1.4269777536392212,
"learning_rate": 7.764554749345454e-09,
"loss": 1.5383,
"step": 30287
},
{
"epoch": 0.7674617288013315,
"grad_norm": 1.3449435234069824,
"learning_rate": 5.812630137849717e-09,
"loss": 1.5281,
"step": 30318
},
{
"epoch": 0.7682464544954024,
"grad_norm": 1.5927067995071411,
"learning_rate": 4.142761533723616e-09,
"loss": 1.5119,
"step": 30349
},
{
"epoch": 0.7690311801894733,
"grad_norm": 1.3833186626434326,
"learning_rate": 2.7549677812044317e-09,
"loss": 1.5108,
"step": 30380
},
{
"epoch": 0.7698159058835442,
"grad_norm": 1.47097909450531,
"learning_rate": 1.6492645413590525e-09,
"loss": 1.5346,
"step": 30411
},
{
"epoch": 0.770600631577615,
"grad_norm": 1.407623052597046,
"learning_rate": 8.256642918980096e-10,
"loss": 1.5191,
"step": 30442
},
{
"epoch": 0.771385357271686,
"grad_norm": 1.4193038940429688,
"learning_rate": 2.841763270367004e-10,
"loss": 1.5294,
"step": 30473
},
{
"epoch": 0.7721700829657568,
"grad_norm": 1.5234286785125732,
"learning_rate": 2.480675739269245e-11,
"loss": 1.5123,
"step": 30504
}
],
"logging_steps": 31,
"max_steps": 30517,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 3052,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.263722516828232e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}