{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9992821249102656,
  "eval_steps": 500,
  "global_step": 1044,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.000957166786312515,
      "grad_norm": 3.9142152723532337,
      "learning_rate": 1.9047619047619051e-06,
      "loss": 1.3978,
      "step": 1
    },
    {
      "epoch": 0.004785833931562575,
      "grad_norm": 1.3430217420733046,
      "learning_rate": 9.523809523809523e-06,
      "loss": 1.3489,
      "step": 5
    },
    {
      "epoch": 0.00957166786312515,
      "grad_norm": 0.5850408636793494,
      "learning_rate": 1.9047619047619046e-05,
      "loss": 1.2871,
      "step": 10
    },
    {
      "epoch": 0.014357501794687724,
      "grad_norm": 0.46666716038967326,
      "learning_rate": 2.857142857142857e-05,
      "loss": 1.2106,
      "step": 15
    },
    {
      "epoch": 0.0191433357262503,
      "grad_norm": 0.35044064248530404,
      "learning_rate": 3.809523809523809e-05,
      "loss": 1.189,
      "step": 20
    },
    {
      "epoch": 0.023929169657812874,
      "grad_norm": 0.27361957875198517,
      "learning_rate": 4.761904761904762e-05,
      "loss": 1.1469,
      "step": 25
    },
    {
      "epoch": 0.028715003589375447,
      "grad_norm": 0.2368453005937937,
      "learning_rate": 5.714285714285714e-05,
      "loss": 1.158,
      "step": 30
    },
    {
      "epoch": 0.03350083752093802,
      "grad_norm": 0.2277332385016794,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.1437,
      "step": 35
    },
    {
      "epoch": 0.0382866714525006,
      "grad_norm": 0.2265653549311157,
      "learning_rate": 7.619047619047618e-05,
      "loss": 1.1302,
      "step": 40
    },
    {
      "epoch": 0.043072505384063174,
      "grad_norm": 0.22079711284915807,
      "learning_rate": 8.571428571428571e-05,
      "loss": 1.13,
      "step": 45
    },
    {
      "epoch": 0.04785833931562575,
      "grad_norm": 0.20813516832540208,
      "learning_rate": 9.523809523809524e-05,
      "loss": 1.106,
      "step": 50
    },
    {
      "epoch": 0.05264417324718832,
      "grad_norm": 0.2044131638757028,
      "learning_rate": 0.00010476190476190477,
      "loss": 1.1348,
      "step": 55
    },
    {
      "epoch": 0.057430007178750894,
      "grad_norm": 0.20101729146508107,
      "learning_rate": 0.00011428571428571428,
      "loss": 1.1018,
      "step": 60
    },
    {
      "epoch": 0.062215841110313475,
      "grad_norm": 0.21865369935553125,
      "learning_rate": 0.0001238095238095238,
      "loss": 1.1129,
      "step": 65
    },
    {
      "epoch": 0.06700167504187604,
      "grad_norm": 0.18405578482864565,
      "learning_rate": 0.00013333333333333334,
      "loss": 1.1018,
      "step": 70
    },
    {
      "epoch": 0.07178750897343862,
      "grad_norm": 0.18488079729650672,
      "learning_rate": 0.00014285714285714287,
      "loss": 1.1417,
      "step": 75
    },
    {
      "epoch": 0.0765733429050012,
      "grad_norm": 0.18433481759594844,
      "learning_rate": 0.00015238095238095237,
      "loss": 1.111,
      "step": 80
    },
    {
      "epoch": 0.08135917683656377,
      "grad_norm": 0.20377971597879482,
      "learning_rate": 0.00016190476190476192,
      "loss": 1.0709,
      "step": 85
    },
    {
      "epoch": 0.08614501076812635,
      "grad_norm": 0.20225554382239913,
      "learning_rate": 0.00017142857142857143,
      "loss": 1.1142,
      "step": 90
    },
    {
      "epoch": 0.09093084469968891,
      "grad_norm": 0.18520967333311886,
      "learning_rate": 0.00018095238095238095,
      "loss": 1.1142,
      "step": 95
    },
    {
      "epoch": 0.0957166786312515,
      "grad_norm": 0.19606367225373053,
      "learning_rate": 0.00019047619047619048,
      "loss": 1.1049,
      "step": 100
    },
    {
      "epoch": 0.10050251256281408,
      "grad_norm": 0.1867473714189168,
      "learning_rate": 0.0002,
      "loss": 1.0927,
      "step": 105
    },
    {
      "epoch": 0.10528834649437664,
      "grad_norm": 0.185817071062854,
      "learning_rate": 0.00019998600836567816,
      "loss": 1.1206,
      "step": 110
    },
    {
      "epoch": 0.11007418042593922,
      "grad_norm": 0.1762396939142846,
      "learning_rate": 0.00019994403737802927,
      "loss": 1.1022,
      "step": 115
    },
    {
      "epoch": 0.11486001435750179,
      "grad_norm": 0.16668242539032083,
      "learning_rate": 0.00019987409878190752,
      "loss": 1.1052,
      "step": 120
    },
    {
      "epoch": 0.11964584828906437,
      "grad_norm": 0.1764182848939352,
      "learning_rate": 0.00019977621214841822,
      "loss": 1.1059,
      "step": 125
    },
    {
      "epoch": 0.12443168222062695,
      "grad_norm": 0.1787380695504439,
      "learning_rate": 0.0001996504048694409,
      "loss": 1.1102,
      "step": 130
    },
    {
      "epoch": 0.12921751615218952,
      "grad_norm": 0.18152638872711746,
      "learning_rate": 0.00019949671214996445,
      "loss": 1.0986,
      "step": 135
    },
    {
      "epoch": 0.13400335008375208,
      "grad_norm": 0.1745817045352934,
      "learning_rate": 0.00019931517699823547,
      "loss": 1.085,
      "step": 140
    },
    {
      "epoch": 0.13878918401531468,
      "grad_norm": 0.17554030590687575,
      "learning_rate": 0.0001991058502137231,
      "loss": 1.1363,
      "step": 145
    },
    {
      "epoch": 0.14357501794687724,
      "grad_norm": 0.1800405053641118,
      "learning_rate": 0.00019886879037290384,
      "loss": 1.0924,
      "step": 150
    },
    {
      "epoch": 0.1483608518784398,
      "grad_norm": 0.19148632700424054,
      "learning_rate": 0.0001986040638128698,
      "loss": 1.0824,
      "step": 155
    },
    {
      "epoch": 0.1531466858100024,
      "grad_norm": 0.17313730878343153,
      "learning_rate": 0.0001983117446127654,
      "loss": 1.1071,
      "step": 160
    },
    {
      "epoch": 0.15793251974156497,
      "grad_norm": 0.16961371492797625,
      "learning_rate": 0.00019799191457305768,
      "loss": 1.1311,
      "step": 165
    },
    {
      "epoch": 0.16271835367312754,
      "grad_norm": 0.17327733992734365,
      "learning_rate": 0.00019764466319264595,
      "loss": 1.1133,
      "step": 170
    },
    {
      "epoch": 0.16750418760469013,
      "grad_norm": 0.17176717180251114,
      "learning_rate": 0.00019727008764381675,
      "loss": 1.1153,
      "step": 175
    },
    {
      "epoch": 0.1722900215362527,
      "grad_norm": 0.17138393560502058,
      "learning_rate": 0.0001968682927450523,
      "loss": 1.1006,
      "step": 180
    },
    {
      "epoch": 0.17707585546781526,
      "grad_norm": 0.16465106542109514,
      "learning_rate": 0.00019643939093169844,
      "loss": 1.104,
      "step": 185
    },
    {
      "epoch": 0.18186168939937783,
      "grad_norm": 0.1699157446206454,
      "learning_rate": 0.00019598350222450178,
      "loss": 1.1167,
      "step": 190
    },
    {
      "epoch": 0.18664752333094042,
      "grad_norm": 0.17602023031266878,
      "learning_rate": 0.00019550075419602408,
      "loss": 1.1131,
      "step": 195
    },
    {
      "epoch": 0.191433357262503,
      "grad_norm": 0.18282225583073394,
      "learning_rate": 0.00019499128193494297,
      "loss": 1.0889,
      "step": 200
    },
    {
      "epoch": 0.19621919119406556,
      "grad_norm": 0.16528930064048408,
      "learning_rate": 0.0001944552280082499,
      "loss": 1.1013,
      "step": 205
    },
    {
      "epoch": 0.20100502512562815,
      "grad_norm": 0.16504045631379008,
      "learning_rate": 0.0001938927424213553,
      "loss": 1.1003,
      "step": 210
    },
    {
      "epoch": 0.20579085905719072,
      "grad_norm": 0.16559761424705557,
      "learning_rate": 0.000193303982576112,
      "loss": 1.0998,
      "step": 215
    },
    {
      "epoch": 0.21057669298875328,
      "grad_norm": 0.16791680669282769,
      "learning_rate": 0.0001926891132267692,
      "loss": 1.0919,
      "step": 220
    },
    {
      "epoch": 0.21536252692031588,
      "grad_norm": 0.16689008275055853,
      "learning_rate": 0.00019204830643386868,
      "loss": 1.1069,
      "step": 225
    },
    {
      "epoch": 0.22014836085187844,
      "grad_norm": 0.17305744880787605,
      "learning_rate": 0.00019138174151609683,
      "loss": 1.1272,
      "step": 230
    },
    {
      "epoch": 0.224934194783441,
      "grad_norm": 0.1618964004882894,
      "learning_rate": 0.00019068960500010523,
      "loss": 1.0827,
      "step": 235
    },
    {
      "epoch": 0.22972002871500358,
      "grad_norm": 0.16411871929076272,
      "learning_rate": 0.00018997209056831462,
      "loss": 1.1164,
      "step": 240
    },
    {
      "epoch": 0.23450586264656617,
      "grad_norm": 0.1691454723246668,
      "learning_rate": 0.0001892293990047159,
      "loss": 1.1079,
      "step": 245
    },
    {
      "epoch": 0.23929169657812874,
      "grad_norm": 0.1649265891086064,
      "learning_rate": 0.00018846173813868454,
      "loss": 1.0825,
      "step": 250
    },
    {
      "epoch": 0.2440775305096913,
      "grad_norm": 0.17018378559137046,
      "learning_rate": 0.000187669322786823,
      "loss": 1.1216,
      "step": 255
    },
    {
      "epoch": 0.2488633644412539,
      "grad_norm": 0.1703316481606123,
      "learning_rate": 0.0001868523746928479,
      "loss": 1.0783,
      "step": 260
    },
    {
      "epoch": 0.25364919837281646,
      "grad_norm": 0.16735434292797727,
      "learning_rate": 0.0001860111224655391,
      "loss": 1.1149,
      "step": 265
    },
    {
      "epoch": 0.25843503230437903,
      "grad_norm": 0.1555718882454273,
      "learning_rate": 0.0001851458015147673,
      "loss": 1.1075,
      "step": 270
    },
    {
      "epoch": 0.2632208662359416,
      "grad_norm": 0.16977397286003804,
      "learning_rate": 0.00018425665398561883,
      "loss": 1.0852,
      "step": 275
    },
    {
      "epoch": 0.26800670016750416,
      "grad_norm": 0.1663432558952086,
      "learning_rate": 0.00018334392869063536,
      "loss": 1.0811,
      "step": 280
    },
    {
      "epoch": 0.2727925340990668,
      "grad_norm": 0.1666368907912767,
      "learning_rate": 0.00018240788104018822,
      "loss": 1.1014,
      "step": 285
    },
    {
      "epoch": 0.27757836803062935,
      "grad_norm": 0.15687924477714424,
      "learning_rate": 0.00018144877297100606,
      "loss": 1.0736,
      "step": 290
    },
    {
      "epoch": 0.2823642019621919,
      "grad_norm": 0.16061101558707236,
      "learning_rate": 0.0001804668728728764,
      "loss": 1.0931,
      "step": 295
    },
    {
      "epoch": 0.2871500358937545,
      "grad_norm": 0.160793101364396,
      "learning_rate": 0.00017946245551354157,
      "loss": 1.0999,
      "step": 300
    },
    {
      "epoch": 0.29193586982531705,
      "grad_norm": 0.1633263725476254,
      "learning_rate": 0.00017843580196180952,
      "loss": 1.0948,
      "step": 305
    },
    {
      "epoch": 0.2967217037568796,
      "grad_norm": 0.1631452387362867,
      "learning_rate": 0.00017738719950890168,
      "loss": 1.1013,
      "step": 310
    },
    {
      "epoch": 0.3015075376884422,
      "grad_norm": 0.16210535171429089,
      "learning_rate": 0.00017631694158805946,
      "loss": 1.0798,
      "step": 315
    },
    {
      "epoch": 0.3062933716200048,
      "grad_norm": 0.16229934636841442,
      "learning_rate": 0.000175225327692432,
      "loss": 1.0575,
      "step": 320
    },
    {
      "epoch": 0.3110792055515674,
      "grad_norm": 0.16588195121854765,
      "learning_rate": 0.00017411266329126824,
      "loss": 1.096,
      "step": 325
    },
    {
      "epoch": 0.31586503948312994,
      "grad_norm": 0.158210200244668,
      "learning_rate": 0.00017297925974443673,
      "loss": 1.1071,
      "step": 330
    },
    {
      "epoch": 0.3206508734146925,
      "grad_norm": 0.1663784778299078,
      "learning_rate": 0.00017182543421529676,
      "loss": 1.0739,
      "step": 335
    },
    {
      "epoch": 0.32543670734625507,
      "grad_norm": 0.15384028200238806,
      "learning_rate": 0.00017065150958194586,
      "loss": 1.0848,
      "step": 340
    },
    {
      "epoch": 0.33022254127781764,
      "grad_norm": 0.16073214574887584,
      "learning_rate": 0.00016945781434686783,
      "loss": 1.1157,
      "step": 345
    },
    {
      "epoch": 0.33500837520938026,
      "grad_norm": 0.1745939193140414,
      "learning_rate": 0.00016824468254500704,
      "loss": 1.0815,
      "step": 350
    },
    {
      "epoch": 0.3397942091409428,
      "grad_norm": 0.15802897019970708,
      "learning_rate": 0.0001670124536502947,
      "loss": 1.0779,
      "step": 355
    },
    {
      "epoch": 0.3445800430725054,
      "grad_norm": 0.1579431494377225,
      "learning_rate": 0.00016576147248065267,
      "loss": 1.1031,
      "step": 360
    },
    {
      "epoch": 0.34936587700406796,
      "grad_norm": 0.16455288633589932,
      "learning_rate": 0.00016449208910150232,
      "loss": 1.1207,
      "step": 365
    },
    {
      "epoch": 0.3541517109356305,
      "grad_norm": 0.15512720174775488,
      "learning_rate": 0.00016320465872780477,
      "loss": 1.0843,
      "step": 370
    },
    {
      "epoch": 0.3589375448671931,
      "grad_norm": 0.15810739086397552,
      "learning_rate": 0.00016189954162466012,
      "loss": 1.0674,
      "step": 375
    },
    {
      "epoch": 0.36372337879875566,
      "grad_norm": 0.15539897008538223,
      "learning_rate": 0.0001605771030064934,
      "loss": 1.1075,
      "step": 380
    },
    {
      "epoch": 0.3685092127303183,
      "grad_norm": 0.16059302879871643,
      "learning_rate": 0.00015923771293485585,
      "loss": 1.1083,
      "step": 385
    },
    {
      "epoch": 0.37329504666188085,
      "grad_norm": 0.1726863039386017,
      "learning_rate": 0.00015788174621486934,
      "loss": 1.0839,
      "step": 390
    },
    {
      "epoch": 0.3780808805934434,
      "grad_norm": 0.160896911699282,
      "learning_rate": 0.00015650958229034391,
      "loss": 1.093,
      "step": 395
    },
    {
      "epoch": 0.382866714525006,
      "grad_norm": 0.1539033105501165,
      "learning_rate": 0.00015512160513759672,
      "loss": 1.0824,
      "step": 400
    },
    {
      "epoch": 0.38765254845656855,
      "grad_norm": 0.15253934847352404,
      "learning_rate": 0.00015371820315800315,
      "loss": 1.0611,
      "step": 405
    },
    {
      "epoch": 0.3924383823881311,
      "grad_norm": 0.1549203336671571,
      "learning_rate": 0.00015229976906930935,
      "loss": 1.0926,
      "step": 410
    },
    {
      "epoch": 0.3972242163196937,
      "grad_norm": 0.15736586699846142,
      "learning_rate": 0.0001508666997957369,
      "loss": 1.0838,
      "step": 415
    },
    {
      "epoch": 0.4020100502512563,
      "grad_norm": 0.15414651629074486,
      "learning_rate": 0.00014941939635691035,
      "loss": 1.0962,
      "step": 420
    },
    {
      "epoch": 0.40679588418281887,
      "grad_norm": 0.15216014768555902,
      "learning_rate": 0.00014795826375563925,
      "loss": 1.0837,
      "step": 425
    },
    {
      "epoch": 0.41158171811438143,
      "grad_norm": 0.1551252486012846,
      "learning_rate": 0.0001464837108645845,
      "loss": 1.096,
      "step": 430
    },
    {
      "epoch": 0.416367552045944,
      "grad_norm": 0.15880410911617168,
      "learning_rate": 0.00014499615031184296,
      "loss": 1.0947,
      "step": 435
    },
    {
      "epoch": 0.42115338597750657,
      "grad_norm": 0.16084656769756484,
      "learning_rate": 0.00014349599836548034,
      "loss": 1.0955,
      "step": 440
    },
    {
      "epoch": 0.42593921990906913,
      "grad_norm": 0.14942909791958908,
      "learning_rate": 0.0001419836748170459,
      "loss": 1.0911,
      "step": 445
    },
    {
      "epoch": 0.43072505384063176,
      "grad_norm": 0.16134597273400678,
      "learning_rate": 0.0001404596028641009,
      "loss": 1.1136,
      "step": 450
    },
    {
      "epoch": 0.4355108877721943,
      "grad_norm": 0.15552785776756606,
      "learning_rate": 0.0001389242089917943,
      "loss": 1.1005,
      "step": 455
    },
    {
      "epoch": 0.4402967217037569,
      "grad_norm": 0.1544583591443468,
      "learning_rate": 0.00013737792285351805,
      "loss": 1.0896,
      "step": 460
    },
    {
      "epoch": 0.44508255563531945,
      "grad_norm": 0.15743294110434283,
      "learning_rate": 0.0001358211771506763,
      "loss": 1.0687,
      "step": 465
    },
    {
      "epoch": 0.449868389566882,
      "grad_norm": 0.15489693015617015,
      "learning_rate": 0.00013425440751160112,
      "loss": 1.0909,
      "step": 470
    },
    {
      "epoch": 0.4546542234984446,
      "grad_norm": 0.1556280787651109,
      "learning_rate": 0.00013267805236964967,
      "loss": 1.1008,
      "step": 475
    },
    {
      "epoch": 0.45944005743000715,
      "grad_norm": 0.16139496091159036,
      "learning_rate": 0.00013109255284051615,
      "loss": 1.1167,
      "step": 480
    },
    {
      "epoch": 0.4642258913615698,
      "grad_norm": 0.15380326887200926,
      "learning_rate": 0.00012949835259879304,
      "loss": 1.1021,
      "step": 485
    },
    {
      "epoch": 0.46901172529313234,
      "grad_norm": 0.1504710821626308,
      "learning_rate": 0.00012789589775381676,
      "loss": 1.0824,
      "step": 490
    },
    {
      "epoch": 0.4737975592246949,
      "grad_norm": 0.16882632755621252,
      "learning_rate": 0.00012628563672483146,
      "loss": 1.091,
      "step": 495
    },
    {
      "epoch": 0.4785833931562575,
      "grad_norm": 0.16236683430294702,
      "learning_rate": 0.0001246680201155068,
      "loss": 1.0609,
      "step": 500
    },
    {
      "epoch": 0.48336922708782004,
      "grad_norm": 0.1534881294655078,
      "learning_rate": 0.00012304350058784405,
      "loss": 1.0611,
      "step": 505
    },
    {
      "epoch": 0.4881550610193826,
      "grad_norm": 0.16620841316700394,
      "learning_rate": 0.00012141253273550696,
      "loss": 1.0932,
      "step": 510
    },
    {
      "epoch": 0.49294089495094523,
      "grad_norm": 0.16942714030828704,
      "learning_rate": 0.00011977557295661108,
      "loss": 1.0856,
      "step": 515
    },
    {
      "epoch": 0.4977267288825078,
      "grad_norm": 0.15500201031703087,
      "learning_rate": 0.00011813307932600887,
      "loss": 1.0852,
      "step": 520
    },
    {
      "epoch": 0.5025125628140703,
      "grad_norm": 0.15248801968172002,
      "learning_rate": 0.00011648551146710556,
      "loss": 1.1069,
      "step": 525
    },
    {
      "epoch": 0.5072983967456329,
      "grad_norm": 0.14978453385390675,
      "learning_rate": 0.0001148333304232411,
      "loss": 1.088,
      "step": 530
    },
    {
      "epoch": 0.5120842306771956,
      "grad_norm": 0.14736066147246124,
      "learning_rate": 0.00011317699852867548,
      "loss": 1.0506,
      "step": 535
    },
    {
      "epoch": 0.5168700646087581,
      "grad_norm": 0.15088998664120562,
      "learning_rate": 0.0001115169792792124,
      "loss": 1.0972,
      "step": 540
    },
    {
      "epoch": 0.5216558985403207,
      "grad_norm": 0.14676026138747209,
      "learning_rate": 0.00010985373720249801,
      "loss": 1.0871,
      "step": 545
    },
    {
      "epoch": 0.5264417324718832,
      "grad_norm": 0.17054822297185676,
      "learning_rate": 0.00010818773772803082,
      "loss": 1.0957,
      "step": 550
    },
    {
      "epoch": 0.5312275664034458,
      "grad_norm": 0.15081743477470166,
      "learning_rate": 0.0001065194470569193,
      "loss": 1.1114,
      "step": 555
    },
    {
      "epoch": 0.5360134003350083,
      "grad_norm": 0.1556600989117304,
      "learning_rate": 0.0001048493320314238,
      "loss": 1.0747,
      "step": 560
    },
    {
      "epoch": 0.540799234266571,
      "grad_norm": 0.15346464585086714,
      "learning_rate": 0.00010317786000431851,
      "loss": 1.0761,
      "step": 565
    },
    {
      "epoch": 0.5455850681981336,
      "grad_norm": 0.15178562379014646,
      "learning_rate": 0.00010150549870811107,
      "loss": 1.0839,
      "step": 570
    },
    {
      "epoch": 0.5503709021296961,
      "grad_norm": 0.15263581024104103,
      "learning_rate": 9.983271612415575e-05,
      "loss": 1.0742,
      "step": 575
    },
    {
      "epoch": 0.5551567360612587,
      "grad_norm": 0.15166582071053056,
      "learning_rate": 9.81599803516968e-05,
      "loss": 1.0725,
      "step": 580
    },
    {
      "epoch": 0.5599425699928212,
      "grad_norm": 0.14735687803417952,
      "learning_rate": 9.648775947687912e-05,
      "loss": 1.0705,
      "step": 585
    },
    {
      "epoch": 0.5647284039243838,
      "grad_norm": 0.14825818203221888,
      "learning_rate": 9.48165214417624e-05,
      "loss": 1.0871,
      "step": 590
    },
    {
      "epoch": 0.5695142378559463,
      "grad_norm": 0.15700946642781993,
      "learning_rate": 9.314673391337576e-05,
      "loss": 1.0979,
      "step": 595
    },
    {
      "epoch": 0.574300071787509,
      "grad_norm": 0.15580031067347558,
      "learning_rate": 9.147886415284903e-05,
      "loss": 1.0592,
      "step": 600
    },
    {
      "epoch": 0.5790859057190716,
      "grad_norm": 0.14548002556094225,
      "learning_rate": 8.981337888465788e-05,
      "loss": 1.0787,
      "step": 605
    },
    {
      "epoch": 0.5838717396506341,
      "grad_norm": 0.14237124600928142,
      "learning_rate": 8.815074416601913e-05,
      "loss": 1.0698,
      "step": 610
    },
    {
      "epoch": 0.5886575735821967,
      "grad_norm": 0.15304745525626437,
      "learning_rate": 8.649142525647272e-05,
      "loss": 1.0848,
      "step": 615
    },
    {
      "epoch": 0.5934434075137592,
      "grad_norm": 0.14513336716190856,
      "learning_rate": 8.48358864876867e-05,
      "loss": 1.0462,
      "step": 620
    },
    {
      "epoch": 0.5982292414453219,
      "grad_norm": 0.1468415945819683,
      "learning_rate": 8.318459113352221e-05,
      "loss": 1.0906,
      "step": 625
    },
    {
      "epoch": 0.6030150753768844,
      "grad_norm": 0.14408143553897426,
      "learning_rate": 8.153800128039441e-05,
      "loss": 1.085,
      "step": 630
    },
    {
      "epoch": 0.607800909308447,
      "grad_norm": 0.15046217184291616,
      "learning_rate": 7.989657769796533e-05,
      "loss": 1.0882,
      "step": 635
    },
    {
      "epoch": 0.6125867432400096,
      "grad_norm": 0.14348283659906289,
      "learning_rate": 7.82607797102056e-05,
      "loss": 1.0861,
      "step": 640
    },
    {
      "epoch": 0.6173725771715721,
      "grad_norm": 0.14685503152106738,
      "learning_rate": 7.663106506686057e-05,
      "loss": 1.1003,
      "step": 645
    },
    {
      "epoch": 0.6221584111031347,
      "grad_norm": 0.1480277391784376,
      "learning_rate": 7.500788981535708e-05,
      "loss": 1.0758,
      "step": 650
    },
    {
      "epoch": 0.6269442450346973,
      "grad_norm": 0.1477910922274185,
      "learning_rate": 7.339170817318625e-05,
      "loss": 1.0695,
      "step": 655
    },
    {
      "epoch": 0.6317300789662599,
      "grad_norm": 0.1551465349289344,
      "learning_rate": 7.178297240079882e-05,
      "loss": 1.0942,
      "step": 660
    },
    {
      "epoch": 0.6365159128978225,
      "grad_norm": 0.148811465121087,
      "learning_rate": 7.018213267504775e-05,
      "loss": 1.0825,
      "step": 665
    },
    {
      "epoch": 0.641301746829385,
      "grad_norm": 0.146937156337137,
      "learning_rate": 6.858963696321403e-05,
      "loss": 1.0985,
      "step": 670
    },
    {
      "epoch": 0.6460875807609476,
      "grad_norm": 0.14703161191479286,
      "learning_rate": 6.700593089765086e-05,
      "loss": 1.06,
      "step": 675
    },
    {
      "epoch": 0.6508734146925101,
      "grad_norm": 0.14564360148371303,
      "learning_rate": 6.543145765108106e-05,
      "loss": 1.0853,
      "step": 680
    },
    {
      "epoch": 0.6556592486240728,
      "grad_norm": 0.14887365645849163,
      "learning_rate": 6.3866657812583e-05,
      "loss": 1.0787,
      "step": 685
    },
    {
      "epoch": 0.6604450825556353,
      "grad_norm": 0.14533659914404762,
      "learning_rate": 6.231196926429913e-05,
      "loss": 1.073,
      "step": 690
    },
    {
      "epoch": 0.6652309164871979,
      "grad_norm": 0.2354314895944445,
      "learning_rate": 6.076782705890257e-05,
      "loss": 1.0815,
      "step": 695
    },
    {
      "epoch": 0.6700167504187605,
      "grad_norm": 0.14132233475416703,
      "learning_rate": 5.9234663297854876e-05,
      "loss": 1.0555,
      "step": 700
    },
    {
      "epoch": 0.674802584350323,
      "grad_norm": 0.14913316600220797,
      "learning_rate": 5.7712907010490036e-05,
      "loss": 1.0785,
      "step": 705
    },
    {
      "epoch": 0.6795884182818857,
      "grad_norm": 0.15328072297180578,
      "learning_rate": 5.620298403395805e-05,
      "loss": 1.0857,
      "step": 710
    },
    {
      "epoch": 0.6843742522134482,
      "grad_norm": 0.17603388258774993,
      "learning_rate": 5.4705316894061765e-05,
      "loss": 1.0898,
      "step": 715
    },
    {
      "epoch": 0.6891600861450108,
      "grad_norm": 0.1448443355064005,
      "learning_rate": 5.322032468702036e-05,
      "loss": 1.0714,
      "step": 720
    },
    {
      "epoch": 0.6939459200765733,
      "grad_norm": 0.4624474555190123,
      "learning_rate": 5.1748422962192376e-05,
      "loss": 1.0994,
      "step": 725
    },
    {
      "epoch": 0.6987317540081359,
      "grad_norm": 0.14868980834848183,
      "learning_rate": 5.0290023605791666e-05,
      "loss": 1.0725,
      "step": 730
    },
    {
      "epoch": 0.7035175879396985,
      "grad_norm": 0.15278504704361137,
      "learning_rate": 4.8845534725628086e-05,
      "loss": 1.0962,
      "step": 735
    },
    {
      "epoch": 0.708303421871261,
      "grad_norm": 0.14605679246576617,
      "learning_rate": 4.741536053690552e-05,
      "loss": 1.0947,
      "step": 740
    },
    {
      "epoch": 0.7130892558028237,
      "grad_norm": 0.172204603811799,
      "learning_rate": 4.599990124910918e-05,
      "loss": 1.0758,
      "step": 745
    },
    {
      "epoch": 0.7178750897343862,
      "grad_norm": 0.14357849865669614,
      "learning_rate": 4.4599552954014145e-05,
      "loss": 1.0682,
      "step": 750
    },
    {
      "epoch": 0.7226609236659488,
      "grad_norm": 0.14980923833672957,
      "learning_rate": 4.32147075148458e-05,
      "loss": 1.0814,
      "step": 755
    },
    {
      "epoch": 0.7274467575975113,
      "grad_norm": 0.16395768222951593,
      "learning_rate": 4.1845752456623665e-05,
      "loss": 1.0583,
      "step": 760
    },
    {
      "epoch": 0.7322325915290739,
      "grad_norm": 0.14059821304657993,
      "learning_rate": 4.049307085771931e-05,
      "loss": 1.0839,
      "step": 765
    },
    {
      "epoch": 0.7370184254606366,
      "grad_norm": 0.1472110334031576,
      "learning_rate": 3.9157041242658477e-05,
      "loss": 1.1079,
      "step": 770
    },
    {
      "epoch": 0.7418042593921991,
      "grad_norm": 0.14020342123522012,
      "learning_rate": 3.783803747619741e-05,
      "loss": 1.0829,
      "step": 775
    },
    {
      "epoch": 0.7465900933237617,
      "grad_norm": 0.17437047699695307,
      "learning_rate": 3.653642865870359e-05,
      "loss": 1.0808,
      "step": 780
    },
    {
      "epoch": 0.7513759272553242,
      "grad_norm": 0.14320013892049976,
      "learning_rate": 3.525257902286908e-05,
      "loss": 1.0608,
      "step": 785
    },
    {
      "epoch": 0.7561617611868868,
      "grad_norm": 0.14437417000631428,
      "learning_rate": 3.398684783178648e-05,
      "loss": 1.0618,
      "step": 790
    },
    {
      "epoch": 0.7609475951184493,
      "grad_norm": 0.14321363672597254,
      "learning_rate": 3.273958927841525e-05,
      "loss": 1.0659,
      "step": 795
    },
    {
      "epoch": 0.765733429050012,
      "grad_norm": 0.14121990349576288,
      "learning_rate": 3.1511152386467055e-05,
      "loss": 1.0936,
      "step": 800
    },
    {
      "epoch": 0.7705192629815746,
      "grad_norm": 0.16146069783583863,
      "learning_rate": 3.0301880912737568e-05,
      "loss": 1.0647,
      "step": 805
    },
    {
      "epoch": 0.7753050969131371,
      "grad_norm": 0.1447026626027737,
      "learning_rate": 2.9112113250911844e-05,
      "loss": 1.0747,
      "step": 810
    },
    {
      "epoch": 0.7800909308446997,
      "grad_norm": 0.14724228311552523,
      "learning_rate": 2.7942182336870925e-05,
      "loss": 1.1046,
      "step": 815
    },
    {
      "epoch": 0.7848767647762622,
      "grad_norm": 0.14612792897080507,
      "learning_rate": 2.6792415555525463e-05,
      "loss": 1.0391,
      "step": 820
    },
    {
      "epoch": 0.7896625987078248,
      "grad_norm": 0.14445016139434405,
      "learning_rate": 2.5663134649202647e-05,
      "loss": 1.0808,
      "step": 825
    },
    {
      "epoch": 0.7944484326393874,
      "grad_norm": 0.14283033243615206,
      "learning_rate": 2.4554655627612245e-05,
      "loss": 1.0767,
      "step": 830
    },
    {
      "epoch": 0.79923426657095,
      "grad_norm": 0.1428104588189023,
      "learning_rate": 2.34672886794167e-05,
      "loss": 1.0884,
      "step": 835
    },
    {
      "epoch": 0.8040201005025126,
      "grad_norm": 0.14106416222944104,
      "learning_rate": 2.2401338085430323e-05,
      "loss": 1.0891,
      "step": 840
    },
    {
      "epoch": 0.8088059344340751,
      "grad_norm": 0.14453431354715718,
      "learning_rate": 2.135710213347134e-05,
      "loss": 1.0829,
      "step": 845
    },
    {
      "epoch": 0.8135917683656377,
      "grad_norm": 0.1436138017414945,
      "learning_rate": 2.0334873034891554e-05,
      "loss": 1.0823,
      "step": 850
    },
    {
      "epoch": 0.8183776022972002,
      "grad_norm": 0.14415504753616376,
      "learning_rate": 1.933493684280574e-05,
      "loss": 1.0749,
      "step": 855
    },
    {
      "epoch": 0.8231634362287629,
      "grad_norm": 0.14188286670890893,
      "learning_rate": 1.8357573372044834e-05,
      "loss": 1.0775,
      "step": 860
    },
    {
      "epoch": 0.8279492701603255,
      "grad_norm": 0.14043422592547342,
      "learning_rate": 1.740305612085439e-05,
      "loss": 1.0852,
      "step": 865
    },
    {
      "epoch": 0.832735104091888,
      "grad_norm": 0.14014109535516273,
      "learning_rate": 1.647165219436113e-05,
      "loss": 1.0716,
      "step": 870
    },
    {
      "epoch": 0.8375209380234506,
      "grad_norm": 0.18266681120520475,
      "learning_rate": 1.556362222982799e-05,
      "loss": 1.0711,
      "step": 875
    },
    {
      "epoch": 0.8423067719550131,
      "grad_norm": 0.14585487433506303,
      "learning_rate": 1.4679220323719234e-05,
      "loss": 1.0561,
      "step": 880
    },
    {
      "epoch": 0.8470926058865758,
      "grad_norm": 0.13911103035630754,
      "learning_rate": 1.3818693960596185e-05,
      "loss": 1.0707,
      "step": 885
    },
    {
      "epoch": 0.8518784398181383,
      "grad_norm": 0.15612123605821987,
      "learning_rate": 1.2982283943862738e-05,
      "loss": 1.0494,
      "step": 890
    },
    {
      "epoch": 0.8566642737497009,
      "grad_norm": 0.14067555622023134,
      "learning_rate": 1.217022432838093e-05,
      "loss": 1.0686,
      "step": 895
    },
    {
      "epoch": 0.8614501076812635,
      "grad_norm": 0.1457410414761679,
      "learning_rate": 1.1382742354974429e-05,
      "loss": 1.0562,
      "step": 900
    },
    {
      "epoch": 0.866235941612826,
      "grad_norm": 0.1398250627278749,
      "learning_rate": 1.0620058386839393e-05,
      "loss": 1.0753,
      "step": 905
    },
    {
      "epoch": 0.8710217755443886,
      "grad_norm": 0.14690238478434312,
      "learning_rate": 9.882385847879539e-06,
      "loss": 1.0539,
      "step": 910
    },
    {
      "epoch": 0.8758076094759512,
      "grad_norm": 0.14224902345010998,
      "learning_rate": 9.169931162983137e-06,
      "loss": 1.0575,
      "step": 915
    },
    {
      "epoch": 0.8805934434075138,
      "grad_norm": 0.14002967562121116,
      "learning_rate": 8.482893700258643e-06,
      "loss": 1.0831,
      "step": 920
    },
    {
      "epoch": 0.8853792773390763,
      "grad_norm": 0.14652920530592364,
      "learning_rate": 7.821465715244947e-06,
      "loss": 1.0844,
      "step": 925
    },
    {
      "epoch": 0.8901651112706389,
      "grad_norm": 0.13985808750925746,
      "learning_rate": 7.185832297111938e-06,
      "loss": 1.0618,
      "step": 930
    },
    {
      "epoch": 0.8949509452022015,
      "grad_norm": 0.15160308510490375,
      "learning_rate": 6.576171316866608e-06,
      "loss": 1.0773,
      "step": 935
    },
    {
      "epoch": 0.899736779133764,
      "grad_norm": 0.14784429409642344,
      "learning_rate": 5.9926533775789055e-06,
      "loss": 1.0951,
      "step": 940
    },
    {
      "epoch": 0.9045226130653267,
      "grad_norm": 0.14167088318411009,
      "learning_rate": 5.435441766641369e-06,
      "loss": 1.0841,
      "step": 945
    },
    {
      "epoch": 0.9093084469968892,
      "grad_norm": 0.14256818695069146,
      "learning_rate": 4.904692410075973e-06,
      "loss": 1.0647,
      "step": 950
    },
    {
      "epoch": 0.9140942809284518,
      "grad_norm": 0.15531748633710526,
      "learning_rate": 4.400553828900989e-06,
      "loss": 1.0757,
      "step": 955
    },
    {
      "epoch": 0.9188801148600143,
      "grad_norm": 0.14420681549864126,
      "learning_rate": 3.923167097569935e-06,
      "loss": 1.0903,
      "step": 960
    },
    {
      "epoch": 0.9236659487915769,
      "grad_norm": 0.14398010788396462,
      "learning_rate": 3.4726658044943126e-06,
      "loss": 1.0668,
      "step": 965
    },
    {
      "epoch": 0.9284517827231396,
      "grad_norm": 0.14589900176146645,
      "learning_rate": 3.0491760146611926e-06,
      "loss": 1.0845,
      "step": 970
    },
    {
      "epoch": 0.9332376166547021,
      "grad_norm": 0.13882750982702796,
      "learning_rate": 2.652816234356159e-06,
      "loss": 1.0382,
      "step": 975
    },
    {
      "epoch": 0.9380234505862647,
      "grad_norm": 0.14112035905216325,
      "learning_rate": 2.283697378001315e-06,
      "loss": 1.0825,
      "step": 980
    },
    {
      "epoch": 0.9428092845178272,
      "grad_norm": 0.13934480624047157,
      "learning_rate": 1.9419227371178627e-06,
      "loss": 1.0679,
      "step": 985
    },
    {
      "epoch": 0.9475951184493898,
      "grad_norm": 0.14117739445269173,
      "learning_rate": 1.6275879514217052e-06,
      "loss": 1.0772,
      "step": 990
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.14031209854381504,
      "learning_rate": 1.3407809820603856e-06,
      "loss": 1.0767,
      "step": 995
    },
    {
      "epoch": 0.957166786312515,
      "grad_norm": 0.14091355128035063,
      "learning_rate": 1.0815820869985893e-06,
      "loss": 1.0635,
      "step": 1000
    },
    {
      "epoch": 0.9619526202440776,
      "grad_norm": 0.14100298765660577,
      "learning_rate": 8.50063798559475e-07,
      "loss": 1.0861,
      "step": 1005
    },
    {
      "epoch": 0.9667384541756401,
      "grad_norm": 0.1412224316948679,
      "learning_rate": 6.462909031276443e-07,
      "loss": 1.0633,
      "step": 1010
    },
    {
      "epoch": 0.9715242881072027,
      "grad_norm": 0.1385906964183353,
      "learning_rate": 4.7032042301985434e-07,
      "loss": 1.0726,
      "step": 1015
    },
    {
      "epoch": 0.9763101220387652,
      "grad_norm": 0.151976410727331,
      "learning_rate": 3.222016005282824e-07,
      "loss": 1.0645,
      "step": 1020
    },
    {
      "epoch": 0.9810959559703278,
      "grad_norm": 0.14304003914264313,
      "learning_rate": 2.0197588414094804e-07,
      "loss": 1.0785,
      "step": 1025
    },
    {
      "epoch": 0.9858817899018905,
      "grad_norm": 0.1395525833943131,
      "learning_rate": 1.0967691694302451e-07,
      "loss": 1.0582,
      "step": 1030
    },
    {
      "epoch": 0.990667623833453,
      "grad_norm": 0.14208441339069042,
      "learning_rate": 4.5330527202480654e-08,
      "loss": 1.0763,
      "step": 1035
    },
    {
      "epoch": 0.9954534577650156,
      "grad_norm": 0.13931375530799342,
      "learning_rate": 8.95472114241791e-09,
      "loss": 1.0444,
      "step": 1040
    },
    {
      "epoch": 0.9992821249102656,
      "eval_loss": 1.077100157737732,
      "eval_runtime": 3923.6787,
      "eval_samples_per_second": 3.43,
      "eval_steps_per_second": 0.858,
      "step": 1044
    },
    {
      "epoch": 0.9992821249102656,
      "step": 1044,
      "total_flos": 2155604625850368.0,
      "train_loss": 1.091635976486279,
      "train_runtime": 24351.9745,
      "train_samples_per_second": 2.746,
      "train_steps_per_second": 0.043
    }
  ],
  "logging_steps": 5,
  "max_steps": 1044,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2155604625850368.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}