{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992821249102656, "eval_steps": 500, "global_step": 1044, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000957166786312515, "grad_norm": 3.9142152723532337, "learning_rate": 1.9047619047619051e-06, "loss": 1.3978, "step": 1 }, { "epoch": 0.004785833931562575, "grad_norm": 1.3430217420733046, "learning_rate": 9.523809523809523e-06, "loss": 1.3489, "step": 5 }, { "epoch": 0.00957166786312515, "grad_norm": 0.5850408636793494, "learning_rate": 1.9047619047619046e-05, "loss": 1.2871, "step": 10 }, { "epoch": 0.014357501794687724, "grad_norm": 0.46666716038967326, "learning_rate": 2.857142857142857e-05, "loss": 1.2106, "step": 15 }, { "epoch": 0.0191433357262503, "grad_norm": 0.35044064248530404, "learning_rate": 3.809523809523809e-05, "loss": 1.189, "step": 20 }, { "epoch": 0.023929169657812874, "grad_norm": 0.27361957875198517, "learning_rate": 4.761904761904762e-05, "loss": 1.1469, "step": 25 }, { "epoch": 0.028715003589375447, "grad_norm": 0.2368453005937937, "learning_rate": 5.714285714285714e-05, "loss": 1.158, "step": 30 }, { "epoch": 0.03350083752093802, "grad_norm": 0.2277332385016794, "learning_rate": 6.666666666666667e-05, "loss": 1.1437, "step": 35 }, { "epoch": 0.0382866714525006, "grad_norm": 0.2265653549311157, "learning_rate": 7.619047619047618e-05, "loss": 1.1302, "step": 40 }, { "epoch": 0.043072505384063174, "grad_norm": 0.22079711284915807, "learning_rate": 8.571428571428571e-05, "loss": 1.13, "step": 45 }, { "epoch": 0.04785833931562575, "grad_norm": 0.20813516832540208, "learning_rate": 9.523809523809524e-05, "loss": 1.106, "step": 50 }, { "epoch": 0.05264417324718832, "grad_norm": 0.2044131638757028, "learning_rate": 0.00010476190476190477, "loss": 1.1348, "step": 55 }, { "epoch": 0.057430007178750894, "grad_norm": 0.20101729146508107, "learning_rate": 0.00011428571428571428, "loss": 1.1018, "step": 60 }, { "epoch": 0.062215841110313475, "grad_norm": 0.21865369935553125, "learning_rate": 0.0001238095238095238, "loss": 1.1129, "step": 65 }, { "epoch": 0.06700167504187604, "grad_norm": 0.18405578482864565, "learning_rate": 0.00013333333333333334, "loss": 1.1018, "step": 70 }, { "epoch": 0.07178750897343862, "grad_norm": 0.18488079729650672, "learning_rate": 0.00014285714285714287, "loss": 1.1417, "step": 75 }, { "epoch": 0.0765733429050012, "grad_norm": 0.18433481759594844, "learning_rate": 0.00015238095238095237, "loss": 1.111, "step": 80 }, { "epoch": 0.08135917683656377, "grad_norm": 0.20377971597879482, "learning_rate": 0.00016190476190476192, "loss": 1.0709, "step": 85 }, { "epoch": 0.08614501076812635, "grad_norm": 0.20225554382239913, "learning_rate": 0.00017142857142857143, "loss": 1.1142, "step": 90 }, { "epoch": 0.09093084469968891, "grad_norm": 0.18520967333311886, "learning_rate": 0.00018095238095238095, "loss": 1.1142, "step": 95 }, { "epoch": 0.0957166786312515, "grad_norm": 0.19606367225373053, "learning_rate": 0.00019047619047619048, "loss": 1.1049, "step": 100 }, { "epoch": 0.10050251256281408, "grad_norm": 0.1867473714189168, "learning_rate": 0.0002, "loss": 1.0927, "step": 105 }, { "epoch": 0.10528834649437664, "grad_norm": 0.185817071062854, "learning_rate": 0.00019998600836567816, "loss": 1.1206, "step": 110 }, { "epoch": 0.11007418042593922, "grad_norm": 0.1762396939142846, "learning_rate": 0.00019994403737802927, "loss": 1.1022, "step": 115 }, { "epoch": 0.11486001435750179, "grad_norm": 0.16668242539032083, "learning_rate": 0.00019987409878190752, "loss": 1.1052, "step": 120 }, { "epoch": 0.11964584828906437, "grad_norm": 0.1764182848939352, "learning_rate": 0.00019977621214841822, "loss": 1.1059, "step": 125 }, { "epoch": 0.12443168222062695, "grad_norm": 0.1787380695504439, "learning_rate": 0.0001996504048694409, "loss": 1.1102, "step": 130 }, { "epoch": 0.12921751615218952, "grad_norm": 0.18152638872711746, "learning_rate": 0.00019949671214996445, "loss": 1.0986, "step": 135 }, { "epoch": 0.13400335008375208, "grad_norm": 0.1745817045352934, "learning_rate": 0.00019931517699823547, "loss": 1.085, "step": 140 }, { "epoch": 0.13878918401531468, "grad_norm": 0.17554030590687575, "learning_rate": 0.0001991058502137231, "loss": 1.1363, "step": 145 }, { "epoch": 0.14357501794687724, "grad_norm": 0.1800405053641118, "learning_rate": 0.00019886879037290384, "loss": 1.0924, "step": 150 }, { "epoch": 0.1483608518784398, "grad_norm": 0.19148632700424054, "learning_rate": 0.0001986040638128698, "loss": 1.0824, "step": 155 }, { "epoch": 0.1531466858100024, "grad_norm": 0.17313730878343153, "learning_rate": 0.0001983117446127654, "loss": 1.1071, "step": 160 }, { "epoch": 0.15793251974156497, "grad_norm": 0.16961371492797625, "learning_rate": 0.00019799191457305768, "loss": 1.1311, "step": 165 }, { "epoch": 0.16271835367312754, "grad_norm": 0.17327733992734365, "learning_rate": 0.00019764466319264595, "loss": 1.1133, "step": 170 }, { "epoch": 0.16750418760469013, "grad_norm": 0.17176717180251114, "learning_rate": 0.00019727008764381675, "loss": 1.1153, "step": 175 }, { "epoch": 0.1722900215362527, "grad_norm": 0.17138393560502058, "learning_rate": 0.0001968682927450523, "loss": 1.1006, "step": 180 }, { "epoch": 0.17707585546781526, "grad_norm": 0.16465106542109514, "learning_rate": 0.00019643939093169844, "loss": 1.104, "step": 185 }, { "epoch": 0.18186168939937783, "grad_norm": 0.1699157446206454, "learning_rate": 0.00019598350222450178, "loss": 1.1167, "step": 190 }, { "epoch": 0.18664752333094042, "grad_norm": 0.17602023031266878, "learning_rate": 0.00019550075419602408, "loss": 1.1131, "step": 195 }, { "epoch": 0.191433357262503, "grad_norm": 0.18282225583073394, "learning_rate": 0.00019499128193494297, "loss": 1.0889, "step": 200 }, { "epoch": 0.19621919119406556, "grad_norm": 0.16528930064048408, "learning_rate": 0.0001944552280082499, "loss": 1.1013, "step": 205 }, { "epoch": 0.20100502512562815, "grad_norm": 0.16504045631379008, "learning_rate": 0.0001938927424213553, "loss": 1.1003, "step": 210 }, { "epoch": 0.20579085905719072, "grad_norm": 0.16559761424705557, "learning_rate": 0.000193303982576112, "loss": 1.0998, "step": 215 }, { "epoch": 0.21057669298875328, "grad_norm": 0.16791680669282769, "learning_rate": 0.0001926891132267692, "loss": 1.0919, "step": 220 }, { "epoch": 0.21536252692031588, "grad_norm": 0.16689008275055853, "learning_rate": 0.00019204830643386868, "loss": 1.1069, "step": 225 }, { "epoch": 0.22014836085187844, "grad_norm": 0.17305744880787605, "learning_rate": 0.00019138174151609683, "loss": 1.1272, "step": 230 }, { "epoch": 0.224934194783441, "grad_norm": 0.1618964004882894, "learning_rate": 0.00019068960500010523, "loss": 1.0827, "step": 235 }, { "epoch": 0.22972002871500358, "grad_norm": 0.16411871929076272, "learning_rate": 0.00018997209056831462, "loss": 1.1164, "step": 240 }, { "epoch": 0.23450586264656617, "grad_norm": 0.1691454723246668, "learning_rate": 0.0001892293990047159, "loss": 1.1079, "step": 245 }, { "epoch": 0.23929169657812874, "grad_norm": 0.1649265891086064, "learning_rate": 0.00018846173813868454, "loss": 1.0825, "step": 250 }, { "epoch": 0.2440775305096913, "grad_norm": 0.17018378559137046, "learning_rate": 0.000187669322786823, "loss": 1.1216, "step": 255 }, { "epoch": 0.2488633644412539, "grad_norm": 0.1703316481606123, "learning_rate": 0.0001868523746928479, "loss": 1.0783, "step": 260 }, { "epoch": 0.25364919837281646, "grad_norm": 0.16735434292797727, "learning_rate": 0.0001860111224655391, "loss": 1.1149, "step": 265 }, { "epoch": 0.25843503230437903, "grad_norm": 0.1555718882454273, "learning_rate": 0.0001851458015147673, "loss": 1.1075, "step": 270 }, { "epoch": 0.2632208662359416, "grad_norm": 0.16977397286003804, "learning_rate": 0.00018425665398561883, "loss": 1.0852, "step": 275 }, { "epoch": 0.26800670016750416, "grad_norm": 0.1663432558952086, "learning_rate": 0.00018334392869063536, "loss": 1.0811, "step": 280 }, { "epoch": 0.2727925340990668, "grad_norm": 0.1666368907912767, "learning_rate": 0.00018240788104018822, "loss": 1.1014, "step": 285 }, { "epoch": 0.27757836803062935, "grad_norm": 0.15687924477714424, "learning_rate": 0.00018144877297100606, "loss": 1.0736, "step": 290 }, { "epoch": 0.2823642019621919, "grad_norm": 0.16061101558707236, "learning_rate": 0.0001804668728728764, "loss": 1.0931, "step": 295 }, { "epoch": 0.2871500358937545, "grad_norm": 0.160793101364396, "learning_rate": 0.00017946245551354157, "loss": 1.0999, "step": 300 }, { "epoch": 0.29193586982531705, "grad_norm": 0.1633263725476254, "learning_rate": 0.00017843580196180952, "loss": 1.0948, "step": 305 }, { "epoch": 0.2967217037568796, "grad_norm": 0.1631452387362867, "learning_rate": 0.00017738719950890168, "loss": 1.1013, "step": 310 }, { "epoch": 0.3015075376884422, "grad_norm": 0.16210535171429089, "learning_rate": 0.00017631694158805946, "loss": 1.0798, "step": 315 }, { "epoch": 0.3062933716200048, "grad_norm": 0.16229934636841442, "learning_rate": 0.000175225327692432, "loss": 1.0575, "step": 320 }, { "epoch": 0.3110792055515674, "grad_norm": 0.16588195121854765, "learning_rate": 0.00017411266329126824, "loss": 1.096, "step": 325 }, { "epoch": 0.31586503948312994, "grad_norm": 0.158210200244668, "learning_rate": 0.00017297925974443673, "loss": 1.1071, "step": 330 }, { "epoch": 0.3206508734146925, "grad_norm": 0.1663784778299078, "learning_rate": 0.00017182543421529676, "loss": 1.0739, "step": 335 }, { "epoch": 0.32543670734625507, "grad_norm": 0.15384028200238806, "learning_rate": 0.00017065150958194586, "loss": 1.0848, "step": 340 }, { "epoch": 0.33022254127781764, "grad_norm": 0.16073214574887584, "learning_rate": 0.00016945781434686783, "loss": 1.1157, "step": 345 }, { "epoch": 0.33500837520938026, "grad_norm": 0.1745939193140414, "learning_rate": 0.00016824468254500704, "loss": 1.0815, "step": 350 }, { "epoch": 0.3397942091409428, "grad_norm": 0.15802897019970708, "learning_rate": 0.0001670124536502947, "loss": 1.0779, "step": 355 }, { "epoch": 0.3445800430725054, "grad_norm": 0.1579431494377225, "learning_rate": 0.00016576147248065267, "loss": 1.1031, "step": 360 }, { "epoch": 0.34936587700406796, "grad_norm": 0.16455288633589932, "learning_rate": 0.00016449208910150232, "loss": 1.1207, "step": 365 }, { "epoch": 0.3541517109356305, "grad_norm": 0.15512720174775488, "learning_rate": 0.00016320465872780477, "loss": 1.0843, "step": 370 }, { "epoch": 0.3589375448671931, "grad_norm": 0.15810739086397552, "learning_rate": 0.00016189954162466012, "loss": 1.0674, "step": 375 }, { "epoch": 0.36372337879875566, "grad_norm": 0.15539897008538223, "learning_rate": 0.0001605771030064934, "loss": 1.1075, "step": 380 }, { "epoch": 0.3685092127303183, "grad_norm": 0.16059302879871643, "learning_rate": 0.00015923771293485585, "loss": 1.1083, "step": 385 }, { "epoch": 0.37329504666188085, "grad_norm": 0.1726863039386017, "learning_rate": 0.00015788174621486934, "loss": 1.0839, "step": 390 }, { "epoch": 0.3780808805934434, "grad_norm": 0.160896911699282, "learning_rate": 0.00015650958229034391, "loss": 1.093, "step": 395 }, { "epoch": 0.382866714525006, "grad_norm": 0.1539033105501165, "learning_rate": 0.00015512160513759672, "loss": 1.0824, "step": 400 }, { "epoch": 0.38765254845656855, "grad_norm": 0.15253934847352404, "learning_rate": 0.00015371820315800315, "loss": 1.0611, "step": 405 }, { "epoch": 0.3924383823881311, "grad_norm": 0.1549203336671571, "learning_rate": 0.00015229976906930935, "loss": 1.0926, "step": 410 }, { "epoch": 0.3972242163196937, "grad_norm": 0.15736586699846142, "learning_rate": 0.0001508666997957369, "loss": 1.0838, "step": 415 }, { "epoch": 0.4020100502512563, "grad_norm": 0.15414651629074486, "learning_rate": 0.00014941939635691035, "loss": 1.0962, "step": 420 }, { "epoch": 0.40679588418281887, "grad_norm": 0.15216014768555902, "learning_rate": 0.00014795826375563925, "loss": 1.0837, "step": 425 }, { "epoch": 0.41158171811438143, "grad_norm": 0.1551252486012846, "learning_rate": 0.0001464837108645845, "loss": 1.096, "step": 430 }, { "epoch": 0.416367552045944, "grad_norm": 0.15880410911617168, "learning_rate": 0.00014499615031184296, "loss": 1.0947, "step": 435 }, { "epoch": 0.42115338597750657, "grad_norm": 0.16084656769756484, "learning_rate": 0.00014349599836548034, "loss": 1.0955, "step": 440 }, { "epoch": 0.42593921990906913, "grad_norm": 0.14942909791958908, "learning_rate": 0.0001419836748170459, "loss": 1.0911, "step": 445 }, { "epoch": 0.43072505384063176, "grad_norm": 0.16134597273400678, "learning_rate": 0.0001404596028641009, "loss": 1.1136, "step": 450 }, { "epoch": 0.4355108877721943, "grad_norm": 0.15552785776756606, "learning_rate": 0.0001389242089917943, "loss": 1.1005, "step": 455 }, { "epoch": 0.4402967217037569, "grad_norm": 0.1544583591443468, "learning_rate": 0.00013737792285351805, "loss": 1.0896, "step": 460 }, { "epoch": 0.44508255563531945, "grad_norm": 0.15743294110434283, "learning_rate": 0.0001358211771506763, "loss": 1.0687, "step": 465 }, { "epoch": 0.449868389566882, "grad_norm": 0.15489693015617015, "learning_rate": 0.00013425440751160112, "loss": 1.0909, "step": 470 }, { "epoch": 0.4546542234984446, "grad_norm": 0.1556280787651109, "learning_rate": 0.00013267805236964967, "loss": 1.1008, "step": 475 }, { "epoch": 0.45944005743000715, "grad_norm": 0.16139496091159036, "learning_rate": 0.00013109255284051615, "loss": 1.1167, "step": 480 }, { "epoch": 0.4642258913615698, "grad_norm": 0.15380326887200926, "learning_rate": 0.00012949835259879304, "loss": 1.1021, "step": 485 }, { "epoch": 0.46901172529313234, "grad_norm": 0.1504710821626308, "learning_rate": 0.00012789589775381676, "loss": 1.0824, "step": 490 }, { "epoch": 0.4737975592246949, "grad_norm": 0.16882632755621252, "learning_rate": 0.00012628563672483146, "loss": 1.091, "step": 495 }, { "epoch": 0.4785833931562575, "grad_norm": 0.16236683430294702, "learning_rate": 0.0001246680201155068, "loss": 1.0609, "step": 500 }, { "epoch": 0.48336922708782004, "grad_norm": 0.1534881294655078, "learning_rate": 0.00012304350058784405, "loss": 1.0611, "step": 505 }, { "epoch": 0.4881550610193826, "grad_norm": 0.16620841316700394, "learning_rate": 0.00012141253273550696, "loss": 1.0932, "step": 510 }, { "epoch": 0.49294089495094523, "grad_norm": 0.16942714030828704, "learning_rate": 0.00011977557295661108, "loss": 1.0856, "step": 515 }, { "epoch": 0.4977267288825078, "grad_norm": 0.15500201031703087, "learning_rate": 0.00011813307932600887, "loss": 1.0852, "step": 520 }, { "epoch": 0.5025125628140703, "grad_norm": 0.15248801968172002, "learning_rate": 0.00011648551146710556, "loss": 1.1069, "step": 525 }, { "epoch": 0.5072983967456329, "grad_norm": 0.14978453385390675, "learning_rate": 0.0001148333304232411, "loss": 1.088, "step": 530 }, { "epoch": 0.5120842306771956, "grad_norm": 0.14736066147246124, "learning_rate": 0.00011317699852867548, "loss": 1.0506, "step": 535 }, { "epoch": 0.5168700646087581, "grad_norm": 0.15088998664120562, "learning_rate": 0.0001115169792792124, "loss": 1.0972, "step": 540 }, { "epoch": 0.5216558985403207, "grad_norm": 0.14676026138747209, "learning_rate": 0.00010985373720249801, "loss": 1.0871, "step": 545 }, { "epoch": 0.5264417324718832, "grad_norm": 0.17054822297185676, "learning_rate": 0.00010818773772803082, "loss": 1.0957, "step": 550 }, { "epoch": 0.5312275664034458, "grad_norm": 0.15081743477470166, "learning_rate": 0.0001065194470569193, "loss": 1.1114, "step": 555 }, { "epoch": 0.5360134003350083, "grad_norm": 0.1556600989117304, "learning_rate": 0.0001048493320314238, "loss": 1.0747, "step": 560 }, { "epoch": 0.540799234266571, "grad_norm": 0.15346464585086714, "learning_rate": 0.00010317786000431851, "loss": 1.0761, "step": 565 }, { "epoch": 0.5455850681981336, "grad_norm": 0.15178562379014646, "learning_rate": 0.00010150549870811107, "loss": 1.0839, "step": 570 }, { "epoch": 0.5503709021296961, "grad_norm": 0.15263581024104103, "learning_rate": 9.983271612415575e-05, "loss": 1.0742, "step": 575 }, { "epoch": 0.5551567360612587, "grad_norm": 0.15166582071053056, "learning_rate": 9.81599803516968e-05, "loss": 1.0725, "step": 580 }, { "epoch": 0.5599425699928212, "grad_norm": 0.14735687803417952, "learning_rate": 9.648775947687912e-05, "loss": 1.0705, "step": 585 }, { "epoch": 0.5647284039243838, "grad_norm": 0.14825818203221888, "learning_rate": 9.48165214417624e-05, "loss": 1.0871, "step": 590 }, { "epoch": 0.5695142378559463, "grad_norm": 0.15700946642781993, "learning_rate": 9.314673391337576e-05, "loss": 1.0979, "step": 595 }, { "epoch": 0.574300071787509, "grad_norm": 0.15580031067347558, "learning_rate": 9.147886415284903e-05, "loss": 1.0592, "step": 600 }, { "epoch": 0.5790859057190716, "grad_norm": 0.14548002556094225, "learning_rate": 8.981337888465788e-05, "loss": 1.0787, "step": 605 }, { "epoch": 0.5838717396506341, "grad_norm": 0.14237124600928142, "learning_rate": 8.815074416601913e-05, "loss": 1.0698, "step": 610 }, { "epoch": 0.5886575735821967, "grad_norm": 0.15304745525626437, "learning_rate": 8.649142525647272e-05, "loss": 1.0848, "step": 615 }, { "epoch": 0.5934434075137592, "grad_norm": 0.14513336716190856, "learning_rate": 8.48358864876867e-05, "loss": 1.0462, "step": 620 }, { "epoch": 0.5982292414453219, "grad_norm": 0.1468415945819683, "learning_rate": 8.318459113352221e-05, "loss": 1.0906, "step": 625 }, { "epoch": 0.6030150753768844, "grad_norm": 0.14408143553897426, "learning_rate": 8.153800128039441e-05, "loss": 1.085, "step": 630 }, { "epoch": 0.607800909308447, "grad_norm": 0.15046217184291616, "learning_rate": 7.989657769796533e-05, "loss": 1.0882, "step": 635 }, { "epoch": 0.6125867432400096, "grad_norm": 0.14348283659906289, "learning_rate": 7.82607797102056e-05, "loss": 1.0861, "step": 640 }, { "epoch": 0.6173725771715721, "grad_norm": 0.14685503152106738, "learning_rate": 7.663106506686057e-05, "loss": 1.1003, "step": 645 }, { "epoch": 0.6221584111031347, "grad_norm": 0.1480277391784376, "learning_rate": 7.500788981535708e-05, "loss": 1.0758, "step": 650 }, { "epoch": 0.6269442450346973, "grad_norm": 0.1477910922274185, "learning_rate": 7.339170817318625e-05, "loss": 1.0695, "step": 655 }, { "epoch": 0.6317300789662599, "grad_norm": 0.1551465349289344, "learning_rate": 7.178297240079882e-05, "loss": 1.0942, "step": 660 }, { "epoch": 0.6365159128978225, "grad_norm": 0.148811465121087, "learning_rate": 7.018213267504775e-05, "loss": 1.0825, "step": 665 }, { "epoch": 0.641301746829385, "grad_norm": 0.146937156337137, "learning_rate": 6.858963696321403e-05, "loss": 1.0985, "step": 670 }, { "epoch": 0.6460875807609476, "grad_norm": 0.14703161191479286, "learning_rate": 6.700593089765086e-05, "loss": 1.06, "step": 675 }, { "epoch": 0.6508734146925101, "grad_norm": 0.14564360148371303, "learning_rate": 6.543145765108106e-05, "loss": 1.0853, "step": 680 }, { "epoch": 0.6556592486240728, "grad_norm": 0.14887365645849163, "learning_rate": 6.3866657812583e-05, "loss": 1.0787, "step": 685 }, { "epoch": 0.6604450825556353, "grad_norm": 0.14533659914404762, "learning_rate": 6.231196926429913e-05, "loss": 1.073, "step": 690 }, { "epoch": 0.6652309164871979, "grad_norm": 0.2354314895944445, "learning_rate": 6.076782705890257e-05, "loss": 1.0815, "step": 695 }, { "epoch": 0.6700167504187605, "grad_norm": 0.14132233475416703, "learning_rate": 5.9234663297854876e-05, "loss": 1.0555, "step": 700 }, { "epoch": 0.674802584350323, "grad_norm": 0.14913316600220797, "learning_rate": 5.7712907010490036e-05, "loss": 1.0785, "step": 705 }, { "epoch": 0.6795884182818857, "grad_norm": 0.15328072297180578, "learning_rate": 5.620298403395805e-05, "loss": 1.0857, "step": 710 }, { "epoch": 0.6843742522134482, "grad_norm": 0.17603388258774993, "learning_rate": 5.4705316894061765e-05, "loss": 1.0898, "step": 715 }, { "epoch": 0.6891600861450108, "grad_norm": 0.1448443355064005, "learning_rate": 5.322032468702036e-05, "loss": 1.0714, "step": 720 }, { "epoch": 0.6939459200765733, "grad_norm": 0.4624474555190123, "learning_rate": 5.1748422962192376e-05, "loss": 1.0994, "step": 725 }, { "epoch": 0.6987317540081359, "grad_norm": 0.14868980834848183, "learning_rate": 5.0290023605791666e-05, "loss": 1.0725, "step": 730 }, { "epoch": 0.7035175879396985, "grad_norm": 0.15278504704361137, "learning_rate": 4.8845534725628086e-05, "loss": 1.0962, "step": 735 }, { "epoch": 0.708303421871261, "grad_norm": 0.14605679246576617, "learning_rate": 4.741536053690552e-05, "loss": 1.0947, "step": 740 }, { "epoch": 0.7130892558028237, "grad_norm": 0.172204603811799, "learning_rate": 4.599990124910918e-05, "loss": 1.0758, "step": 745 }, { "epoch": 0.7178750897343862, "grad_norm": 0.14357849865669614, "learning_rate": 4.4599552954014145e-05, "loss": 1.0682, "step": 750 }, { "epoch": 0.7226609236659488, "grad_norm": 0.14980923833672957, "learning_rate": 4.32147075148458e-05, "loss": 1.0814, "step": 755 }, { "epoch": 0.7274467575975113, "grad_norm": 0.16395768222951593, "learning_rate": 4.1845752456623665e-05, "loss": 1.0583, "step": 760 }, { "epoch": 0.7322325915290739, "grad_norm": 0.14059821304657993, "learning_rate": 4.049307085771931e-05, "loss": 1.0839, "step": 765 }, { "epoch": 0.7370184254606366, "grad_norm": 0.1472110334031576, "learning_rate": 3.9157041242658477e-05, "loss": 1.1079, "step": 770 }, { "epoch": 0.7418042593921991, "grad_norm": 0.14020342123522012, "learning_rate": 3.783803747619741e-05, "loss": 1.0829, "step": 775 }, { "epoch": 0.7465900933237617, "grad_norm": 0.17437047699695307, "learning_rate": 3.653642865870359e-05, "loss": 1.0808, "step": 780 }, { "epoch": 0.7513759272553242, "grad_norm": 0.14320013892049976, "learning_rate": 3.525257902286908e-05, "loss": 1.0608, "step": 785 }, { "epoch": 0.7561617611868868, "grad_norm": 0.14437417000631428, "learning_rate": 3.398684783178648e-05, "loss": 1.0618, "step": 790 }, { "epoch": 0.7609475951184493, "grad_norm": 0.14321363672597254, "learning_rate": 3.273958927841525e-05, "loss": 1.0659, "step": 795 }, { "epoch": 0.765733429050012, "grad_norm": 0.14121990349576288, "learning_rate": 3.1511152386467055e-05, "loss": 1.0936, "step": 800 }, { "epoch": 0.7705192629815746, "grad_norm": 0.16146069783583863, "learning_rate": 3.0301880912737568e-05, "loss": 1.0647, "step": 805 }, { "epoch": 0.7753050969131371, "grad_norm": 0.1447026626027737, "learning_rate": 2.9112113250911844e-05, "loss": 1.0747, "step": 810 }, { "epoch": 0.7800909308446997, "grad_norm": 0.14724228311552523, "learning_rate": 2.7942182336870925e-05, "loss": 1.1046, "step": 815 }, { "epoch": 0.7848767647762622, "grad_norm": 0.14612792897080507, "learning_rate": 2.6792415555525463e-05, "loss": 1.0391, "step": 820 }, { "epoch": 0.7896625987078248, "grad_norm": 0.14445016139434405, "learning_rate": 2.5663134649202647e-05, "loss": 1.0808, "step": 825 }, { "epoch": 0.7944484326393874, "grad_norm": 0.14283033243615206, "learning_rate": 2.4554655627612245e-05, "loss": 1.0767, "step": 830 }, { "epoch": 0.79923426657095, "grad_norm": 0.1428104588189023, "learning_rate": 2.34672886794167e-05, "loss": 1.0884, "step": 835 }, { "epoch": 0.8040201005025126, "grad_norm": 0.14106416222944104, "learning_rate": 2.2401338085430323e-05, "loss": 1.0891, "step": 840 }, { "epoch": 0.8088059344340751, "grad_norm": 0.14453431354715718, "learning_rate": 2.135710213347134e-05, "loss": 1.0829, "step": 845 }, { "epoch": 0.8135917683656377, "grad_norm": 0.1436138017414945, "learning_rate": 2.0334873034891554e-05, "loss": 1.0823, "step": 850 }, { "epoch": 0.8183776022972002, "grad_norm": 0.14415504753616376, "learning_rate": 1.933493684280574e-05, "loss": 1.0749, "step": 855 }, { "epoch": 0.8231634362287629, "grad_norm": 0.14188286670890893, "learning_rate": 1.8357573372044834e-05, "loss": 1.0775, "step": 860 }, { "epoch": 0.8279492701603255, "grad_norm": 0.14043422592547342, "learning_rate": 1.740305612085439e-05, "loss": 1.0852, "step": 865 }, { "epoch": 0.832735104091888, "grad_norm": 0.14014109535516273, "learning_rate": 1.647165219436113e-05, "loss": 1.0716, "step": 870 }, { "epoch": 0.8375209380234506, "grad_norm": 0.18266681120520475, "learning_rate": 1.556362222982799e-05, "loss": 1.0711, "step": 875 }, { "epoch": 0.8423067719550131, "grad_norm": 0.14585487433506303, "learning_rate": 1.4679220323719234e-05, "loss": 1.0561, "step": 880 }, { "epoch": 0.8470926058865758, "grad_norm": 0.13911103035630754, "learning_rate": 1.3818693960596185e-05, "loss": 1.0707, "step": 885 }, { "epoch": 0.8518784398181383, "grad_norm": 0.15612123605821987, "learning_rate": 1.2982283943862738e-05, "loss": 1.0494, "step": 890 }, { "epoch": 0.8566642737497009, "grad_norm": 0.14067555622023134, "learning_rate": 1.217022432838093e-05, "loss": 1.0686, "step": 895 }, { "epoch": 0.8614501076812635, "grad_norm": 0.1457410414761679, "learning_rate": 1.1382742354974429e-05, "loss": 1.0562, "step": 900 }, { "epoch": 0.866235941612826, "grad_norm": 0.1398250627278749, "learning_rate": 1.0620058386839393e-05, "loss": 1.0753, "step": 905 }, { "epoch": 0.8710217755443886, "grad_norm": 0.14690238478434312, "learning_rate": 9.882385847879539e-06, "loss": 1.0539, "step": 910 }, { "epoch": 0.8758076094759512, "grad_norm": 0.14224902345010998, "learning_rate": 9.169931162983137e-06, "loss": 1.0575, "step": 915 }, { "epoch": 0.8805934434075138, "grad_norm": 0.14002967562121116, "learning_rate": 8.482893700258643e-06, "loss": 1.0831, "step": 920 }, { "epoch": 0.8853792773390763, "grad_norm": 0.14652920530592364, "learning_rate": 7.821465715244947e-06, "loss": 1.0844, "step": 925 }, { "epoch": 0.8901651112706389, "grad_norm": 0.13985808750925746, "learning_rate": 7.185832297111938e-06, "loss": 1.0618, "step": 930 }, { "epoch": 0.8949509452022015, "grad_norm": 0.15160308510490375, "learning_rate": 6.576171316866608e-06, "loss": 1.0773, "step": 935 }, { "epoch": 0.899736779133764, "grad_norm": 0.14784429409642344, "learning_rate": 5.9926533775789055e-06, "loss": 1.0951, "step": 940 }, { "epoch": 0.9045226130653267, "grad_norm": 0.14167088318411009, "learning_rate": 5.435441766641369e-06, "loss": 1.0841, "step": 945 }, { "epoch": 0.9093084469968892, "grad_norm": 0.14256818695069146, "learning_rate": 4.904692410075973e-06, "loss": 1.0647, "step": 950 }, { "epoch": 0.9140942809284518, "grad_norm": 0.15531748633710526, "learning_rate": 4.400553828900989e-06, "loss": 1.0757, "step": 955 }, { "epoch": 0.9188801148600143, "grad_norm": 0.14420681549864126, "learning_rate": 3.923167097569935e-06, "loss": 1.0903, "step": 960 }, { "epoch": 0.9236659487915769, "grad_norm": 0.14398010788396462, "learning_rate": 3.4726658044943126e-06, "loss": 1.0668, "step": 965 }, { "epoch": 0.9284517827231396, "grad_norm": 0.14589900176146645, "learning_rate": 3.0491760146611926e-06, "loss": 1.0845, "step": 970 }, { "epoch": 0.9332376166547021, "grad_norm": 0.13882750982702796, "learning_rate": 2.652816234356159e-06, "loss": 1.0382, "step": 975 }, { "epoch": 0.9380234505862647, "grad_norm": 0.14112035905216325, "learning_rate": 2.283697378001315e-06, "loss": 1.0825, "step": 980 }, { "epoch": 0.9428092845178272, "grad_norm": 0.13934480624047157, "learning_rate": 1.9419227371178627e-06, "loss": 1.0679, "step": 985 }, { "epoch": 0.9475951184493898, "grad_norm": 0.14117739445269173, "learning_rate": 1.6275879514217052e-06, "loss": 1.0772, "step": 990 }, { "epoch": 0.9523809523809523, "grad_norm": 0.14031209854381504, "learning_rate": 1.3407809820603856e-06, "loss": 1.0767, "step": 995 }, { "epoch": 0.957166786312515, "grad_norm": 0.14091355128035063, "learning_rate": 1.0815820869985893e-06, "loss": 1.0635, "step": 1000 }, { "epoch": 0.9619526202440776, "grad_norm": 0.14100298765660577, "learning_rate": 8.50063798559475e-07, "loss": 1.0861, "step": 1005 }, { "epoch": 0.9667384541756401, "grad_norm": 0.1412224316948679, "learning_rate": 6.462909031276443e-07, "loss": 1.0633, "step": 1010 }, { "epoch": 0.9715242881072027, "grad_norm": 0.1385906964183353, "learning_rate": 4.7032042301985434e-07, "loss": 1.0726, "step": 1015 }, { "epoch": 0.9763101220387652, "grad_norm": 0.151976410727331, "learning_rate": 3.222016005282824e-07, "loss": 1.0645, "step": 1020 }, { "epoch": 0.9810959559703278, "grad_norm": 0.14304003914264313, "learning_rate": 2.0197588414094804e-07, "loss": 1.0785, "step": 1025 }, { "epoch": 0.9858817899018905, "grad_norm": 0.1395525833943131, "learning_rate": 1.0967691694302451e-07, "loss": 1.0582, "step": 1030 }, { "epoch": 0.990667623833453, "grad_norm": 0.14208441339069042, "learning_rate": 4.5330527202480654e-08, "loss": 1.0763, "step": 1035 }, { "epoch": 0.9954534577650156, "grad_norm": 0.13931375530799342, "learning_rate": 8.95472114241791e-09, "loss": 1.0444, "step": 1040 }, { "epoch": 0.9992821249102656, "eval_loss": 1.077100157737732, "eval_runtime": 3923.6787, "eval_samples_per_second": 3.43, "eval_steps_per_second": 0.858, "step": 1044 }, { "epoch": 0.9992821249102656, "step": 1044, "total_flos": 2155604625850368.0, "train_loss": 1.091635976486279, "train_runtime": 24351.9745, "train_samples_per_second": 2.746, "train_steps_per_second": 0.043 } ], "logging_steps": 5, "max_steps": 1044, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2155604625850368.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }