{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992821249102656,
"eval_steps": 500,
"global_step": 1044,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000957166786312515,
"grad_norm": 3.9142152723532337,
"learning_rate": 1.9047619047619051e-06,
"loss": 1.3978,
"step": 1
},
{
"epoch": 0.004785833931562575,
"grad_norm": 1.3430217420733046,
"learning_rate": 9.523809523809523e-06,
"loss": 1.3489,
"step": 5
},
{
"epoch": 0.00957166786312515,
"grad_norm": 0.5850408636793494,
"learning_rate": 1.9047619047619046e-05,
"loss": 1.2871,
"step": 10
},
{
"epoch": 0.014357501794687724,
"grad_norm": 0.46666716038967326,
"learning_rate": 2.857142857142857e-05,
"loss": 1.2106,
"step": 15
},
{
"epoch": 0.0191433357262503,
"grad_norm": 0.35044064248530404,
"learning_rate": 3.809523809523809e-05,
"loss": 1.189,
"step": 20
},
{
"epoch": 0.023929169657812874,
"grad_norm": 0.27361957875198517,
"learning_rate": 4.761904761904762e-05,
"loss": 1.1469,
"step": 25
},
{
"epoch": 0.028715003589375447,
"grad_norm": 0.2368453005937937,
"learning_rate": 5.714285714285714e-05,
"loss": 1.158,
"step": 30
},
{
"epoch": 0.03350083752093802,
"grad_norm": 0.2277332385016794,
"learning_rate": 6.666666666666667e-05,
"loss": 1.1437,
"step": 35
},
{
"epoch": 0.0382866714525006,
"grad_norm": 0.2265653549311157,
"learning_rate": 7.619047619047618e-05,
"loss": 1.1302,
"step": 40
},
{
"epoch": 0.043072505384063174,
"grad_norm": 0.22079711284915807,
"learning_rate": 8.571428571428571e-05,
"loss": 1.13,
"step": 45
},
{
"epoch": 0.04785833931562575,
"grad_norm": 0.20813516832540208,
"learning_rate": 9.523809523809524e-05,
"loss": 1.106,
"step": 50
},
{
"epoch": 0.05264417324718832,
"grad_norm": 0.2044131638757028,
"learning_rate": 0.00010476190476190477,
"loss": 1.1348,
"step": 55
},
{
"epoch": 0.057430007178750894,
"grad_norm": 0.20101729146508107,
"learning_rate": 0.00011428571428571428,
"loss": 1.1018,
"step": 60
},
{
"epoch": 0.062215841110313475,
"grad_norm": 0.21865369935553125,
"learning_rate": 0.0001238095238095238,
"loss": 1.1129,
"step": 65
},
{
"epoch": 0.06700167504187604,
"grad_norm": 0.18405578482864565,
"learning_rate": 0.00013333333333333334,
"loss": 1.1018,
"step": 70
},
{
"epoch": 0.07178750897343862,
"grad_norm": 0.18488079729650672,
"learning_rate": 0.00014285714285714287,
"loss": 1.1417,
"step": 75
},
{
"epoch": 0.0765733429050012,
"grad_norm": 0.18433481759594844,
"learning_rate": 0.00015238095238095237,
"loss": 1.111,
"step": 80
},
{
"epoch": 0.08135917683656377,
"grad_norm": 0.20377971597879482,
"learning_rate": 0.00016190476190476192,
"loss": 1.0709,
"step": 85
},
{
"epoch": 0.08614501076812635,
"grad_norm": 0.20225554382239913,
"learning_rate": 0.00017142857142857143,
"loss": 1.1142,
"step": 90
},
{
"epoch": 0.09093084469968891,
"grad_norm": 0.18520967333311886,
"learning_rate": 0.00018095238095238095,
"loss": 1.1142,
"step": 95
},
{
"epoch": 0.0957166786312515,
"grad_norm": 0.19606367225373053,
"learning_rate": 0.00019047619047619048,
"loss": 1.1049,
"step": 100
},
{
"epoch": 0.10050251256281408,
"grad_norm": 0.1867473714189168,
"learning_rate": 0.0002,
"loss": 1.0927,
"step": 105
},
{
"epoch": 0.10528834649437664,
"grad_norm": 0.185817071062854,
"learning_rate": 0.00019998600836567816,
"loss": 1.1206,
"step": 110
},
{
"epoch": 0.11007418042593922,
"grad_norm": 0.1762396939142846,
"learning_rate": 0.00019994403737802927,
"loss": 1.1022,
"step": 115
},
{
"epoch": 0.11486001435750179,
"grad_norm": 0.16668242539032083,
"learning_rate": 0.00019987409878190752,
"loss": 1.1052,
"step": 120
},
{
"epoch": 0.11964584828906437,
"grad_norm": 0.1764182848939352,
"learning_rate": 0.00019977621214841822,
"loss": 1.1059,
"step": 125
},
{
"epoch": 0.12443168222062695,
"grad_norm": 0.1787380695504439,
"learning_rate": 0.0001996504048694409,
"loss": 1.1102,
"step": 130
},
{
"epoch": 0.12921751615218952,
"grad_norm": 0.18152638872711746,
"learning_rate": 0.00019949671214996445,
"loss": 1.0986,
"step": 135
},
{
"epoch": 0.13400335008375208,
"grad_norm": 0.1745817045352934,
"learning_rate": 0.00019931517699823547,
"loss": 1.085,
"step": 140
},
{
"epoch": 0.13878918401531468,
"grad_norm": 0.17554030590687575,
"learning_rate": 0.0001991058502137231,
"loss": 1.1363,
"step": 145
},
{
"epoch": 0.14357501794687724,
"grad_norm": 0.1800405053641118,
"learning_rate": 0.00019886879037290384,
"loss": 1.0924,
"step": 150
},
{
"epoch": 0.1483608518784398,
"grad_norm": 0.19148632700424054,
"learning_rate": 0.0001986040638128698,
"loss": 1.0824,
"step": 155
},
{
"epoch": 0.1531466858100024,
"grad_norm": 0.17313730878343153,
"learning_rate": 0.0001983117446127654,
"loss": 1.1071,
"step": 160
},
{
"epoch": 0.15793251974156497,
"grad_norm": 0.16961371492797625,
"learning_rate": 0.00019799191457305768,
"loss": 1.1311,
"step": 165
},
{
"epoch": 0.16271835367312754,
"grad_norm": 0.17327733992734365,
"learning_rate": 0.00019764466319264595,
"loss": 1.1133,
"step": 170
},
{
"epoch": 0.16750418760469013,
"grad_norm": 0.17176717180251114,
"learning_rate": 0.00019727008764381675,
"loss": 1.1153,
"step": 175
},
{
"epoch": 0.1722900215362527,
"grad_norm": 0.17138393560502058,
"learning_rate": 0.0001968682927450523,
"loss": 1.1006,
"step": 180
},
{
"epoch": 0.17707585546781526,
"grad_norm": 0.16465106542109514,
"learning_rate": 0.00019643939093169844,
"loss": 1.104,
"step": 185
},
{
"epoch": 0.18186168939937783,
"grad_norm": 0.1699157446206454,
"learning_rate": 0.00019598350222450178,
"loss": 1.1167,
"step": 190
},
{
"epoch": 0.18664752333094042,
"grad_norm": 0.17602023031266878,
"learning_rate": 0.00019550075419602408,
"loss": 1.1131,
"step": 195
},
{
"epoch": 0.191433357262503,
"grad_norm": 0.18282225583073394,
"learning_rate": 0.00019499128193494297,
"loss": 1.0889,
"step": 200
},
{
"epoch": 0.19621919119406556,
"grad_norm": 0.16528930064048408,
"learning_rate": 0.0001944552280082499,
"loss": 1.1013,
"step": 205
},
{
"epoch": 0.20100502512562815,
"grad_norm": 0.16504045631379008,
"learning_rate": 0.0001938927424213553,
"loss": 1.1003,
"step": 210
},
{
"epoch": 0.20579085905719072,
"grad_norm": 0.16559761424705557,
"learning_rate": 0.000193303982576112,
"loss": 1.0998,
"step": 215
},
{
"epoch": 0.21057669298875328,
"grad_norm": 0.16791680669282769,
"learning_rate": 0.0001926891132267692,
"loss": 1.0919,
"step": 220
},
{
"epoch": 0.21536252692031588,
"grad_norm": 0.16689008275055853,
"learning_rate": 0.00019204830643386868,
"loss": 1.1069,
"step": 225
},
{
"epoch": 0.22014836085187844,
"grad_norm": 0.17305744880787605,
"learning_rate": 0.00019138174151609683,
"loss": 1.1272,
"step": 230
},
{
"epoch": 0.224934194783441,
"grad_norm": 0.1618964004882894,
"learning_rate": 0.00019068960500010523,
"loss": 1.0827,
"step": 235
},
{
"epoch": 0.22972002871500358,
"grad_norm": 0.16411871929076272,
"learning_rate": 0.00018997209056831462,
"loss": 1.1164,
"step": 240
},
{
"epoch": 0.23450586264656617,
"grad_norm": 0.1691454723246668,
"learning_rate": 0.0001892293990047159,
"loss": 1.1079,
"step": 245
},
{
"epoch": 0.23929169657812874,
"grad_norm": 0.1649265891086064,
"learning_rate": 0.00018846173813868454,
"loss": 1.0825,
"step": 250
},
{
"epoch": 0.2440775305096913,
"grad_norm": 0.17018378559137046,
"learning_rate": 0.000187669322786823,
"loss": 1.1216,
"step": 255
},
{
"epoch": 0.2488633644412539,
"grad_norm": 0.1703316481606123,
"learning_rate": 0.0001868523746928479,
"loss": 1.0783,
"step": 260
},
{
"epoch": 0.25364919837281646,
"grad_norm": 0.16735434292797727,
"learning_rate": 0.0001860111224655391,
"loss": 1.1149,
"step": 265
},
{
"epoch": 0.25843503230437903,
"grad_norm": 0.1555718882454273,
"learning_rate": 0.0001851458015147673,
"loss": 1.1075,
"step": 270
},
{
"epoch": 0.2632208662359416,
"grad_norm": 0.16977397286003804,
"learning_rate": 0.00018425665398561883,
"loss": 1.0852,
"step": 275
},
{
"epoch": 0.26800670016750416,
"grad_norm": 0.1663432558952086,
"learning_rate": 0.00018334392869063536,
"loss": 1.0811,
"step": 280
},
{
"epoch": 0.2727925340990668,
"grad_norm": 0.1666368907912767,
"learning_rate": 0.00018240788104018822,
"loss": 1.1014,
"step": 285
},
{
"epoch": 0.27757836803062935,
"grad_norm": 0.15687924477714424,
"learning_rate": 0.00018144877297100606,
"loss": 1.0736,
"step": 290
},
{
"epoch": 0.2823642019621919,
"grad_norm": 0.16061101558707236,
"learning_rate": 0.0001804668728728764,
"loss": 1.0931,
"step": 295
},
{
"epoch": 0.2871500358937545,
"grad_norm": 0.160793101364396,
"learning_rate": 0.00017946245551354157,
"loss": 1.0999,
"step": 300
},
{
"epoch": 0.29193586982531705,
"grad_norm": 0.1633263725476254,
"learning_rate": 0.00017843580196180952,
"loss": 1.0948,
"step": 305
},
{
"epoch": 0.2967217037568796,
"grad_norm": 0.1631452387362867,
"learning_rate": 0.00017738719950890168,
"loss": 1.1013,
"step": 310
},
{
"epoch": 0.3015075376884422,
"grad_norm": 0.16210535171429089,
"learning_rate": 0.00017631694158805946,
"loss": 1.0798,
"step": 315
},
{
"epoch": 0.3062933716200048,
"grad_norm": 0.16229934636841442,
"learning_rate": 0.000175225327692432,
"loss": 1.0575,
"step": 320
},
{
"epoch": 0.3110792055515674,
"grad_norm": 0.16588195121854765,
"learning_rate": 0.00017411266329126824,
"loss": 1.096,
"step": 325
},
{
"epoch": 0.31586503948312994,
"grad_norm": 0.158210200244668,
"learning_rate": 0.00017297925974443673,
"loss": 1.1071,
"step": 330
},
{
"epoch": 0.3206508734146925,
"grad_norm": 0.1663784778299078,
"learning_rate": 0.00017182543421529676,
"loss": 1.0739,
"step": 335
},
{
"epoch": 0.32543670734625507,
"grad_norm": 0.15384028200238806,
"learning_rate": 0.00017065150958194586,
"loss": 1.0848,
"step": 340
},
{
"epoch": 0.33022254127781764,
"grad_norm": 0.16073214574887584,
"learning_rate": 0.00016945781434686783,
"loss": 1.1157,
"step": 345
},
{
"epoch": 0.33500837520938026,
"grad_norm": 0.1745939193140414,
"learning_rate": 0.00016824468254500704,
"loss": 1.0815,
"step": 350
},
{
"epoch": 0.3397942091409428,
"grad_norm": 0.15802897019970708,
"learning_rate": 0.0001670124536502947,
"loss": 1.0779,
"step": 355
},
{
"epoch": 0.3445800430725054,
"grad_norm": 0.1579431494377225,
"learning_rate": 0.00016576147248065267,
"loss": 1.1031,
"step": 360
},
{
"epoch": 0.34936587700406796,
"grad_norm": 0.16455288633589932,
"learning_rate": 0.00016449208910150232,
"loss": 1.1207,
"step": 365
},
{
"epoch": 0.3541517109356305,
"grad_norm": 0.15512720174775488,
"learning_rate": 0.00016320465872780477,
"loss": 1.0843,
"step": 370
},
{
"epoch": 0.3589375448671931,
"grad_norm": 0.15810739086397552,
"learning_rate": 0.00016189954162466012,
"loss": 1.0674,
"step": 375
},
{
"epoch": 0.36372337879875566,
"grad_norm": 0.15539897008538223,
"learning_rate": 0.0001605771030064934,
"loss": 1.1075,
"step": 380
},
{
"epoch": 0.3685092127303183,
"grad_norm": 0.16059302879871643,
"learning_rate": 0.00015923771293485585,
"loss": 1.1083,
"step": 385
},
{
"epoch": 0.37329504666188085,
"grad_norm": 0.1726863039386017,
"learning_rate": 0.00015788174621486934,
"loss": 1.0839,
"step": 390
},
{
"epoch": 0.3780808805934434,
"grad_norm": 0.160896911699282,
"learning_rate": 0.00015650958229034391,
"loss": 1.093,
"step": 395
},
{
"epoch": 0.382866714525006,
"grad_norm": 0.1539033105501165,
"learning_rate": 0.00015512160513759672,
"loss": 1.0824,
"step": 400
},
{
"epoch": 0.38765254845656855,
"grad_norm": 0.15253934847352404,
"learning_rate": 0.00015371820315800315,
"loss": 1.0611,
"step": 405
},
{
"epoch": 0.3924383823881311,
"grad_norm": 0.1549203336671571,
"learning_rate": 0.00015229976906930935,
"loss": 1.0926,
"step": 410
},
{
"epoch": 0.3972242163196937,
"grad_norm": 0.15736586699846142,
"learning_rate": 0.0001508666997957369,
"loss": 1.0838,
"step": 415
},
{
"epoch": 0.4020100502512563,
"grad_norm": 0.15414651629074486,
"learning_rate": 0.00014941939635691035,
"loss": 1.0962,
"step": 420
},
{
"epoch": 0.40679588418281887,
"grad_norm": 0.15216014768555902,
"learning_rate": 0.00014795826375563925,
"loss": 1.0837,
"step": 425
},
{
"epoch": 0.41158171811438143,
"grad_norm": 0.1551252486012846,
"learning_rate": 0.0001464837108645845,
"loss": 1.096,
"step": 430
},
{
"epoch": 0.416367552045944,
"grad_norm": 0.15880410911617168,
"learning_rate": 0.00014499615031184296,
"loss": 1.0947,
"step": 435
},
{
"epoch": 0.42115338597750657,
"grad_norm": 0.16084656769756484,
"learning_rate": 0.00014349599836548034,
"loss": 1.0955,
"step": 440
},
{
"epoch": 0.42593921990906913,
"grad_norm": 0.14942909791958908,
"learning_rate": 0.0001419836748170459,
"loss": 1.0911,
"step": 445
},
{
"epoch": 0.43072505384063176,
"grad_norm": 0.16134597273400678,
"learning_rate": 0.0001404596028641009,
"loss": 1.1136,
"step": 450
},
{
"epoch": 0.4355108877721943,
"grad_norm": 0.15552785776756606,
"learning_rate": 0.0001389242089917943,
"loss": 1.1005,
"step": 455
},
{
"epoch": 0.4402967217037569,
"grad_norm": 0.1544583591443468,
"learning_rate": 0.00013737792285351805,
"loss": 1.0896,
"step": 460
},
{
"epoch": 0.44508255563531945,
"grad_norm": 0.15743294110434283,
"learning_rate": 0.0001358211771506763,
"loss": 1.0687,
"step": 465
},
{
"epoch": 0.449868389566882,
"grad_norm": 0.15489693015617015,
"learning_rate": 0.00013425440751160112,
"loss": 1.0909,
"step": 470
},
{
"epoch": 0.4546542234984446,
"grad_norm": 0.1556280787651109,
"learning_rate": 0.00013267805236964967,
"loss": 1.1008,
"step": 475
},
{
"epoch": 0.45944005743000715,
"grad_norm": 0.16139496091159036,
"learning_rate": 0.00013109255284051615,
"loss": 1.1167,
"step": 480
},
{
"epoch": 0.4642258913615698,
"grad_norm": 0.15380326887200926,
"learning_rate": 0.00012949835259879304,
"loss": 1.1021,
"step": 485
},
{
"epoch": 0.46901172529313234,
"grad_norm": 0.1504710821626308,
"learning_rate": 0.00012789589775381676,
"loss": 1.0824,
"step": 490
},
{
"epoch": 0.4737975592246949,
"grad_norm": 0.16882632755621252,
"learning_rate": 0.00012628563672483146,
"loss": 1.091,
"step": 495
},
{
"epoch": 0.4785833931562575,
"grad_norm": 0.16236683430294702,
"learning_rate": 0.0001246680201155068,
"loss": 1.0609,
"step": 500
},
{
"epoch": 0.48336922708782004,
"grad_norm": 0.1534881294655078,
"learning_rate": 0.00012304350058784405,
"loss": 1.0611,
"step": 505
},
{
"epoch": 0.4881550610193826,
"grad_norm": 0.16620841316700394,
"learning_rate": 0.00012141253273550696,
"loss": 1.0932,
"step": 510
},
{
"epoch": 0.49294089495094523,
"grad_norm": 0.16942714030828704,
"learning_rate": 0.00011977557295661108,
"loss": 1.0856,
"step": 515
},
{
"epoch": 0.4977267288825078,
"grad_norm": 0.15500201031703087,
"learning_rate": 0.00011813307932600887,
"loss": 1.0852,
"step": 520
},
{
"epoch": 0.5025125628140703,
"grad_norm": 0.15248801968172002,
"learning_rate": 0.00011648551146710556,
"loss": 1.1069,
"step": 525
},
{
"epoch": 0.5072983967456329,
"grad_norm": 0.14978453385390675,
"learning_rate": 0.0001148333304232411,
"loss": 1.088,
"step": 530
},
{
"epoch": 0.5120842306771956,
"grad_norm": 0.14736066147246124,
"learning_rate": 0.00011317699852867548,
"loss": 1.0506,
"step": 535
},
{
"epoch": 0.5168700646087581,
"grad_norm": 0.15088998664120562,
"learning_rate": 0.0001115169792792124,
"loss": 1.0972,
"step": 540
},
{
"epoch": 0.5216558985403207,
"grad_norm": 0.14676026138747209,
"learning_rate": 0.00010985373720249801,
"loss": 1.0871,
"step": 545
},
{
"epoch": 0.5264417324718832,
"grad_norm": 0.17054822297185676,
"learning_rate": 0.00010818773772803082,
"loss": 1.0957,
"step": 550
},
{
"epoch": 0.5312275664034458,
"grad_norm": 0.15081743477470166,
"learning_rate": 0.0001065194470569193,
"loss": 1.1114,
"step": 555
},
{
"epoch": 0.5360134003350083,
"grad_norm": 0.1556600989117304,
"learning_rate": 0.0001048493320314238,
"loss": 1.0747,
"step": 560
},
{
"epoch": 0.540799234266571,
"grad_norm": 0.15346464585086714,
"learning_rate": 0.00010317786000431851,
"loss": 1.0761,
"step": 565
},
{
"epoch": 0.5455850681981336,
"grad_norm": 0.15178562379014646,
"learning_rate": 0.00010150549870811107,
"loss": 1.0839,
"step": 570
},
{
"epoch": 0.5503709021296961,
"grad_norm": 0.15263581024104103,
"learning_rate": 9.983271612415575e-05,
"loss": 1.0742,
"step": 575
},
{
"epoch": 0.5551567360612587,
"grad_norm": 0.15166582071053056,
"learning_rate": 9.81599803516968e-05,
"loss": 1.0725,
"step": 580
},
{
"epoch": 0.5599425699928212,
"grad_norm": 0.14735687803417952,
"learning_rate": 9.648775947687912e-05,
"loss": 1.0705,
"step": 585
},
{
"epoch": 0.5647284039243838,
"grad_norm": 0.14825818203221888,
"learning_rate": 9.48165214417624e-05,
"loss": 1.0871,
"step": 590
},
{
"epoch": 0.5695142378559463,
"grad_norm": 0.15700946642781993,
"learning_rate": 9.314673391337576e-05,
"loss": 1.0979,
"step": 595
},
{
"epoch": 0.574300071787509,
"grad_norm": 0.15580031067347558,
"learning_rate": 9.147886415284903e-05,
"loss": 1.0592,
"step": 600
},
{
"epoch": 0.5790859057190716,
"grad_norm": 0.14548002556094225,
"learning_rate": 8.981337888465788e-05,
"loss": 1.0787,
"step": 605
},
{
"epoch": 0.5838717396506341,
"grad_norm": 0.14237124600928142,
"learning_rate": 8.815074416601913e-05,
"loss": 1.0698,
"step": 610
},
{
"epoch": 0.5886575735821967,
"grad_norm": 0.15304745525626437,
"learning_rate": 8.649142525647272e-05,
"loss": 1.0848,
"step": 615
},
{
"epoch": 0.5934434075137592,
"grad_norm": 0.14513336716190856,
"learning_rate": 8.48358864876867e-05,
"loss": 1.0462,
"step": 620
},
{
"epoch": 0.5982292414453219,
"grad_norm": 0.1468415945819683,
"learning_rate": 8.318459113352221e-05,
"loss": 1.0906,
"step": 625
},
{
"epoch": 0.6030150753768844,
"grad_norm": 0.14408143553897426,
"learning_rate": 8.153800128039441e-05,
"loss": 1.085,
"step": 630
},
{
"epoch": 0.607800909308447,
"grad_norm": 0.15046217184291616,
"learning_rate": 7.989657769796533e-05,
"loss": 1.0882,
"step": 635
},
{
"epoch": 0.6125867432400096,
"grad_norm": 0.14348283659906289,
"learning_rate": 7.82607797102056e-05,
"loss": 1.0861,
"step": 640
},
{
"epoch": 0.6173725771715721,
"grad_norm": 0.14685503152106738,
"learning_rate": 7.663106506686057e-05,
"loss": 1.1003,
"step": 645
},
{
"epoch": 0.6221584111031347,
"grad_norm": 0.1480277391784376,
"learning_rate": 7.500788981535708e-05,
"loss": 1.0758,
"step": 650
},
{
"epoch": 0.6269442450346973,
"grad_norm": 0.1477910922274185,
"learning_rate": 7.339170817318625e-05,
"loss": 1.0695,
"step": 655
},
{
"epoch": 0.6317300789662599,
"grad_norm": 0.1551465349289344,
"learning_rate": 7.178297240079882e-05,
"loss": 1.0942,
"step": 660
},
{
"epoch": 0.6365159128978225,
"grad_norm": 0.148811465121087,
"learning_rate": 7.018213267504775e-05,
"loss": 1.0825,
"step": 665
},
{
"epoch": 0.641301746829385,
"grad_norm": 0.146937156337137,
"learning_rate": 6.858963696321403e-05,
"loss": 1.0985,
"step": 670
},
{
"epoch": 0.6460875807609476,
"grad_norm": 0.14703161191479286,
"learning_rate": 6.700593089765086e-05,
"loss": 1.06,
"step": 675
},
{
"epoch": 0.6508734146925101,
"grad_norm": 0.14564360148371303,
"learning_rate": 6.543145765108106e-05,
"loss": 1.0853,
"step": 680
},
{
"epoch": 0.6556592486240728,
"grad_norm": 0.14887365645849163,
"learning_rate": 6.3866657812583e-05,
"loss": 1.0787,
"step": 685
},
{
"epoch": 0.6604450825556353,
"grad_norm": 0.14533659914404762,
"learning_rate": 6.231196926429913e-05,
"loss": 1.073,
"step": 690
},
{
"epoch": 0.6652309164871979,
"grad_norm": 0.2354314895944445,
"learning_rate": 6.076782705890257e-05,
"loss": 1.0815,
"step": 695
},
{
"epoch": 0.6700167504187605,
"grad_norm": 0.14132233475416703,
"learning_rate": 5.9234663297854876e-05,
"loss": 1.0555,
"step": 700
},
{
"epoch": 0.674802584350323,
"grad_norm": 0.14913316600220797,
"learning_rate": 5.7712907010490036e-05,
"loss": 1.0785,
"step": 705
},
{
"epoch": 0.6795884182818857,
"grad_norm": 0.15328072297180578,
"learning_rate": 5.620298403395805e-05,
"loss": 1.0857,
"step": 710
},
{
"epoch": 0.6843742522134482,
"grad_norm": 0.17603388258774993,
"learning_rate": 5.4705316894061765e-05,
"loss": 1.0898,
"step": 715
},
{
"epoch": 0.6891600861450108,
"grad_norm": 0.1448443355064005,
"learning_rate": 5.322032468702036e-05,
"loss": 1.0714,
"step": 720
},
{
"epoch": 0.6939459200765733,
"grad_norm": 0.4624474555190123,
"learning_rate": 5.1748422962192376e-05,
"loss": 1.0994,
"step": 725
},
{
"epoch": 0.6987317540081359,
"grad_norm": 0.14868980834848183,
"learning_rate": 5.0290023605791666e-05,
"loss": 1.0725,
"step": 730
},
{
"epoch": 0.7035175879396985,
"grad_norm": 0.15278504704361137,
"learning_rate": 4.8845534725628086e-05,
"loss": 1.0962,
"step": 735
},
{
"epoch": 0.708303421871261,
"grad_norm": 0.14605679246576617,
"learning_rate": 4.741536053690552e-05,
"loss": 1.0947,
"step": 740
},
{
"epoch": 0.7130892558028237,
"grad_norm": 0.172204603811799,
"learning_rate": 4.599990124910918e-05,
"loss": 1.0758,
"step": 745
},
{
"epoch": 0.7178750897343862,
"grad_norm": 0.14357849865669614,
"learning_rate": 4.4599552954014145e-05,
"loss": 1.0682,
"step": 750
},
{
"epoch": 0.7226609236659488,
"grad_norm": 0.14980923833672957,
"learning_rate": 4.32147075148458e-05,
"loss": 1.0814,
"step": 755
},
{
"epoch": 0.7274467575975113,
"grad_norm": 0.16395768222951593,
"learning_rate": 4.1845752456623665e-05,
"loss": 1.0583,
"step": 760
},
{
"epoch": 0.7322325915290739,
"grad_norm": 0.14059821304657993,
"learning_rate": 4.049307085771931e-05,
"loss": 1.0839,
"step": 765
},
{
"epoch": 0.7370184254606366,
"grad_norm": 0.1472110334031576,
"learning_rate": 3.9157041242658477e-05,
"loss": 1.1079,
"step": 770
},
{
"epoch": 0.7418042593921991,
"grad_norm": 0.14020342123522012,
"learning_rate": 3.783803747619741e-05,
"loss": 1.0829,
"step": 775
},
{
"epoch": 0.7465900933237617,
"grad_norm": 0.17437047699695307,
"learning_rate": 3.653642865870359e-05,
"loss": 1.0808,
"step": 780
},
{
"epoch": 0.7513759272553242,
"grad_norm": 0.14320013892049976,
"learning_rate": 3.525257902286908e-05,
"loss": 1.0608,
"step": 785
},
{
"epoch": 0.7561617611868868,
"grad_norm": 0.14437417000631428,
"learning_rate": 3.398684783178648e-05,
"loss": 1.0618,
"step": 790
},
{
"epoch": 0.7609475951184493,
"grad_norm": 0.14321363672597254,
"learning_rate": 3.273958927841525e-05,
"loss": 1.0659,
"step": 795
},
{
"epoch": 0.765733429050012,
"grad_norm": 0.14121990349576288,
"learning_rate": 3.1511152386467055e-05,
"loss": 1.0936,
"step": 800
},
{
"epoch": 0.7705192629815746,
"grad_norm": 0.16146069783583863,
"learning_rate": 3.0301880912737568e-05,
"loss": 1.0647,
"step": 805
},
{
"epoch": 0.7753050969131371,
"grad_norm": 0.1447026626027737,
"learning_rate": 2.9112113250911844e-05,
"loss": 1.0747,
"step": 810
},
{
"epoch": 0.7800909308446997,
"grad_norm": 0.14724228311552523,
"learning_rate": 2.7942182336870925e-05,
"loss": 1.1046,
"step": 815
},
{
"epoch": 0.7848767647762622,
"grad_norm": 0.14612792897080507,
"learning_rate": 2.6792415555525463e-05,
"loss": 1.0391,
"step": 820
},
{
"epoch": 0.7896625987078248,
"grad_norm": 0.14445016139434405,
"learning_rate": 2.5663134649202647e-05,
"loss": 1.0808,
"step": 825
},
{
"epoch": 0.7944484326393874,
"grad_norm": 0.14283033243615206,
"learning_rate": 2.4554655627612245e-05,
"loss": 1.0767,
"step": 830
},
{
"epoch": 0.79923426657095,
"grad_norm": 0.1428104588189023,
"learning_rate": 2.34672886794167e-05,
"loss": 1.0884,
"step": 835
},
{
"epoch": 0.8040201005025126,
"grad_norm": 0.14106416222944104,
"learning_rate": 2.2401338085430323e-05,
"loss": 1.0891,
"step": 840
},
{
"epoch": 0.8088059344340751,
"grad_norm": 0.14453431354715718,
"learning_rate": 2.135710213347134e-05,
"loss": 1.0829,
"step": 845
},
{
"epoch": 0.8135917683656377,
"grad_norm": 0.1436138017414945,
"learning_rate": 2.0334873034891554e-05,
"loss": 1.0823,
"step": 850
},
{
"epoch": 0.8183776022972002,
"grad_norm": 0.14415504753616376,
"learning_rate": 1.933493684280574e-05,
"loss": 1.0749,
"step": 855
},
{
"epoch": 0.8231634362287629,
"grad_norm": 0.14188286670890893,
"learning_rate": 1.8357573372044834e-05,
"loss": 1.0775,
"step": 860
},
{
"epoch": 0.8279492701603255,
"grad_norm": 0.14043422592547342,
"learning_rate": 1.740305612085439e-05,
"loss": 1.0852,
"step": 865
},
{
"epoch": 0.832735104091888,
"grad_norm": 0.14014109535516273,
"learning_rate": 1.647165219436113e-05,
"loss": 1.0716,
"step": 870
},
{
"epoch": 0.8375209380234506,
"grad_norm": 0.18266681120520475,
"learning_rate": 1.556362222982799e-05,
"loss": 1.0711,
"step": 875
},
{
"epoch": 0.8423067719550131,
"grad_norm": 0.14585487433506303,
"learning_rate": 1.4679220323719234e-05,
"loss": 1.0561,
"step": 880
},
{
"epoch": 0.8470926058865758,
"grad_norm": 0.13911103035630754,
"learning_rate": 1.3818693960596185e-05,
"loss": 1.0707,
"step": 885
},
{
"epoch": 0.8518784398181383,
"grad_norm": 0.15612123605821987,
"learning_rate": 1.2982283943862738e-05,
"loss": 1.0494,
"step": 890
},
{
"epoch": 0.8566642737497009,
"grad_norm": 0.14067555622023134,
"learning_rate": 1.217022432838093e-05,
"loss": 1.0686,
"step": 895
},
{
"epoch": 0.8614501076812635,
"grad_norm": 0.1457410414761679,
"learning_rate": 1.1382742354974429e-05,
"loss": 1.0562,
"step": 900
},
{
"epoch": 0.866235941612826,
"grad_norm": 0.1398250627278749,
"learning_rate": 1.0620058386839393e-05,
"loss": 1.0753,
"step": 905
},
{
"epoch": 0.8710217755443886,
"grad_norm": 0.14690238478434312,
"learning_rate": 9.882385847879539e-06,
"loss": 1.0539,
"step": 910
},
{
"epoch": 0.8758076094759512,
"grad_norm": 0.14224902345010998,
"learning_rate": 9.169931162983137e-06,
"loss": 1.0575,
"step": 915
},
{
"epoch": 0.8805934434075138,
"grad_norm": 0.14002967562121116,
"learning_rate": 8.482893700258643e-06,
"loss": 1.0831,
"step": 920
},
{
"epoch": 0.8853792773390763,
"grad_norm": 0.14652920530592364,
"learning_rate": 7.821465715244947e-06,
"loss": 1.0844,
"step": 925
},
{
"epoch": 0.8901651112706389,
"grad_norm": 0.13985808750925746,
"learning_rate": 7.185832297111938e-06,
"loss": 1.0618,
"step": 930
},
{
"epoch": 0.8949509452022015,
"grad_norm": 0.15160308510490375,
"learning_rate": 6.576171316866608e-06,
"loss": 1.0773,
"step": 935
},
{
"epoch": 0.899736779133764,
"grad_norm": 0.14784429409642344,
"learning_rate": 5.9926533775789055e-06,
"loss": 1.0951,
"step": 940
},
{
"epoch": 0.9045226130653267,
"grad_norm": 0.14167088318411009,
"learning_rate": 5.435441766641369e-06,
"loss": 1.0841,
"step": 945
},
{
"epoch": 0.9093084469968892,
"grad_norm": 0.14256818695069146,
"learning_rate": 4.904692410075973e-06,
"loss": 1.0647,
"step": 950
},
{
"epoch": 0.9140942809284518,
"grad_norm": 0.15531748633710526,
"learning_rate": 4.400553828900989e-06,
"loss": 1.0757,
"step": 955
},
{
"epoch": 0.9188801148600143,
"grad_norm": 0.14420681549864126,
"learning_rate": 3.923167097569935e-06,
"loss": 1.0903,
"step": 960
},
{
"epoch": 0.9236659487915769,
"grad_norm": 0.14398010788396462,
"learning_rate": 3.4726658044943126e-06,
"loss": 1.0668,
"step": 965
},
{
"epoch": 0.9284517827231396,
"grad_norm": 0.14589900176146645,
"learning_rate": 3.0491760146611926e-06,
"loss": 1.0845,
"step": 970
},
{
"epoch": 0.9332376166547021,
"grad_norm": 0.13882750982702796,
"learning_rate": 2.652816234356159e-06,
"loss": 1.0382,
"step": 975
},
{
"epoch": 0.9380234505862647,
"grad_norm": 0.14112035905216325,
"learning_rate": 2.283697378001315e-06,
"loss": 1.0825,
"step": 980
},
{
"epoch": 0.9428092845178272,
"grad_norm": 0.13934480624047157,
"learning_rate": 1.9419227371178627e-06,
"loss": 1.0679,
"step": 985
},
{
"epoch": 0.9475951184493898,
"grad_norm": 0.14117739445269173,
"learning_rate": 1.6275879514217052e-06,
"loss": 1.0772,
"step": 990
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.14031209854381504,
"learning_rate": 1.3407809820603856e-06,
"loss": 1.0767,
"step": 995
},
{
"epoch": 0.957166786312515,
"grad_norm": 0.14091355128035063,
"learning_rate": 1.0815820869985893e-06,
"loss": 1.0635,
"step": 1000
},
{
"epoch": 0.9619526202440776,
"grad_norm": 0.14100298765660577,
"learning_rate": 8.50063798559475e-07,
"loss": 1.0861,
"step": 1005
},
{
"epoch": 0.9667384541756401,
"grad_norm": 0.1412224316948679,
"learning_rate": 6.462909031276443e-07,
"loss": 1.0633,
"step": 1010
},
{
"epoch": 0.9715242881072027,
"grad_norm": 0.1385906964183353,
"learning_rate": 4.7032042301985434e-07,
"loss": 1.0726,
"step": 1015
},
{
"epoch": 0.9763101220387652,
"grad_norm": 0.151976410727331,
"learning_rate": 3.222016005282824e-07,
"loss": 1.0645,
"step": 1020
},
{
"epoch": 0.9810959559703278,
"grad_norm": 0.14304003914264313,
"learning_rate": 2.0197588414094804e-07,
"loss": 1.0785,
"step": 1025
},
{
"epoch": 0.9858817899018905,
"grad_norm": 0.1395525833943131,
"learning_rate": 1.0967691694302451e-07,
"loss": 1.0582,
"step": 1030
},
{
"epoch": 0.990667623833453,
"grad_norm": 0.14208441339069042,
"learning_rate": 4.5330527202480654e-08,
"loss": 1.0763,
"step": 1035
},
{
"epoch": 0.9954534577650156,
"grad_norm": 0.13931375530799342,
"learning_rate": 8.95472114241791e-09,
"loss": 1.0444,
"step": 1040
},
{
"epoch": 0.9992821249102656,
"eval_loss": 1.077100157737732,
"eval_runtime": 3923.6787,
"eval_samples_per_second": 3.43,
"eval_steps_per_second": 0.858,
"step": 1044
},
{
"epoch": 0.9992821249102656,
"step": 1044,
"total_flos": 2155604625850368.0,
"train_loss": 1.091635976486279,
"train_runtime": 24351.9745,
"train_samples_per_second": 2.746,
"train_steps_per_second": 0.043
}
],
"logging_steps": 5,
"max_steps": 1044,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2155604625850368.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}