{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992821249102656,
"eval_steps": 500,
"global_step": 1044,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000957166786312515,
"grad_norm": 3.9142152723532337,
"learning_rate": 1.9047619047619051e-06,
"loss": 1.3978,
"step": 1
},
{
"epoch": 0.004785833931562575,
"grad_norm": 1.3430217420733046,
"learning_rate": 9.523809523809523e-06,
"loss": 1.3489,
"step": 5
},
{
"epoch": 0.00957166786312515,
"grad_norm": 0.5850408636793494,
"learning_rate": 1.9047619047619046e-05,
"loss": 1.2871,
"step": 10
},
{
"epoch": 0.014357501794687724,
"grad_norm": 0.46666716038967326,
"learning_rate": 2.857142857142857e-05,
"loss": 1.2106,
"step": 15
},
{
"epoch": 0.0191433357262503,
"grad_norm": 0.35044064248530404,
"learning_rate": 3.809523809523809e-05,
"loss": 1.189,
"step": 20
},
{
"epoch": 0.023929169657812874,
"grad_norm": 0.27361957875198517,
"learning_rate": 4.761904761904762e-05,
"loss": 1.1469,
"step": 25
},
{
"epoch": 0.028715003589375447,
"grad_norm": 0.2368453005937937,
"learning_rate": 5.714285714285714e-05,
"loss": 1.158,
"step": 30
},
{
"epoch": 0.03350083752093802,
"grad_norm": 0.2277332385016794,
"learning_rate": 6.666666666666667e-05,
"loss": 1.1437,
"step": 35
},
{
"epoch": 0.0382866714525006,
"grad_norm": 0.2265653549311157,
"learning_rate": 7.619047619047618e-05,
"loss": 1.1302,
"step": 40
},
{
"epoch": 0.043072505384063174,
"grad_norm": 0.22079711284915807,
"learning_rate": 8.571428571428571e-05,
"loss": 1.13,
"step": 45
},
{
"epoch": 0.04785833931562575,
"grad_norm": 0.20813516832540208,
"learning_rate": 9.523809523809524e-05,
"loss": 1.106,
"step": 50
},
{
"epoch": 0.05264417324718832,
"grad_norm": 0.2044131638757028,
"learning_rate": 0.00010476190476190477,
"loss": 1.1348,
"step": 55
},
{
"epoch": 0.057430007178750894,
"grad_norm": 0.20101729146508107,
"learning_rate": 0.00011428571428571428,
"loss": 1.1018,
"step": 60
},
{
"epoch": 0.062215841110313475,
"grad_norm": 0.21865369935553125,
"learning_rate": 0.0001238095238095238,
"loss": 1.1129,
"step": 65
},
{
"epoch": 0.06700167504187604,
"grad_norm": 0.18405578482864565,
"learning_rate": 0.00013333333333333334,
"loss": 1.1018,
"step": 70
},
{
"epoch": 0.07178750897343862,
"grad_norm": 0.18488079729650672,
"learning_rate": 0.00014285714285714287,
"loss": 1.1417,
"step": 75
},
{
"epoch": 0.0765733429050012,
"grad_norm": 0.18433481759594844,
"learning_rate": 0.00015238095238095237,
"loss": 1.111,
"step": 80
},
{
"epoch": 0.08135917683656377,
"grad_norm": 0.20377971597879482,
"learning_rate": 0.00016190476190476192,
"loss": 1.0709,
"step": 85
},
{
"epoch": 0.08614501076812635,
"grad_norm": 0.20225554382239913,
"learning_rate": 0.00017142857142857143,
"loss": 1.1142,
"step": 90
},
{
"epoch": 0.09093084469968891,
"grad_norm": 0.18520967333311886,
"learning_rate": 0.00018095238095238095,
"loss": 1.1142,
"step": 95
},
{
"epoch": 0.0957166786312515,
"grad_norm": 0.19606367225373053,
"learning_rate": 0.00019047619047619048,
"loss": 1.1049,
"step": 100
},
{
"epoch": 0.10050251256281408,
"grad_norm": 0.1867473714189168,
"learning_rate": 0.0002,
"loss": 1.0927,
"step": 105
},
{
"epoch": 0.10528834649437664,
"grad_norm": 0.185817071062854,
"learning_rate": 0.00019998600836567816,
"loss": 1.1206,
"step": 110
},
{
"epoch": 0.11007418042593922,
"grad_norm": 0.1762396939142846,
"learning_rate": 0.00019994403737802927,
"loss": 1.1022,
"step": 115
},
{
"epoch": 0.11486001435750179,
"grad_norm": 0.16668242539032083,
"learning_rate": 0.00019987409878190752,
"loss": 1.1052,
"step": 120
},
{
"epoch": 0.11964584828906437,
"grad_norm": 0.1764182848939352,
"learning_rate": 0.00019977621214841822,
"loss": 1.1059,
"step": 125
},
{
"epoch": 0.12443168222062695,
"grad_norm": 0.1787380695504439,
"learning_rate": 0.0001996504048694409,
"loss": 1.1102,
"step": 130
},
{
"epoch": 0.12921751615218952,
"grad_norm": 0.18152638872711746,
"learning_rate": 0.00019949671214996445,
"loss": 1.0986,
"step": 135
},
{
"epoch": 0.13400335008375208,
"grad_norm": 0.1745817045352934,
"learning_rate": 0.00019931517699823547,
"loss": 1.085,
"step": 140
},
{
"epoch": 0.13878918401531468,
"grad_norm": 0.17554030590687575,
"learning_rate": 0.0001991058502137231,
"loss": 1.1363,
"step": 145
},
{
"epoch": 0.14357501794687724,
"grad_norm": 0.1800405053641118,
"learning_rate": 0.00019886879037290384,
"loss": 1.0924,
"step": 150
},
{
"epoch": 0.1483608518784398,
"grad_norm": 0.19148632700424054,
"learning_rate": 0.0001986040638128698,
"loss": 1.0824,
"step": 155
},
{
"epoch": 0.1531466858100024,
"grad_norm": 0.17313730878343153,
"learning_rate": 0.0001983117446127654,
"loss": 1.1071,
"step": 160
},
{
"epoch": 0.15793251974156497,
"grad_norm": 0.16961371492797625,
"learning_rate": 0.00019799191457305768,
"loss": 1.1311,
"step": 165
},
{
"epoch": 0.16271835367312754,
"grad_norm": 0.17327733992734365,
"learning_rate": 0.00019764466319264595,
"loss": 1.1133,
"step": 170
},
{
"epoch": 0.16750418760469013,
"grad_norm": 0.17176717180251114,
"learning_rate": 0.00019727008764381675,
"loss": 1.1153,
"step": 175
},
{
"epoch": 0.1722900215362527,
"grad_norm": 0.17138393560502058,
"learning_rate": 0.0001968682927450523,
"loss": 1.1006,
"step": 180
},
{
"epoch": 0.17707585546781526,
"grad_norm": 0.16465106542109514,
"learning_rate": 0.00019643939093169844,
"loss": 1.104,
"step": 185
},
{
"epoch": 0.18186168939937783,
"grad_norm": 0.1699157446206454,
"learning_rate": 0.00019598350222450178,
"loss": 1.1167,
"step": 190
},
{
"epoch": 0.18664752333094042,
"grad_norm": 0.17602023031266878,
"learning_rate": 0.00019550075419602408,
"loss": 1.1131,
"step": 195
},
{
"epoch": 0.191433357262503,
"grad_norm": 0.18282225583073394,
"learning_rate": 0.00019499128193494297,
"loss": 1.0889,
"step": 200
},
{
"epoch": 0.19621919119406556,
"grad_norm": 0.16528930064048408,
"learning_rate": 0.0001944552280082499,
"loss": 1.1013,
"step": 205
},
{
"epoch": 0.20100502512562815,
"grad_norm": 0.16504045631379008,
"learning_rate": 0.0001938927424213553,
"loss": 1.1003,
"step": 210
},
{
"epoch": 0.20579085905719072,
"grad_norm": 0.16559761424705557,
"learning_rate": 0.000193303982576112,
"loss": 1.0998,
"step": 215
},
{
"epoch": 0.21057669298875328,
"grad_norm": 0.16791680669282769,
"learning_rate": 0.0001926891132267692,
"loss": 1.0919,
"step": 220
},
{
"epoch": 0.21536252692031588,
"grad_norm": 0.16689008275055853,
"learning_rate": 0.00019204830643386868,
"loss": 1.1069,
"step": 225
},
{
"epoch": 0.22014836085187844,
"grad_norm": 0.17305744880787605,
"learning_rate": 0.00019138174151609683,
"loss": 1.1272,
"step": 230
},
{
"epoch": 0.224934194783441,
"grad_norm": 0.1618964004882894,
"learning_rate": 0.00019068960500010523,
"loss": 1.0827,
"step": 235
},
{
"epoch": 0.22972002871500358,
"grad_norm": 0.16411871929076272,
"learning_rate": 0.00018997209056831462,
"loss": 1.1164,
"step": 240
},
{
"epoch": 0.23450586264656617,
"grad_norm": 0.1691454723246668,
"learning_rate": 0.0001892293990047159,
"loss": 1.1079,
"step": 245
},
{
"epoch": 0.23929169657812874,
"grad_norm": 0.1649265891086064,
"learning_rate": 0.00018846173813868454,
"loss": 1.0825,
"step": 250
},
{
"epoch": 0.2440775305096913,
"grad_norm": 0.17018378559137046,
"learning_rate": 0.000187669322786823,
"loss": 1.1216,
"step": 255
},
{
"epoch": 0.2488633644412539,
"grad_norm": 0.1703316481606123,
"learning_rate": 0.0001868523746928479,
"loss": 1.0783,
"step": 260
},
{
"epoch": 0.25364919837281646,
"grad_norm": 0.16735434292797727,
"learning_rate": 0.0001860111224655391,
"loss": 1.1149,
"step": 265
},
{
"epoch": 0.25843503230437903,
"grad_norm": 0.1555718882454273,
"learning_rate": 0.0001851458015147673,
"loss": 1.1075,
"step": 270
},
{
"epoch": 0.2632208662359416,
"grad_norm": 0.16977397286003804,
"learning_rate": 0.00018425665398561883,
"loss": 1.0852,
"step": 275
},
{
"epoch": 0.26800670016750416,
"grad_norm": 0.1663432558952086,
"learning_rate": 0.00018334392869063536,
"loss": 1.0811,
"step": 280
},
{
"epoch": 0.2727925340990668,
"grad_norm": 0.1666368907912767,
"learning_rate": 0.00018240788104018822,
"loss": 1.1014,
"step": 285
},
{
"epoch": 0.27757836803062935,
"grad_norm": 0.15687924477714424,
"learning_rate": 0.00018144877297100606,
"loss": 1.0736,
"step": 290
},
{
"epoch": 0.2823642019621919,
"grad_norm": 0.16061101558707236,
"learning_rate": 0.0001804668728728764,
"loss": 1.0931,
"step": 295
},
{
"epoch": 0.2871500358937545,
"grad_norm": 0.160793101364396,
"learning_rate": 0.00017946245551354157,
"loss": 1.0999,
"step": 300
},
{
"epoch": 0.29193586982531705,
"grad_norm": 0.1633263725476254,
"learning_rate": 0.00017843580196180952,
"loss": 1.0948,
"step": 305
},
{
"epoch": 0.2967217037568796,
"grad_norm": 0.1631452387362867,
"learning_rate": 0.00017738719950890168,
"loss": 1.1013,
"step": 310
},
{
"epoch": 0.3015075376884422,
"grad_norm": 0.16210535171429089,
"learning_rate": 0.00017631694158805946,
"loss": 1.0798,
"step": 315
},
{
"epoch": 0.3062933716200048,
"grad_norm": 0.16229934636841442,
"learning_rate": 0.000175225327692432,
"loss": 1.0575,
"step": 320
},
{
"epoch": 0.3110792055515674,
"grad_norm": 0.16588195121854765,
"learning_rate": 0.00017411266329126824,
"loss": 1.096,
"step": 325
},
{
"epoch": 0.31586503948312994,
"grad_norm": 0.158210200244668,
"learning_rate": 0.00017297925974443673,
"loss": 1.1071,
"step": 330
},
{
"epoch": 0.3206508734146925,
"grad_norm": 0.1663784778299078,
"learning_rate": 0.00017182543421529676,
"loss": 1.0739,
"step": 335
},
{
"epoch": 0.32543670734625507,
"grad_norm": 0.15384028200238806,
"learning_rate": 0.00017065150958194586,
"loss": 1.0848,
"step": 340
},
{
"epoch": 0.33022254127781764,
"grad_norm": 0.16073214574887584,
"learning_rate": 0.00016945781434686783,
"loss": 1.1157,
"step": 345
},
{
"epoch": 0.33500837520938026,
"grad_norm": 0.1745939193140414,
"learning_rate": 0.00016824468254500704,
"loss": 1.0815,
"step": 350
},
{
"epoch": 0.3397942091409428,
"grad_norm": 0.15802897019970708,
"learning_rate": 0.0001670124536502947,
"loss": 1.0779,
"step": 355
},
{
"epoch": 0.3445800430725054,
"grad_norm": 0.1579431494377225,
"learning_rate": 0.00016576147248065267,
"loss": 1.1031,
"step": 360
},
{
"epoch": 0.34936587700406796,
"grad_norm": 0.16455288633589932,
"learning_rate": 0.00016449208910150232,
"loss": 1.1207,
"step": 365
},
{
"epoch": 0.3541517109356305,
"grad_norm": 0.15512720174775488,
"learning_rate": 0.00016320465872780477,
"loss": 1.0843,
"step": 370
},
{
"epoch": 0.3589375448671931,
"grad_norm": 0.15810739086397552,
"learning_rate": 0.00016189954162466012,
"loss": 1.0674,
"step": 375
},
{
"epoch": 0.36372337879875566,
"grad_norm": 0.15539897008538223,
"learning_rate": 0.0001605771030064934,
"loss": 1.1075,
"step": 380
},
{
"epoch": 0.3685092127303183,
"grad_norm": 0.16059302879871643,
"learning_rate": 0.00015923771293485585,
"loss": 1.1083,
"step": 385
},
{
"epoch": 0.37329504666188085,
"grad_norm": 0.1726863039386017,
"learning_rate": 0.00015788174621486934,
"loss": 1.0839,
"step": 390
},
{
"epoch": 0.3780808805934434,
"grad_norm": 0.160896911699282,
"learning_rate": 0.00015650958229034391,
"loss": 1.093,
"step": 395
},
{
"epoch": 0.382866714525006,
"grad_norm": 0.1539033105501165,
"learning_rate": 0.00015512160513759672,
"loss": 1.0824,
"step": 400
},
{
"epoch": 0.38765254845656855,
"grad_norm": 0.15253934847352404,
"learning_rate": 0.00015371820315800315,
"loss": 1.0611,
"step": 405
},
{
"epoch": 0.3924383823881311,
"grad_norm": 0.1549203336671571,
"learning_rate": 0.00015229976906930935,
"loss": 1.0926,
"step": 410
},
{
"epoch": 0.3972242163196937,
"grad_norm": 0.15736586699846142,
"learning_rate": 0.0001508666997957369,
"loss": 1.0838,
"step": 415
},
{
"epoch": 0.4020100502512563,
"grad_norm": 0.15414651629074486,
"learning_rate": 0.00014941939635691035,
"loss": 1.0962,
"step": 420
},
{
"epoch": 0.40679588418281887,
"grad_norm": 0.15216014768555902,
"learning_rate": 0.00014795826375563925,
"loss": 1.0837,
"step": 425
},
{
"epoch": 0.41158171811438143,
"grad_norm": 0.1551252486012846,
"learning_rate": 0.0001464837108645845,
"loss": 1.096,
"step": 430
},
{
"epoch": 0.416367552045944,
"grad_norm": 0.15880410911617168,
"learning_rate": 0.00014499615031184296,
"loss": 1.0947,
"step": 435
},
{
"epoch": 0.42115338597750657,
"grad_norm": 0.16084656769756484,
"learning_rate": 0.00014349599836548034,
"loss": 1.0955,
"step": 440
},
{
"epoch": 0.42593921990906913,
"grad_norm": 0.14942909791958908,
"learning_rate": 0.0001419836748170459,
"loss": 1.0911,
"step": 445
},
{
"epoch": 0.43072505384063176,
"grad_norm": 0.16134597273400678,
"learning_rate": 0.0001404596028641009,
"loss": 1.1136,
"step": 450
},
{
"epoch": 0.4355108877721943,
"grad_norm": 0.15552785776756606,
"learning_rate": 0.0001389242089917943,
"loss": 1.1005,
"step": 455
},
{
"epoch": 0.4402967217037569,
"grad_norm": 0.1544583591443468,
"learning_rate": 0.00013737792285351805,
"loss": 1.0896,
"step": 460
},
{
"epoch": 0.44508255563531945,
"grad_norm": 0.15743294110434283,
"learning_rate": 0.0001358211771506763,
"loss": 1.0687,
"step": 465
},
{
"epoch": 0.449868389566882,
"grad_norm": 0.15489693015617015,
"learning_rate": 0.00013425440751160112,
"loss": 1.0909,
"step": 470
},
{
"epoch": 0.4546542234984446,
"grad_norm": 0.1556280787651109,
"learning_rate": 0.00013267805236964967,
"loss": 1.1008,
"step": 475
},
{
"epoch": 0.45944005743000715,
"grad_norm": 0.16139496091159036,
"learning_rate": 0.00013109255284051615,
"loss": 1.1167,
"step": 480
},
{
"epoch": 0.4642258913615698,
"grad_norm": 0.15380326887200926,
"learning_rate": 0.00012949835259879304,
"loss": 1.1021,
"step": 485
},
{
"epoch": 0.46901172529313234,
"grad_norm": 0.1504710821626308,
"learning_rate": 0.00012789589775381676,
"loss": 1.0824,
"step": 490
},
{
"epoch": 0.4737975592246949,
"grad_norm": 0.16882632755621252,
"learning_rate": 0.00012628563672483146,
"loss": 1.091,
"step": 495
},
{
"epoch": 0.4785833931562575,
"grad_norm": 0.16236683430294702,
"learning_rate": 0.0001246680201155068,
"loss": 1.0609,
"step": 500
},
{
"epoch": 0.48336922708782004,
"grad_norm": 0.1534881294655078,
"learning_rate": 0.00012304350058784405,
"loss": 1.0611,
"step": 505
},
{
"epoch": 0.4881550610193826,
"grad_norm": 0.16620841316700394,
"learning_rate": 0.00012141253273550696,
"loss": 1.0932,
"step": 510
},
{
"epoch": 0.49294089495094523,
"grad_norm": 0.16942714030828704,
"learning_rate": 0.00011977557295661108,
"loss": 1.0856,
"step": 515
},
{
"epoch": 0.4977267288825078,
"grad_norm": 0.15500201031703087,
"learning_rate": 0.00011813307932600887,
"loss": 1.0852,
"step": 520
},
{
"epoch": 0.5025125628140703,
"grad_norm": 0.15248801968172002,
"learning_rate": 0.00011648551146710556,
"loss": 1.1069,
"step": 525
},
{
"epoch": 0.5072983967456329,
"grad_norm": 0.14978453385390675,
"learning_rate": 0.0001148333304232411,
"loss": 1.088,
"step": 530
},
{
"epoch": 0.5120842306771956,
"grad_norm": 0.14736066147246124,
"learning_rate": 0.00011317699852867548,
"loss": 1.0506,
"step": 535
},
{
"epoch": 0.5168700646087581,
"grad_norm": 0.15088998664120562,
"learning_rate": 0.0001115169792792124,
"loss": 1.0972,
"step": 540
},
{
"epoch": 0.5216558985403207,
"grad_norm": 0.14676026138747209,
"learning_rate": 0.00010985373720249801,
"loss": 1.0871,
"step": 545
},
{
"epoch": 0.5264417324718832,
"grad_norm": 0.17054822297185676,
"learning_rate": 0.00010818773772803082,
"loss": 1.0957,
"step": 550
},
{
"epoch": 0.5312275664034458,
"grad_norm": 0.15081743477470166,
"learning_rate": 0.0001065194470569193,
"loss": 1.1114,
"step": 555
},
{
"epoch": 0.5360134003350083,
"grad_norm": 0.1556600989117304,
"learning_rate": 0.0001048493320314238,
"loss": 1.0747,
"step": 560
},
{
"epoch": 0.540799234266571,
"grad_norm": 0.15346464585086714,
"learning_rate": 0.00010317786000431851,
"loss": 1.0761,
"step": 565
},
{
"epoch": 0.5455850681981336,
"grad_norm": 0.15178562379014646,
"learning_rate": 0.00010150549870811107,
"loss": 1.0839,
"step": 570
},
{
"epoch": 0.5503709021296961,
"grad_norm": 0.15263581024104103,
"learning_rate": 9.983271612415575e-05,
"loss": 1.0742,
"step": 575
},
{
"epoch": 0.5551567360612587,
"grad_norm": 0.15166582071053056,
"learning_rate": 9.81599803516968e-05,
"loss": 1.0725,
"step": 580
},
{
"epoch": 0.5599425699928212,
"grad_norm": 0.14735687803417952,
"learning_rate": 9.648775947687912e-05,
"loss": 1.0705,
"step": 585
},
{
"epoch": 0.5647284039243838,
"grad_norm": 0.14825818203221888,
"learning_rate": 9.48165214417624e-05,
"loss": 1.0871,
"step": 590
},
{
"epoch": 0.5695142378559463,
"grad_norm": 0.15700946642781993,
"learning_rate": 9.314673391337576e-05,
"loss": 1.0979,
"step": 595
},
{
"epoch": 0.574300071787509,
"grad_norm": 0.15580031067347558,
"learning_rate": 9.147886415284903e-05,
"loss": 1.0592,
"step": 600
},
{
"epoch": 0.5790859057190716,
"grad_norm": 0.14548002556094225,
"learning_rate": 8.981337888465788e-05,
"loss": 1.0787,
"step": 605
},
{
"epoch": 0.5838717396506341,
"grad_norm": 0.14237124600928142,
"learning_rate": 8.815074416601913e-05,
"loss": 1.0698,
"step": 610
},
{
"epoch": 0.5886575735821967,
"grad_norm": 0.15304745525626437,
"learning_rate": 8.649142525647272e-05,
"loss": 1.0848,
"step": 615
},
{
"epoch": 0.5934434075137592,
"grad_norm": 0.14513336716190856,
"learning_rate": 8.48358864876867e-05,
"loss": 1.0462,
"step": 620
},
{
"epoch": 0.5982292414453219,
"grad_norm": 0.1468415945819683,
"learning_rate": 8.318459113352221e-05,
"loss": 1.0906,
"step": 625
},
{
"epoch": 0.6030150753768844,
"grad_norm": 0.14408143553897426,
"learning_rate": 8.153800128039441e-05,
"loss": 1.085,
"step": 630
},
{
"epoch": 0.607800909308447,
"grad_norm": 0.15046217184291616,
"learning_rate": 7.989657769796533e-05,
"loss": 1.0882,
"step": 635
},
{
"epoch": 0.6125867432400096,
"grad_norm": 0.14348283659906289,
"learning_rate": 7.82607797102056e-05,
"loss": 1.0861,
"step": 640
},
{
"epoch": 0.6173725771715721,
"grad_norm": 0.14685503152106738,
"learning_rate": 7.663106506686057e-05,
"loss": 1.1003,
"step": 645
},
{
"epoch": 0.6221584111031347,
"grad_norm": 0.1480277391784376,
"learning_rate": 7.500788981535708e-05,
"loss": 1.0758,
"step": 650
},
{
"epoch": 0.6269442450346973,
"grad_norm": 0.1477910922274185,
"learning_rate": 7.339170817318625e-05,
"loss": 1.0695,
"step": 655
},
{
"epoch": 0.6317300789662599,
"grad_norm": 0.1551465349289344,
"learning_rate": 7.178297240079882e-05,
"loss": 1.0942,
"step": 660
},
{
"epoch": 0.6365159128978225,
"grad_norm": 0.148811465121087,
"learning_rate": 7.018213267504775e-05,
"loss": 1.0825,
"step": 665
},
{
"epoch": 0.641301746829385,
"grad_norm": 0.146937156337137,
"learning_rate": 6.858963696321403e-05,
"loss": 1.0985,
"step": 670
},
{
"epoch": 0.6460875807609476,
"grad_norm": 0.14703161191479286,
"learning_rate": 6.700593089765086e-05,
"loss": 1.06,
"step": 675
},
{
"epoch": 0.6508734146925101,
"grad_norm": 0.14564360148371303,
"learning_rate": 6.543145765108106e-05,
"loss": 1.0853,
"step": 680
},
{
"epoch": 0.6556592486240728,
"grad_norm": 0.14887365645849163,
"learning_rate": 6.3866657812583e-05,
"loss": 1.0787,
"step": 685
},
{
"epoch": 0.6604450825556353,
"grad_norm": 0.14533659914404762,
"learning_rate": 6.231196926429913e-05,
"loss": 1.073,
"step": 690
},
{
"epoch": 0.6652309164871979,
"grad_norm": 0.2354314895944445,
"learning_rate": 6.076782705890257e-05,
"loss": 1.0815,
"step": 695
},
{
"epoch": 0.6700167504187605,
"grad_norm": 0.14132233475416703,
"learning_rate": 5.9234663297854876e-05,
"loss": 1.0555,
"step": 700
},
{
"epoch": 0.674802584350323,
"grad_norm": 0.14913316600220797,
"learning_rate": 5.7712907010490036e-05,
"loss": 1.0785,
"step": 705
},
{
"epoch": 0.6795884182818857,
"grad_norm": 0.15328072297180578,
"learning_rate": 5.620298403395805e-05,
"loss": 1.0857,
"step": 710
},
{
"epoch": 0.6843742522134482,
"grad_norm": 0.17603388258774993,
"learning_rate": 5.4705316894061765e-05,
"loss": 1.0898,
"step": 715
},
{
"epoch": 0.6891600861450108,
"grad_norm": 0.1448443355064005,
"learning_rate": 5.322032468702036e-05,
"loss": 1.0714,
"step": 720
},
{
"epoch": 0.6939459200765733,
"grad_norm": 0.4624474555190123,
"learning_rate": 5.1748422962192376e-05,
"loss": 1.0994,
"step": 725
},
{
"epoch": 0.6987317540081359,
"grad_norm": 0.14868980834848183,
"learning_rate": 5.0290023605791666e-05,
"loss": 1.0725,
"step": 730
},
{
"epoch": 0.7035175879396985,
"grad_norm": 0.15278504704361137,
"learning_rate": 4.8845534725628086e-05,
"loss": 1.0962,
"step": 735
},
{
"epoch": 0.708303421871261,
"grad_norm": 0.14605679246576617,
"learning_rate": 4.741536053690552e-05,
"loss": 1.0947,
"step": 740
},
{
"epoch": 0.7130892558028237,
"grad_norm": 0.172204603811799,
"learning_rate": 4.599990124910918e-05,
"loss": 1.0758,
"step": 745
},
{
"epoch": 0.7178750897343862,
"grad_norm": 0.14357849865669614,
"learning_rate": 4.4599552954014145e-05,
"loss": 1.0682,
"step": 750
},
{
"epoch": 0.7226609236659488,
"grad_norm": 0.14980923833672957,
"learning_rate": 4.32147075148458e-05,
"loss": 1.0814,
"step": 755
},
{
"epoch": 0.7274467575975113,
"grad_norm": 0.16395768222951593,
"learning_rate": 4.1845752456623665e-05,
"loss": 1.0583,
"step": 760
},
{
"epoch": 0.7322325915290739,
"grad_norm": 0.14059821304657993,
"learning_rate": 4.049307085771931e-05,
"loss": 1.0839,
"step": 765
},
{
"epoch": 0.7370184254606366,
"grad_norm": 0.1472110334031576,
"learning_rate": 3.9157041242658477e-05,
"loss": 1.1079,
"step": 770
},
{
"epoch": 0.7418042593921991,
"grad_norm": 0.14020342123522012,
"learning_rate": 3.783803747619741e-05,
"loss": 1.0829,
"step": 775
},
{
"epoch": 0.7465900933237617,
"grad_norm": 0.17437047699695307,
"learning_rate": 3.653642865870359e-05,
"loss": 1.0808,
"step": 780
},
{
"epoch": 0.7513759272553242,
"grad_norm": 0.14320013892049976,
"learning_rate": 3.525257902286908e-05,
"loss": 1.0608,
"step": 785
},
{
"epoch": 0.7561617611868868,
"grad_norm": 0.14437417000631428,
"learning_rate": 3.398684783178648e-05,
"loss": 1.0618,
"step": 790
},
{
"epoch": 0.7609475951184493,
"grad_norm": 0.14321363672597254,
"learning_rate": 3.273958927841525e-05,
"loss": 1.0659,
"step": 795
},
{
"epoch": 0.765733429050012,
"grad_norm": 0.14121990349576288,
"learning_rate": 3.1511152386467055e-05,
"loss": 1.0936,
"step": 800
},
{
"epoch": 0.7705192629815746,
"grad_norm": 0.16146069783583863,
"learning_rate": 3.0301880912737568e-05,
"loss": 1.0647,
"step": 805
},
{
"epoch": 0.7753050969131371,
"grad_norm": 0.1447026626027737,
"learning_rate": 2.9112113250911844e-05,
"loss": 1.0747,
"step": 810
},
{
"epoch": 0.7800909308446997,
"grad_norm": 0.14724228311552523,
"learning_rate": 2.7942182336870925e-05,
"loss": 1.1046,
"step": 815
},
{
"epoch": 0.7848767647762622,
"grad_norm": 0.14612792897080507,
"learning_rate": 2.6792415555525463e-05,
"loss": 1.0391,
"step": 820
},
{
"epoch": 0.7896625987078248,
"grad_norm": 0.14445016139434405,
"learning_rate": 2.5663134649202647e-05,
"loss": 1.0808,
"step": 825
},
{
"epoch": 0.7944484326393874,
"grad_norm": 0.14283033243615206,
"learning_rate": 2.4554655627612245e-05,
"loss": 1.0767,
"step": 830
},
{
"epoch": 0.79923426657095,
"grad_norm": 0.1428104588189023,
"learning_rate": 2.34672886794167e-05,
"loss": 1.0884,
"step": 835
},
{
"epoch": 0.8040201005025126,
"grad_norm": 0.14106416222944104,
"learning_rate": 2.2401338085430323e-05,
"loss": 1.0891,
"step": 840
},
{
"epoch": 0.8088059344340751,
"grad_norm": 0.14453431354715718,
"learning_rate": 2.135710213347134e-05,
"loss": 1.0829,
"step": 845
},
{
"epoch": 0.8135917683656377,
"grad_norm": 0.1436138017414945,
"learning_rate": 2.0334873034891554e-05,
"loss": 1.0823,
"step": 850
},
{
"epoch": 0.8183776022972002,
"grad_norm": 0.14415504753616376,
"learning_rate": 1.933493684280574e-05,
"loss": 1.0749,
"step": 855
},
{
"epoch": 0.8231634362287629,
"grad_norm": 0.14188286670890893,
"learning_rate": 1.8357573372044834e-05,
"loss": 1.0775,
"step": 860
},
{
"epoch": 0.8279492701603255,
"grad_norm": 0.14043422592547342,
"learning_rate": 1.740305612085439e-05,
"loss": 1.0852,
"step": 865
},
{
"epoch": 0.832735104091888,
"grad_norm": 0.14014109535516273,
"learning_rate": 1.647165219436113e-05,
"loss": 1.0716,
"step": 870
},
{
"epoch": 0.8375209380234506,
"grad_norm": 0.18266681120520475,
"learning_rate": 1.556362222982799e-05,
"loss": 1.0711,
"step": 875
},
{
"epoch": 0.8423067719550131,
"grad_norm": 0.14585487433506303,
"learning_rate": 1.4679220323719234e-05,
"loss": 1.0561,
"step": 880
},
{
"epoch": 0.8470926058865758,
"grad_norm": 0.13911103035630754,
"learning_rate": 1.3818693960596185e-05,
"loss": 1.0707,
"step": 885
},
{
"epoch": 0.8518784398181383,
"grad_norm": 0.15612123605821987,
"learning_rate": 1.2982283943862738e-05,
"loss": 1.0494,
"step": 890
},
{
"epoch": 0.8566642737497009,
"grad_norm": 0.14067555622023134,
"learning_rate": 1.217022432838093e-05,
"loss": 1.0686,
"step": 895
},
{
"epoch": 0.8614501076812635,
"grad_norm": 0.1457410414761679,
"learning_rate": 1.1382742354974429e-05,
"loss": 1.0562,
"step": 900
},
{
"epoch": 0.866235941612826,
"grad_norm": 0.1398250627278749,
"learning_rate": 1.0620058386839393e-05,
"loss": 1.0753,
"step": 905
},
{
"epoch": 0.8710217755443886,
"grad_norm": 0.14690238478434312,
"learning_rate": 9.882385847879539e-06,
"loss": 1.0539,
"step": 910
},
{
"epoch": 0.8758076094759512,
"grad_norm": 0.14224902345010998,
"learning_rate": 9.169931162983137e-06,
"loss": 1.0575,
"step": 915
},
{
"epoch": 0.8805934434075138,
"grad_norm": 0.14002967562121116,
"learning_rate": 8.482893700258643e-06,
"loss": 1.0831,
"step": 920
},
{
"epoch": 0.8853792773390763,
"grad_norm": 0.14652920530592364,
"learning_rate": 7.821465715244947e-06,
"loss": 1.0844,
"step": 925
},
{
"epoch": 0.8901651112706389,
"grad_norm": 0.13985808750925746,
"learning_rate": 7.185832297111938e-06,
"loss": 1.0618,
"step": 930
},
{
"epoch": 0.8949509452022015,
"grad_norm": 0.15160308510490375,
"learning_rate": 6.576171316866608e-06,
"loss": 1.0773,
"step": 935
},
{
"epoch": 0.899736779133764,
"grad_norm": 0.14784429409642344,
"learning_rate": 5.9926533775789055e-06,
"loss": 1.0951,
"step": 940
},
{
"epoch": 0.9045226130653267,
"grad_norm": 0.14167088318411009,
"learning_rate": 5.435441766641369e-06,
"loss": 1.0841,
"step": 945
},
{
"epoch": 0.9093084469968892,
"grad_norm": 0.14256818695069146,
"learning_rate": 4.904692410075973e-06,
"loss": 1.0647,
"step": 950
},
{
"epoch": 0.9140942809284518,
"grad_norm": 0.15531748633710526,
"learning_rate": 4.400553828900989e-06,
"loss": 1.0757,
"step": 955
},
{
"epoch": 0.9188801148600143,
"grad_norm": 0.14420681549864126,
"learning_rate": 3.923167097569935e-06,
"loss": 1.0903,
"step": 960
},
{
"epoch": 0.9236659487915769,
"grad_norm": 0.14398010788396462,
"learning_rate": 3.4726658044943126e-06,
"loss": 1.0668,
"step": 965
},
{
"epoch": 0.9284517827231396,
"grad_norm": 0.14589900176146645,
"learning_rate": 3.0491760146611926e-06,
"loss": 1.0845,
"step": 970
},
{
"epoch": 0.9332376166547021,
"grad_norm": 0.13882750982702796,
"learning_rate": 2.652816234356159e-06,
"loss": 1.0382,
"step": 975
},
{
"epoch": 0.9380234505862647,
"grad_norm": 0.14112035905216325,
"learning_rate": 2.283697378001315e-06,
"loss": 1.0825,
"step": 980
},
{
"epoch": 0.9428092845178272,
"grad_norm": 0.13934480624047157,
"learning_rate": 1.9419227371178627e-06,
"loss": 1.0679,
"step": 985
},
{
"epoch": 0.9475951184493898,
"grad_norm": 0.14117739445269173,
"learning_rate": 1.6275879514217052e-06,
"loss": 1.0772,
"step": 990
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.14031209854381504,
"learning_rate": 1.3407809820603856e-06,
"loss": 1.0767,
"step": 995
},
{
"epoch": 0.957166786312515,
"grad_norm": 0.14091355128035063,
"learning_rate": 1.0815820869985893e-06,
"loss": 1.0635,
"step": 1000
},
{
"epoch": 0.9619526202440776,
"grad_norm": 0.14100298765660577,
"learning_rate": 8.50063798559475e-07,
"loss": 1.0861,
"step": 1005
},
{
"epoch": 0.9667384541756401,
"grad_norm": 0.1412224316948679,
"learning_rate": 6.462909031276443e-07,
"loss": 1.0633,
"step": 1010
},
{
"epoch": 0.9715242881072027,
"grad_norm": 0.1385906964183353,
"learning_rate": 4.7032042301985434e-07,
"loss": 1.0726,
"step": 1015
},
{
"epoch": 0.9763101220387652,
"grad_norm": 0.151976410727331,
"learning_rate": 3.222016005282824e-07,
"loss": 1.0645,
"step": 1020
},
{
"epoch": 0.9810959559703278,
"grad_norm": 0.14304003914264313,
"learning_rate": 2.0197588414094804e-07,
"loss": 1.0785,
"step": 1025
},
{
"epoch": 0.9858817899018905,
"grad_norm": 0.1395525833943131,
"learning_rate": 1.0967691694302451e-07,
"loss": 1.0582,
"step": 1030
},
{
"epoch": 0.990667623833453,
"grad_norm": 0.14208441339069042,
"learning_rate": 4.5330527202480654e-08,
"loss": 1.0763,
"step": 1035
},
{
"epoch": 0.9954534577650156,
"grad_norm": 0.13931375530799342,
"learning_rate": 8.95472114241791e-09,
"loss": 1.0444,
"step": 1040
},
{
"epoch": 0.9992821249102656,
"eval_loss": 1.077100157737732,
"eval_runtime": 3923.6787,
"eval_samples_per_second": 3.43,
"eval_steps_per_second": 0.858,
"step": 1044
},
{
"epoch": 0.9992821249102656,
"step": 1044,
"total_flos": 2155604625850368.0,
"train_loss": 1.091635976486279,
"train_runtime": 24351.9745,
"train_samples_per_second": 2.746,
"train_steps_per_second": 0.043
}
],
"logging_steps": 5,
"max_steps": 1044,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2155604625850368.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}