LegrandFrederic's picture
Upload trainer_state.json with huggingface_hub
16e67c8 verified
raw
history blame
108 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 6190,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01615508885298869,
"grad_norm": 13.517727851867676,
"learning_rate": 2.903225806451613e-06,
"loss": 1.0539,
"step": 10
},
{
"epoch": 0.03231017770597738,
"grad_norm": 2.3650765419006348,
"learning_rate": 6.129032258064516e-06,
"loss": 0.5986,
"step": 20
},
{
"epoch": 0.048465266558966075,
"grad_norm": 3.7127814292907715,
"learning_rate": 9.35483870967742e-06,
"loss": 0.325,
"step": 30
},
{
"epoch": 0.06462035541195477,
"grad_norm": 2.470418691635132,
"learning_rate": 1.2580645161290322e-05,
"loss": 0.2792,
"step": 40
},
{
"epoch": 0.08077544426494346,
"grad_norm": 1.4894506931304932,
"learning_rate": 1.5806451612903226e-05,
"loss": 0.2738,
"step": 50
},
{
"epoch": 0.09693053311793215,
"grad_norm": 1.6425580978393555,
"learning_rate": 1.9032258064516127e-05,
"loss": 0.2348,
"step": 60
},
{
"epoch": 0.11308562197092084,
"grad_norm": 9.821566581726074,
"learning_rate": 2.2258064516129034e-05,
"loss": 0.2049,
"step": 70
},
{
"epoch": 0.12924071082390953,
"grad_norm": 2.8060154914855957,
"learning_rate": 2.5483870967741935e-05,
"loss": 0.2037,
"step": 80
},
{
"epoch": 0.14539579967689822,
"grad_norm": 1.5872341394424438,
"learning_rate": 2.8709677419354843e-05,
"loss": 0.1972,
"step": 90
},
{
"epoch": 0.16155088852988692,
"grad_norm": 5.6780219078063965,
"learning_rate": 3.193548387096774e-05,
"loss": 0.1868,
"step": 100
},
{
"epoch": 0.1777059773828756,
"grad_norm": 0.8519928455352783,
"learning_rate": 3.516129032258065e-05,
"loss": 0.1799,
"step": 110
},
{
"epoch": 0.1938610662358643,
"grad_norm": 0.983458399772644,
"learning_rate": 3.838709677419355e-05,
"loss": 0.1686,
"step": 120
},
{
"epoch": 0.210016155088853,
"grad_norm": 1.8987292051315308,
"learning_rate": 4.161290322580645e-05,
"loss": 0.1778,
"step": 130
},
{
"epoch": 0.22617124394184168,
"grad_norm": 1.6562193632125854,
"learning_rate": 4.4838709677419356e-05,
"loss": 0.1735,
"step": 140
},
{
"epoch": 0.24232633279483037,
"grad_norm": 1.770867109298706,
"learning_rate": 4.806451612903226e-05,
"loss": 0.171,
"step": 150
},
{
"epoch": 0.25848142164781907,
"grad_norm": 1.1404958963394165,
"learning_rate": 5.1290322580645164e-05,
"loss": 0.1575,
"step": 160
},
{
"epoch": 0.27463651050080773,
"grad_norm": 1.507441759109497,
"learning_rate": 5.451612903225807e-05,
"loss": 0.1526,
"step": 170
},
{
"epoch": 0.29079159935379645,
"grad_norm": 1.0781203508377075,
"learning_rate": 5.7741935483870965e-05,
"loss": 0.1508,
"step": 180
},
{
"epoch": 0.3069466882067851,
"grad_norm": 1.5736271142959595,
"learning_rate": 6.096774193548387e-05,
"loss": 0.1368,
"step": 190
},
{
"epoch": 0.32310177705977383,
"grad_norm": 1.4114209413528442,
"learning_rate": 6.419354838709679e-05,
"loss": 0.1326,
"step": 200
},
{
"epoch": 0.3392568659127625,
"grad_norm": 1.3289586305618286,
"learning_rate": 6.741935483870968e-05,
"loss": 0.1338,
"step": 210
},
{
"epoch": 0.3554119547657512,
"grad_norm": 0.8117440342903137,
"learning_rate": 7.064516129032258e-05,
"loss": 0.1326,
"step": 220
},
{
"epoch": 0.3715670436187399,
"grad_norm": 1.1739834547042847,
"learning_rate": 7.387096774193549e-05,
"loss": 0.1102,
"step": 230
},
{
"epoch": 0.3877221324717286,
"grad_norm": 1.4124845266342163,
"learning_rate": 7.709677419354839e-05,
"loss": 0.1204,
"step": 240
},
{
"epoch": 0.40387722132471726,
"grad_norm": 0.8694249987602234,
"learning_rate": 8.03225806451613e-05,
"loss": 0.1075,
"step": 250
},
{
"epoch": 0.420032310177706,
"grad_norm": 1.367783546447754,
"learning_rate": 8.35483870967742e-05,
"loss": 0.1086,
"step": 260
},
{
"epoch": 0.43618739903069464,
"grad_norm": 1.4387221336364746,
"learning_rate": 8.677419354838711e-05,
"loss": 0.1285,
"step": 270
},
{
"epoch": 0.45234248788368336,
"grad_norm": 0.9508649706840515,
"learning_rate": 9e-05,
"loss": 0.1077,
"step": 280
},
{
"epoch": 0.46849757673667203,
"grad_norm": 0.9184303283691406,
"learning_rate": 9.32258064516129e-05,
"loss": 0.1263,
"step": 290
},
{
"epoch": 0.48465266558966075,
"grad_norm": 1.1463005542755127,
"learning_rate": 9.645161290322581e-05,
"loss": 0.1027,
"step": 300
},
{
"epoch": 0.5008077544426495,
"grad_norm": 1.938699722290039,
"learning_rate": 9.967741935483872e-05,
"loss": 0.1061,
"step": 310
},
{
"epoch": 0.5169628432956381,
"grad_norm": 0.9912849068641663,
"learning_rate": 9.999942194483773e-05,
"loss": 0.1036,
"step": 320
},
{
"epoch": 0.5331179321486268,
"grad_norm": 1.1873068809509277,
"learning_rate": 9.999742374662181e-05,
"loss": 0.0954,
"step": 330
},
{
"epoch": 0.5492730210016155,
"grad_norm": 1.0425370931625366,
"learning_rate": 9.999399832589556e-05,
"loss": 0.0923,
"step": 340
},
{
"epoch": 0.5654281098546042,
"grad_norm": 1.1135231256484985,
"learning_rate": 9.998914578044079e-05,
"loss": 0.0958,
"step": 350
},
{
"epoch": 0.5815831987075929,
"grad_norm": 0.9654638767242432,
"learning_rate": 9.998286624877786e-05,
"loss": 0.1026,
"step": 360
},
{
"epoch": 0.5977382875605816,
"grad_norm": 1.106973648071289,
"learning_rate": 9.99751599101618e-05,
"loss": 0.0945,
"step": 370
},
{
"epoch": 0.6138933764135702,
"grad_norm": 1.0972684621810913,
"learning_rate": 9.996602698457715e-05,
"loss": 0.0857,
"step": 380
},
{
"epoch": 0.630048465266559,
"grad_norm": 0.9330363869667053,
"learning_rate": 9.995546773273166e-05,
"loss": 0.0908,
"step": 390
},
{
"epoch": 0.6462035541195477,
"grad_norm": 0.9228382706642151,
"learning_rate": 9.994348245604892e-05,
"loss": 0.0929,
"step": 400
},
{
"epoch": 0.6623586429725363,
"grad_norm": 1.4199814796447754,
"learning_rate": 9.993007149665967e-05,
"loss": 0.1023,
"step": 410
},
{
"epoch": 0.678513731825525,
"grad_norm": 1.0425035953521729,
"learning_rate": 9.991523523739211e-05,
"loss": 0.0924,
"step": 420
},
{
"epoch": 0.6946688206785138,
"grad_norm": 0.9444074034690857,
"learning_rate": 9.989897410176093e-05,
"loss": 0.0961,
"step": 430
},
{
"epoch": 0.7108239095315024,
"grad_norm": 0.8055535554885864,
"learning_rate": 9.988128855395523e-05,
"loss": 0.0891,
"step": 440
},
{
"epoch": 0.7269789983844911,
"grad_norm": 1.0856647491455078,
"learning_rate": 9.986217909882522e-05,
"loss": 0.0849,
"step": 450
},
{
"epoch": 0.7431340872374798,
"grad_norm": 0.8828626275062561,
"learning_rate": 9.984164628186796e-05,
"loss": 0.0893,
"step": 460
},
{
"epoch": 0.7592891760904685,
"grad_norm": 0.7011072039604187,
"learning_rate": 9.981969068921158e-05,
"loss": 0.0951,
"step": 470
},
{
"epoch": 0.7754442649434572,
"grad_norm": 0.6536422967910767,
"learning_rate": 9.979631294759871e-05,
"loss": 0.0805,
"step": 480
},
{
"epoch": 0.7915993537964459,
"grad_norm": 0.6991639733314514,
"learning_rate": 9.97715137243685e-05,
"loss": 0.0809,
"step": 490
},
{
"epoch": 0.8077544426494345,
"grad_norm": 0.9698547124862671,
"learning_rate": 9.974529372743761e-05,
"loss": 0.0875,
"step": 500
},
{
"epoch": 0.8239095315024233,
"grad_norm": 0.7342029809951782,
"learning_rate": 9.971765370528006e-05,
"loss": 0.0821,
"step": 510
},
{
"epoch": 0.840064620355412,
"grad_norm": 0.5005660057067871,
"learning_rate": 9.968859444690567e-05,
"loss": 0.0748,
"step": 520
},
{
"epoch": 0.8562197092084006,
"grad_norm": 0.5115198493003845,
"learning_rate": 9.965811678183777e-05,
"loss": 0.0804,
"step": 530
},
{
"epoch": 0.8723747980613893,
"grad_norm": 0.7139051556587219,
"learning_rate": 9.962622158008938e-05,
"loss": 0.0686,
"step": 540
},
{
"epoch": 0.8885298869143781,
"grad_norm": 0.5260514616966248,
"learning_rate": 9.959290975213841e-05,
"loss": 0.0831,
"step": 550
},
{
"epoch": 0.9046849757673667,
"grad_norm": 0.5752175450325012,
"learning_rate": 9.955818224890165e-05,
"loss": 0.0656,
"step": 560
},
{
"epoch": 0.9208400646203554,
"grad_norm": 0.6161171197891235,
"learning_rate": 9.952204006170771e-05,
"loss": 0.0697,
"step": 570
},
{
"epoch": 0.9369951534733441,
"grad_norm": 0.935058057308197,
"learning_rate": 9.948448422226856e-05,
"loss": 0.0774,
"step": 580
},
{
"epoch": 0.9531502423263328,
"grad_norm": 1.006998062133789,
"learning_rate": 9.944551580265026e-05,
"loss": 0.0788,
"step": 590
},
{
"epoch": 0.9693053311793215,
"grad_norm": 0.9937463998794556,
"learning_rate": 9.940513591524222e-05,
"loss": 0.075,
"step": 600
},
{
"epoch": 0.9854604200323102,
"grad_norm": 0.840084433555603,
"learning_rate": 9.936334571272554e-05,
"loss": 0.0805,
"step": 610
},
{
"epoch": 1.001615508885299,
"grad_norm": 0.9836556315422058,
"learning_rate": 9.932014638804001e-05,
"loss": 0.0753,
"step": 620
},
{
"epoch": 1.0177705977382876,
"grad_norm": 0.7406233549118042,
"learning_rate": 9.927553917435017e-05,
"loss": 0.0695,
"step": 630
},
{
"epoch": 1.0339256865912763,
"grad_norm": 0.8061002492904663,
"learning_rate": 9.922952534501002e-05,
"loss": 0.0682,
"step": 640
},
{
"epoch": 1.050080775444265,
"grad_norm": 0.6358613967895508,
"learning_rate": 9.918210621352668e-05,
"loss": 0.077,
"step": 650
},
{
"epoch": 1.0662358642972536,
"grad_norm": 0.6549187898635864,
"learning_rate": 9.913328313352292e-05,
"loss": 0.0739,
"step": 660
},
{
"epoch": 1.0823909531502423,
"grad_norm": 0.8390158414840698,
"learning_rate": 9.908305749869858e-05,
"loss": 0.0883,
"step": 670
},
{
"epoch": 1.098546042003231,
"grad_norm": 0.942304253578186,
"learning_rate": 9.90314307427906e-05,
"loss": 0.0788,
"step": 680
},
{
"epoch": 1.1147011308562198,
"grad_norm": 1.1538914442062378,
"learning_rate": 9.897840433953234e-05,
"loss": 0.0766,
"step": 690
},
{
"epoch": 1.1308562197092085,
"grad_norm": 0.6114380359649658,
"learning_rate": 9.892397980261128e-05,
"loss": 0.0754,
"step": 700
},
{
"epoch": 1.1470113085621971,
"grad_norm": 0.9622769355773926,
"learning_rate": 9.886815868562596e-05,
"loss": 0.0824,
"step": 710
},
{
"epoch": 1.1631663974151858,
"grad_norm": 0.6100155115127563,
"learning_rate": 9.88109425820416e-05,
"loss": 0.067,
"step": 720
},
{
"epoch": 1.1793214862681745,
"grad_norm": 0.5996105670928955,
"learning_rate": 9.875233312514454e-05,
"loss": 0.0663,
"step": 730
},
{
"epoch": 1.1954765751211631,
"grad_norm": 0.5155414342880249,
"learning_rate": 9.869233198799572e-05,
"loss": 0.0629,
"step": 740
},
{
"epoch": 1.2116316639741518,
"grad_norm": 0.6942029595375061,
"learning_rate": 9.863094088338288e-05,
"loss": 0.0764,
"step": 750
},
{
"epoch": 1.2277867528271407,
"grad_norm": 0.5376043319702148,
"learning_rate": 9.856816156377163e-05,
"loss": 0.0675,
"step": 760
},
{
"epoch": 1.2439418416801293,
"grad_norm": 0.6686906814575195,
"learning_rate": 9.850399582125548e-05,
"loss": 0.0767,
"step": 770
},
{
"epoch": 1.260096930533118,
"grad_norm": 0.768054723739624,
"learning_rate": 9.843844548750464e-05,
"loss": 0.0716,
"step": 780
},
{
"epoch": 1.2762520193861067,
"grad_norm": 0.6787708401679993,
"learning_rate": 9.837151243371376e-05,
"loss": 0.0672,
"step": 790
},
{
"epoch": 1.2924071082390953,
"grad_norm": 0.500952959060669,
"learning_rate": 9.830319857054852e-05,
"loss": 0.0702,
"step": 800
},
{
"epoch": 1.308562197092084,
"grad_norm": 0.6068538427352905,
"learning_rate": 9.823350584809105e-05,
"loss": 0.0738,
"step": 810
},
{
"epoch": 1.3247172859450727,
"grad_norm": 0.6218283176422119,
"learning_rate": 9.816243625578432e-05,
"loss": 0.0756,
"step": 820
},
{
"epoch": 1.3408723747980613,
"grad_norm": 0.7377462983131409,
"learning_rate": 9.808999182237528e-05,
"loss": 0.0692,
"step": 830
},
{
"epoch": 1.35702746365105,
"grad_norm": 0.5580537915229797,
"learning_rate": 9.8016174615857e-05,
"loss": 0.0633,
"step": 840
},
{
"epoch": 1.3731825525040389,
"grad_norm": 0.615639328956604,
"learning_rate": 9.794098674340965e-05,
"loss": 0.0718,
"step": 850
},
{
"epoch": 1.3893376413570275,
"grad_norm": 0.7121309041976929,
"learning_rate": 9.78644303513403e-05,
"loss": 0.0633,
"step": 860
},
{
"epoch": 1.4054927302100162,
"grad_norm": 0.5688542127609253,
"learning_rate": 9.778650762502166e-05,
"loss": 0.0678,
"step": 870
},
{
"epoch": 1.4216478190630049,
"grad_norm": 0.5155729651451111,
"learning_rate": 9.770722078882973e-05,
"loss": 0.0665,
"step": 880
},
{
"epoch": 1.4378029079159935,
"grad_norm": 0.48947158455848694,
"learning_rate": 9.762657210608029e-05,
"loss": 0.0657,
"step": 890
},
{
"epoch": 1.4539579967689822,
"grad_norm": 0.7648037075996399,
"learning_rate": 9.754456387896422e-05,
"loss": 0.0707,
"step": 900
},
{
"epoch": 1.4701130856219708,
"grad_norm": 0.8872023224830627,
"learning_rate": 9.746119844848195e-05,
"loss": 0.062,
"step": 910
},
{
"epoch": 1.4862681744749597,
"grad_norm": 1.083450436592102,
"learning_rate": 9.737647819437645e-05,
"loss": 0.0728,
"step": 920
},
{
"epoch": 1.5024232633279482,
"grad_norm": 0.6818684339523315,
"learning_rate": 9.729040553506539e-05,
"loss": 0.0637,
"step": 930
},
{
"epoch": 1.518578352180937,
"grad_norm": 0.7897723913192749,
"learning_rate": 9.720298292757215e-05,
"loss": 0.0682,
"step": 940
},
{
"epoch": 1.5347334410339257,
"grad_norm": 0.46110132336616516,
"learning_rate": 9.711421286745555e-05,
"loss": 0.0726,
"step": 950
},
{
"epoch": 1.5508885298869144,
"grad_norm": 0.4637523293495178,
"learning_rate": 9.70240978887387e-05,
"loss": 0.0622,
"step": 960
},
{
"epoch": 1.567043618739903,
"grad_norm": 0.7092505693435669,
"learning_rate": 9.69326405638367e-05,
"loss": 0.0592,
"step": 970
},
{
"epoch": 1.5831987075928917,
"grad_norm": 0.5965023040771484,
"learning_rate": 9.683984350348312e-05,
"loss": 0.0697,
"step": 980
},
{
"epoch": 1.5993537964458806,
"grad_norm": 0.5201593041419983,
"learning_rate": 9.67457093566555e-05,
"loss": 0.0706,
"step": 990
},
{
"epoch": 1.615508885298869,
"grad_norm": 0.6693015098571777,
"learning_rate": 9.665024081049977e-05,
"loss": 0.0653,
"step": 1000
},
{
"epoch": 1.631663974151858,
"grad_norm": 0.7377516627311707,
"learning_rate": 9.655344059025351e-05,
"loss": 0.061,
"step": 1010
},
{
"epoch": 1.6478190630048464,
"grad_norm": 0.6737310290336609,
"learning_rate": 9.645531145916817e-05,
"loss": 0.0552,
"step": 1020
},
{
"epoch": 1.6639741518578353,
"grad_norm": 0.6933907866477966,
"learning_rate": 9.635585621843018e-05,
"loss": 0.0671,
"step": 1030
},
{
"epoch": 1.680129240710824,
"grad_norm": 0.6938374638557434,
"learning_rate": 9.625507770708097e-05,
"loss": 0.068,
"step": 1040
},
{
"epoch": 1.6962843295638126,
"grad_norm": 0.5320965051651001,
"learning_rate": 9.615297880193598e-05,
"loss": 0.0632,
"step": 1050
},
{
"epoch": 1.7124394184168013,
"grad_norm": 0.6312500238418579,
"learning_rate": 9.60495624175025e-05,
"loss": 0.0706,
"step": 1060
},
{
"epoch": 1.72859450726979,
"grad_norm": 0.5120170712471008,
"learning_rate": 9.594483150589646e-05,
"loss": 0.0706,
"step": 1070
},
{
"epoch": 1.7447495961227788,
"grad_norm": 0.6575292348861694,
"learning_rate": 9.58387890567582e-05,
"loss": 0.066,
"step": 1080
},
{
"epoch": 1.7609046849757672,
"grad_norm": 0.8916189670562744,
"learning_rate": 9.573143809716711e-05,
"loss": 0.0572,
"step": 1090
},
{
"epoch": 1.7770597738287561,
"grad_norm": 0.7182980179786682,
"learning_rate": 9.562278169155518e-05,
"loss": 0.061,
"step": 1100
},
{
"epoch": 1.7932148626817448,
"grad_norm": 0.5273639559745789,
"learning_rate": 9.551282294161962e-05,
"loss": 0.0564,
"step": 1110
},
{
"epoch": 1.8093699515347335,
"grad_norm": 0.5014919638633728,
"learning_rate": 9.540156498623418e-05,
"loss": 0.0674,
"step": 1120
},
{
"epoch": 1.8255250403877221,
"grad_norm": 0.49997884035110474,
"learning_rate": 9.528901100135971e-05,
"loss": 0.0719,
"step": 1130
},
{
"epoch": 1.8416801292407108,
"grad_norm": 0.5391654968261719,
"learning_rate": 9.517516419995335e-05,
"loss": 0.0634,
"step": 1140
},
{
"epoch": 1.8578352180936997,
"grad_norm": 0.5763863921165466,
"learning_rate": 9.506002783187691e-05,
"loss": 0.0622,
"step": 1150
},
{
"epoch": 1.8739903069466881,
"grad_norm": 0.5951936841011047,
"learning_rate": 9.494360518380405e-05,
"loss": 0.066,
"step": 1160
},
{
"epoch": 1.890145395799677,
"grad_norm": 0.7027397751808167,
"learning_rate": 9.482589957912651e-05,
"loss": 0.0623,
"step": 1170
},
{
"epoch": 1.9063004846526654,
"grad_norm": 0.6542057991027832,
"learning_rate": 9.470691437785918e-05,
"loss": 0.0635,
"step": 1180
},
{
"epoch": 1.9224555735056543,
"grad_norm": 0.39720842242240906,
"learning_rate": 9.45866529765442e-05,
"loss": 0.064,
"step": 1190
},
{
"epoch": 1.938610662358643,
"grad_norm": 0.47394442558288574,
"learning_rate": 9.446511880815407e-05,
"loss": 0.0595,
"step": 1200
},
{
"epoch": 1.9547657512116317,
"grad_norm": 0.3531631529331207,
"learning_rate": 9.434231534199356e-05,
"loss": 0.0583,
"step": 1210
},
{
"epoch": 1.9709208400646203,
"grad_norm": 0.6005557775497437,
"learning_rate": 9.421824608360068e-05,
"loss": 0.0599,
"step": 1220
},
{
"epoch": 1.987075928917609,
"grad_norm": 0.5101392269134521,
"learning_rate": 9.409291457464672e-05,
"loss": 0.0617,
"step": 1230
},
{
"epoch": 2.003231017770598,
"grad_norm": 0.42682531476020813,
"learning_rate": 9.396632439283501e-05,
"loss": 0.0554,
"step": 1240
},
{
"epoch": 2.0193861066235863,
"grad_norm": 0.6450132727622986,
"learning_rate": 9.383847915179892e-05,
"loss": 0.0677,
"step": 1250
},
{
"epoch": 2.035541195476575,
"grad_norm": 0.399069607257843,
"learning_rate": 9.370938250099857e-05,
"loss": 0.0618,
"step": 1260
},
{
"epoch": 2.0516962843295636,
"grad_norm": 0.4468577802181244,
"learning_rate": 9.357903812561679e-05,
"loss": 0.0685,
"step": 1270
},
{
"epoch": 2.0678513731825525,
"grad_norm": 0.559262752532959,
"learning_rate": 9.344744974645381e-05,
"loss": 0.0637,
"step": 1280
},
{
"epoch": 2.0840064620355414,
"grad_norm": 0.5825755596160889,
"learning_rate": 9.33146211198211e-05,
"loss": 0.0625,
"step": 1290
},
{
"epoch": 2.10016155088853,
"grad_norm": 1.090774655342102,
"learning_rate": 9.318055603743418e-05,
"loss": 0.0691,
"step": 1300
},
{
"epoch": 2.1163166397415187,
"grad_norm": 0.3790472149848938,
"learning_rate": 9.304525832630426e-05,
"loss": 0.0572,
"step": 1310
},
{
"epoch": 2.132471728594507,
"grad_norm": 0.46555569767951965,
"learning_rate": 9.290873184862917e-05,
"loss": 0.0611,
"step": 1320
},
{
"epoch": 2.148626817447496,
"grad_norm": 0.5333315134048462,
"learning_rate": 9.277098050168293e-05,
"loss": 0.0554,
"step": 1330
},
{
"epoch": 2.1647819063004845,
"grad_norm": 0.5820637345314026,
"learning_rate": 9.263200821770461e-05,
"loss": 0.0593,
"step": 1340
},
{
"epoch": 2.1809369951534734,
"grad_norm": 0.5108340978622437,
"learning_rate": 9.249181896378607e-05,
"loss": 0.0561,
"step": 1350
},
{
"epoch": 2.197092084006462,
"grad_norm": 0.44887450337409973,
"learning_rate": 9.235041674175868e-05,
"loss": 0.0608,
"step": 1360
},
{
"epoch": 2.2132471728594507,
"grad_norm": 0.462615042924881,
"learning_rate": 9.22078055880791e-05,
"loss": 0.0495,
"step": 1370
},
{
"epoch": 2.2294022617124396,
"grad_norm": 0.48509976267814636,
"learning_rate": 9.206398957371406e-05,
"loss": 0.0589,
"step": 1380
},
{
"epoch": 2.245557350565428,
"grad_norm": 0.48090824484825134,
"learning_rate": 9.191897280402415e-05,
"loss": 0.0521,
"step": 1390
},
{
"epoch": 2.261712439418417,
"grad_norm": 0.5474804043769836,
"learning_rate": 9.177275941864662e-05,
"loss": 0.0591,
"step": 1400
},
{
"epoch": 2.2778675282714054,
"grad_norm": 0.6736873984336853,
"learning_rate": 9.162535359137725e-05,
"loss": 0.0532,
"step": 1410
},
{
"epoch": 2.2940226171243943,
"grad_norm": 0.4108855426311493,
"learning_rate": 9.147675953005112e-05,
"loss": 0.0608,
"step": 1420
},
{
"epoch": 2.3101777059773827,
"grad_norm": 0.6929683685302734,
"learning_rate": 9.132698147642258e-05,
"loss": 0.0572,
"step": 1430
},
{
"epoch": 2.3263327948303716,
"grad_norm": 0.662588357925415,
"learning_rate": 9.117602370604412e-05,
"loss": 0.0606,
"step": 1440
},
{
"epoch": 2.3424878836833605,
"grad_norm": 0.6598329544067383,
"learning_rate": 9.102389052814435e-05,
"loss": 0.0617,
"step": 1450
},
{
"epoch": 2.358642972536349,
"grad_norm": 0.6328267455101013,
"learning_rate": 9.087058628550492e-05,
"loss": 0.0635,
"step": 1460
},
{
"epoch": 2.374798061389338,
"grad_norm": 0.7304327487945557,
"learning_rate": 9.071611535433665e-05,
"loss": 0.0636,
"step": 1470
},
{
"epoch": 2.3909531502423262,
"grad_norm": 0.7994436621665955,
"learning_rate": 9.056048214415456e-05,
"loss": 0.0682,
"step": 1480
},
{
"epoch": 2.407108239095315,
"grad_norm": 0.5563200116157532,
"learning_rate": 9.040369109765196e-05,
"loss": 0.0602,
"step": 1490
},
{
"epoch": 2.4232633279483036,
"grad_norm": 0.862169623374939,
"learning_rate": 9.024574669057368e-05,
"loss": 0.0694,
"step": 1500
},
{
"epoch": 2.4394184168012925,
"grad_norm": 0.5530250668525696,
"learning_rate": 9.00866534315883e-05,
"loss": 0.0621,
"step": 1510
},
{
"epoch": 2.4555735056542813,
"grad_norm": 0.5109930634498596,
"learning_rate": 8.992641586215944e-05,
"loss": 0.0568,
"step": 1520
},
{
"epoch": 2.47172859450727,
"grad_norm": 0.772769570350647,
"learning_rate": 8.97650385564161e-05,
"loss": 0.0634,
"step": 1530
},
{
"epoch": 2.4878836833602587,
"grad_norm": 0.7233314514160156,
"learning_rate": 8.960252612102209e-05,
"loss": 0.0682,
"step": 1540
},
{
"epoch": 2.504038772213247,
"grad_norm": 0.9270318746566772,
"learning_rate": 8.943888319504457e-05,
"loss": 0.0616,
"step": 1550
},
{
"epoch": 2.520193861066236,
"grad_norm": 1.1452592611312866,
"learning_rate": 8.927411444982157e-05,
"loss": 0.0536,
"step": 1560
},
{
"epoch": 2.5363489499192244,
"grad_norm": 0.8335738778114319,
"learning_rate": 8.91082245888287e-05,
"loss": 0.0588,
"step": 1570
},
{
"epoch": 2.5525040387722133,
"grad_norm": 0.5370670557022095,
"learning_rate": 8.894121834754481e-05,
"loss": 0.0593,
"step": 1580
},
{
"epoch": 2.568659127625202,
"grad_norm": 0.5532761216163635,
"learning_rate": 8.877310049331691e-05,
"loss": 0.0601,
"step": 1590
},
{
"epoch": 2.5848142164781907,
"grad_norm": 0.6287941932678223,
"learning_rate": 8.860387582522397e-05,
"loss": 0.0627,
"step": 1600
},
{
"epoch": 2.600969305331179,
"grad_norm": 0.6329537034034729,
"learning_rate": 8.843354917394e-05,
"loss": 0.0572,
"step": 1610
},
{
"epoch": 2.617124394184168,
"grad_norm": 0.4902884364128113,
"learning_rate": 8.826212540159615e-05,
"loss": 0.0528,
"step": 1620
},
{
"epoch": 2.633279483037157,
"grad_norm": 0.4139235019683838,
"learning_rate": 8.808960940164188e-05,
"loss": 0.0591,
"step": 1630
},
{
"epoch": 2.6494345718901453,
"grad_norm": 0.481642484664917,
"learning_rate": 8.79160060987053e-05,
"loss": 0.063,
"step": 1640
},
{
"epoch": 2.665589660743134,
"grad_norm": 0.575612485408783,
"learning_rate": 8.77413204484526e-05,
"loss": 0.0682,
"step": 1650
},
{
"epoch": 2.6817447495961226,
"grad_norm": 0.7415863871574402,
"learning_rate": 8.756555743744655e-05,
"loss": 0.0488,
"step": 1660
},
{
"epoch": 2.6978998384491115,
"grad_norm": 0.614101767539978,
"learning_rate": 8.738872208300417e-05,
"loss": 0.0627,
"step": 1670
},
{
"epoch": 2.7140549273021,
"grad_norm": 0.5911862850189209,
"learning_rate": 8.721081943305356e-05,
"loss": 0.0622,
"step": 1680
},
{
"epoch": 2.730210016155089,
"grad_norm": 0.5863639116287231,
"learning_rate": 8.703185456598968e-05,
"loss": 0.0598,
"step": 1690
},
{
"epoch": 2.7463651050080777,
"grad_norm": 0.6773284077644348,
"learning_rate": 8.685183259052952e-05,
"loss": 0.0591,
"step": 1700
},
{
"epoch": 2.762520193861066,
"grad_norm": 0.48102864623069763,
"learning_rate": 8.667075864556615e-05,
"loss": 0.0554,
"step": 1710
},
{
"epoch": 2.778675282714055,
"grad_norm": 0.6997978687286377,
"learning_rate": 8.648863790002213e-05,
"loss": 0.0605,
"step": 1720
},
{
"epoch": 2.7948303715670435,
"grad_norm": 0.6587175130844116,
"learning_rate": 8.630547555270188e-05,
"loss": 0.064,
"step": 1730
},
{
"epoch": 2.8109854604200324,
"grad_norm": 0.8421849608421326,
"learning_rate": 8.612127683214329e-05,
"loss": 0.0523,
"step": 1740
},
{
"epoch": 2.827140549273021,
"grad_norm": 0.3728615939617157,
"learning_rate": 8.59360469964685e-05,
"loss": 0.057,
"step": 1750
},
{
"epoch": 2.8432956381260097,
"grad_norm": 0.6552137732505798,
"learning_rate": 8.574979133323377e-05,
"loss": 0.0605,
"step": 1760
},
{
"epoch": 2.8594507269789986,
"grad_norm": 0.7351179718971252,
"learning_rate": 8.556251515927855e-05,
"loss": 0.0566,
"step": 1770
},
{
"epoch": 2.875605815831987,
"grad_norm": 0.5557317733764648,
"learning_rate": 8.537422382057374e-05,
"loss": 0.0531,
"step": 1780
},
{
"epoch": 2.891760904684976,
"grad_norm": 0.5497432351112366,
"learning_rate": 8.518492269206899e-05,
"loss": 0.0588,
"step": 1790
},
{
"epoch": 2.9079159935379644,
"grad_norm": 0.6453426480293274,
"learning_rate": 8.499461717753939e-05,
"loss": 0.0589,
"step": 1800
},
{
"epoch": 2.9240710823909533,
"grad_norm": 0.5362476706504822,
"learning_rate": 8.480331270943111e-05,
"loss": 0.0626,
"step": 1810
},
{
"epoch": 2.9402261712439417,
"grad_norm": 0.42626962065696716,
"learning_rate": 8.461101474870641e-05,
"loss": 0.0495,
"step": 1820
},
{
"epoch": 2.9563812600969306,
"grad_norm": 0.5444236397743225,
"learning_rate": 8.44177287846877e-05,
"loss": 0.0558,
"step": 1830
},
{
"epoch": 2.9725363489499195,
"grad_norm": 0.5531013607978821,
"learning_rate": 8.422346033490082e-05,
"loss": 0.0497,
"step": 1840
},
{
"epoch": 2.988691437802908,
"grad_norm": 0.5683811902999878,
"learning_rate": 8.402821494491762e-05,
"loss": 0.0528,
"step": 1850
},
{
"epoch": 3.004846526655897,
"grad_norm": 0.5049775838851929,
"learning_rate": 8.383199818819758e-05,
"loss": 0.0616,
"step": 1860
},
{
"epoch": 3.0210016155088852,
"grad_norm": 0.38788193464279175,
"learning_rate": 8.363481566592874e-05,
"loss": 0.0549,
"step": 1870
},
{
"epoch": 3.037156704361874,
"grad_norm": 0.543121337890625,
"learning_rate": 8.34366730068678e-05,
"loss": 0.0561,
"step": 1880
},
{
"epoch": 3.0533117932148626,
"grad_norm": 0.48212480545043945,
"learning_rate": 8.323757586717947e-05,
"loss": 0.0473,
"step": 1890
},
{
"epoch": 3.0694668820678515,
"grad_norm": 0.7454380393028259,
"learning_rate": 8.303752993027498e-05,
"loss": 0.0564,
"step": 1900
},
{
"epoch": 3.08562197092084,
"grad_norm": 0.5166053175926208,
"learning_rate": 8.283654090664985e-05,
"loss": 0.0571,
"step": 1910
},
{
"epoch": 3.101777059773829,
"grad_norm": 0.5176417231559753,
"learning_rate": 8.263461453372086e-05,
"loss": 0.0593,
"step": 1920
},
{
"epoch": 3.1179321486268172,
"grad_norm": 0.6009415984153748,
"learning_rate": 8.243175657566233e-05,
"loss": 0.0518,
"step": 1930
},
{
"epoch": 3.134087237479806,
"grad_norm": 0.4920412302017212,
"learning_rate": 8.222797282324152e-05,
"loss": 0.0517,
"step": 1940
},
{
"epoch": 3.150242326332795,
"grad_norm": 0.5730708241462708,
"learning_rate": 8.20232690936533e-05,
"loss": 0.055,
"step": 1950
},
{
"epoch": 3.1663974151857834,
"grad_norm": 0.5689309239387512,
"learning_rate": 8.18176512303542e-05,
"loss": 0.0462,
"step": 1960
},
{
"epoch": 3.1825525040387723,
"grad_norm": 0.3386596143245697,
"learning_rate": 8.161112510289549e-05,
"loss": 0.0593,
"step": 1970
},
{
"epoch": 3.1987075928917608,
"grad_norm": 0.5641984939575195,
"learning_rate": 8.140369660675571e-05,
"loss": 0.0561,
"step": 1980
},
{
"epoch": 3.2148626817447497,
"grad_norm": 0.3364955186843872,
"learning_rate": 8.119537166317232e-05,
"loss": 0.0488,
"step": 1990
},
{
"epoch": 3.231017770597738,
"grad_norm": 0.5797820687294006,
"learning_rate": 8.098615621897272e-05,
"loss": 0.0471,
"step": 2000
},
{
"epoch": 3.247172859450727,
"grad_norm": 0.6893600225448608,
"learning_rate": 8.077605624640448e-05,
"loss": 0.0489,
"step": 2010
},
{
"epoch": 3.263327948303716,
"grad_norm": 0.6242002844810486,
"learning_rate": 8.056507774296477e-05,
"loss": 0.0502,
"step": 2020
},
{
"epoch": 3.2794830371567043,
"grad_norm": 0.29608842730522156,
"learning_rate": 8.035322673122934e-05,
"loss": 0.0574,
"step": 2030
},
{
"epoch": 3.295638126009693,
"grad_norm": 0.39050793647766113,
"learning_rate": 8.014050925868042e-05,
"loss": 0.0553,
"step": 2040
},
{
"epoch": 3.3117932148626816,
"grad_norm": 0.7243764400482178,
"learning_rate": 7.99269313975342e-05,
"loss": 0.0496,
"step": 2050
},
{
"epoch": 3.3279483037156705,
"grad_norm": 0.6739727258682251,
"learning_rate": 7.971249924456742e-05,
"loss": 0.0486,
"step": 2060
},
{
"epoch": 3.344103392568659,
"grad_norm": 0.4816618263721466,
"learning_rate": 7.94972189209434e-05,
"loss": 0.0455,
"step": 2070
},
{
"epoch": 3.360258481421648,
"grad_norm": 0.5240322351455688,
"learning_rate": 7.928109657203725e-05,
"loss": 0.0573,
"step": 2080
},
{
"epoch": 3.3764135702746367,
"grad_norm": 0.3253321051597595,
"learning_rate": 7.906413836726048e-05,
"loss": 0.0467,
"step": 2090
},
{
"epoch": 3.392568659127625,
"grad_norm": 0.5213293433189392,
"learning_rate": 7.884635049988488e-05,
"loss": 0.0488,
"step": 2100
},
{
"epoch": 3.408723747980614,
"grad_norm": 0.4129197895526886,
"learning_rate": 7.86277391868657e-05,
"loss": 0.0483,
"step": 2110
},
{
"epoch": 3.4248788368336025,
"grad_norm": 0.5131278038024902,
"learning_rate": 7.840831066866423e-05,
"loss": 0.0429,
"step": 2120
},
{
"epoch": 3.4410339256865914,
"grad_norm": 0.529063880443573,
"learning_rate": 7.818807120906964e-05,
"loss": 0.0536,
"step": 2130
},
{
"epoch": 3.45718901453958,
"grad_norm": 0.6816816926002502,
"learning_rate": 7.796702709502012e-05,
"loss": 0.0514,
"step": 2140
},
{
"epoch": 3.4733441033925687,
"grad_norm": 0.3989129066467285,
"learning_rate": 7.774518463642351e-05,
"loss": 0.0613,
"step": 2150
},
{
"epoch": 3.489499192245557,
"grad_norm": 0.4334792494773865,
"learning_rate": 7.75225501659771e-05,
"loss": 0.0483,
"step": 2160
},
{
"epoch": 3.505654281098546,
"grad_norm": 0.46373841166496277,
"learning_rate": 7.729913003898694e-05,
"loss": 0.0443,
"step": 2170
},
{
"epoch": 3.5218093699515345,
"grad_norm": 0.3799467980861664,
"learning_rate": 7.707493063318629e-05,
"loss": 0.0511,
"step": 2180
},
{
"epoch": 3.5379644588045234,
"grad_norm": 0.4075853228569031,
"learning_rate": 7.684995834855372e-05,
"loss": 0.0478,
"step": 2190
},
{
"epoch": 3.5541195476575123,
"grad_norm": 0.39337170124053955,
"learning_rate": 7.662421960713028e-05,
"loss": 0.0484,
"step": 2200
},
{
"epoch": 3.5702746365105007,
"grad_norm": 0.30496665835380554,
"learning_rate": 7.639772085283628e-05,
"loss": 0.0446,
"step": 2210
},
{
"epoch": 3.5864297253634896,
"grad_norm": 0.36177757382392883,
"learning_rate": 7.617046855128724e-05,
"loss": 0.0469,
"step": 2220
},
{
"epoch": 3.602584814216478,
"grad_norm": 0.39714500308036804,
"learning_rate": 7.594246918960946e-05,
"loss": 0.0433,
"step": 2230
},
{
"epoch": 3.618739903069467,
"grad_norm": 0.40002134442329407,
"learning_rate": 7.571372927625469e-05,
"loss": 0.0518,
"step": 2240
},
{
"epoch": 3.6348949919224554,
"grad_norm": 0.6046271324157715,
"learning_rate": 7.548425534081442e-05,
"loss": 0.052,
"step": 2250
},
{
"epoch": 3.6510500807754442,
"grad_norm": 0.43297943472862244,
"learning_rate": 7.525405393383351e-05,
"loss": 0.0462,
"step": 2260
},
{
"epoch": 3.667205169628433,
"grad_norm": 0.4702610671520233,
"learning_rate": 7.502313162662315e-05,
"loss": 0.0543,
"step": 2270
},
{
"epoch": 3.6833602584814216,
"grad_norm": 0.3743409216403961,
"learning_rate": 7.479149501107328e-05,
"loss": 0.0472,
"step": 2280
},
{
"epoch": 3.6995153473344105,
"grad_norm": 0.3397691249847412,
"learning_rate": 7.455915069946444e-05,
"loss": 0.045,
"step": 2290
},
{
"epoch": 3.715670436187399,
"grad_norm": 0.39391201734542847,
"learning_rate": 7.4326105324279e-05,
"loss": 0.0407,
"step": 2300
},
{
"epoch": 3.731825525040388,
"grad_norm": 0.5775906443595886,
"learning_rate": 7.409236553801183e-05,
"loss": 0.0511,
"step": 2310
},
{
"epoch": 3.7479806138933762,
"grad_norm": 0.5497547388076782,
"learning_rate": 7.385793801298042e-05,
"loss": 0.0426,
"step": 2320
},
{
"epoch": 3.764135702746365,
"grad_norm": 0.4124547243118286,
"learning_rate": 7.36228294411344e-05,
"loss": 0.05,
"step": 2330
},
{
"epoch": 3.780290791599354,
"grad_norm": 0.4284408390522003,
"learning_rate": 7.338704653386448e-05,
"loss": 0.0498,
"step": 2340
},
{
"epoch": 3.7964458804523424,
"grad_norm": 0.47924646735191345,
"learning_rate": 7.315059602181092e-05,
"loss": 0.0491,
"step": 2350
},
{
"epoch": 3.8126009693053313,
"grad_norm": 0.34164971113204956,
"learning_rate": 7.291348465467136e-05,
"loss": 0.0503,
"step": 2360
},
{
"epoch": 3.8287560581583198,
"grad_norm": 0.4297367334365845,
"learning_rate": 7.267571920100816e-05,
"loss": 0.0505,
"step": 2370
},
{
"epoch": 3.8449111470113086,
"grad_norm": 0.45141902565956116,
"learning_rate": 7.24373064480552e-05,
"loss": 0.0442,
"step": 2380
},
{
"epoch": 3.861066235864297,
"grad_norm": 0.4785975217819214,
"learning_rate": 7.219825320152411e-05,
"loss": 0.0538,
"step": 2390
},
{
"epoch": 3.877221324717286,
"grad_norm": 0.3574664890766144,
"learning_rate": 7.195856628540995e-05,
"loss": 0.0499,
"step": 2400
},
{
"epoch": 3.893376413570275,
"grad_norm": 0.40025898814201355,
"learning_rate": 7.171825254179654e-05,
"loss": 0.0429,
"step": 2410
},
{
"epoch": 3.9095315024232633,
"grad_norm": 0.3183038830757141,
"learning_rate": 7.1477318830661e-05,
"loss": 0.0466,
"step": 2420
},
{
"epoch": 3.9256865912762517,
"grad_norm": 0.4639292061328888,
"learning_rate": 7.123577202967805e-05,
"loss": 0.0446,
"step": 2430
},
{
"epoch": 3.9418416801292406,
"grad_norm": 0.5322105884552002,
"learning_rate": 7.099361903402359e-05,
"loss": 0.0495,
"step": 2440
},
{
"epoch": 3.9579967689822295,
"grad_norm": 0.3138383626937866,
"learning_rate": 7.075086675617788e-05,
"loss": 0.0444,
"step": 2450
},
{
"epoch": 3.974151857835218,
"grad_norm": 0.544747531414032,
"learning_rate": 7.050752212572831e-05,
"loss": 0.0541,
"step": 2460
},
{
"epoch": 3.990306946688207,
"grad_norm": 0.4654453694820404,
"learning_rate": 7.026359208917148e-05,
"loss": 0.0504,
"step": 2470
},
{
"epoch": 4.006462035541196,
"grad_norm": 0.31848329305648804,
"learning_rate": 7.001908360971494e-05,
"loss": 0.0451,
"step": 2480
},
{
"epoch": 4.022617124394184,
"grad_norm": 0.43173283338546753,
"learning_rate": 6.977400366707847e-05,
"loss": 0.0467,
"step": 2490
},
{
"epoch": 4.038772213247173,
"grad_norm": 0.5474691390991211,
"learning_rate": 6.952835925729472e-05,
"loss": 0.0479,
"step": 2500
},
{
"epoch": 4.054927302100162,
"grad_norm": 0.4897683560848236,
"learning_rate": 6.928215739250963e-05,
"loss": 0.0505,
"step": 2510
},
{
"epoch": 4.07108239095315,
"grad_norm": 0.31264185905456543,
"learning_rate": 6.903540510078219e-05,
"loss": 0.0457,
"step": 2520
},
{
"epoch": 4.087237479806139,
"grad_norm": 0.4703519642353058,
"learning_rate": 6.878810942588383e-05,
"loss": 0.0451,
"step": 2530
},
{
"epoch": 4.103392568659127,
"grad_norm": 0.3018874228000641,
"learning_rate": 6.85402774270974e-05,
"loss": 0.0449,
"step": 2540
},
{
"epoch": 4.119547657512117,
"grad_norm": 0.3613886535167694,
"learning_rate": 6.829191617901551e-05,
"loss": 0.0481,
"step": 2550
},
{
"epoch": 4.135702746365105,
"grad_norm": 0.34348440170288086,
"learning_rate": 6.804303277133877e-05,
"loss": 0.0396,
"step": 2560
},
{
"epoch": 4.1518578352180935,
"grad_norm": 0.44307631254196167,
"learning_rate": 6.779363430867326e-05,
"loss": 0.0459,
"step": 2570
},
{
"epoch": 4.168012924071083,
"grad_norm": 0.5705850124359131,
"learning_rate": 6.754372791032783e-05,
"loss": 0.0468,
"step": 2580
},
{
"epoch": 4.184168012924071,
"grad_norm": 0.3443628251552582,
"learning_rate": 6.729332071011077e-05,
"loss": 0.0452,
"step": 2590
},
{
"epoch": 4.20032310177706,
"grad_norm": 0.4537239372730255,
"learning_rate": 6.704241985612625e-05,
"loss": 0.0446,
"step": 2600
},
{
"epoch": 4.216478190630048,
"grad_norm": 0.3705506920814514,
"learning_rate": 6.679103251057024e-05,
"loss": 0.0384,
"step": 2610
},
{
"epoch": 4.2326332794830375,
"grad_norm": 0.5850950479507446,
"learning_rate": 6.653916584952607e-05,
"loss": 0.0483,
"step": 2620
},
{
"epoch": 4.248788368336026,
"grad_norm": 0.7132898569107056,
"learning_rate": 6.628682706275953e-05,
"loss": 0.0432,
"step": 2630
},
{
"epoch": 4.264943457189014,
"grad_norm": 0.3713912069797516,
"learning_rate": 6.603402335351371e-05,
"loss": 0.0382,
"step": 2640
},
{
"epoch": 4.281098546042003,
"grad_norm": 0.6300288438796997,
"learning_rate": 6.578076193830335e-05,
"loss": 0.0444,
"step": 2650
},
{
"epoch": 4.297253634894992,
"grad_norm": 0.5276614427566528,
"learning_rate": 6.55270500467088e-05,
"loss": 0.0554,
"step": 2660
},
{
"epoch": 4.313408723747981,
"grad_norm": 0.38638073205947876,
"learning_rate": 6.527289492116968e-05,
"loss": 0.054,
"step": 2670
},
{
"epoch": 4.329563812600969,
"grad_norm": 0.7961811423301697,
"learning_rate": 6.501830381677813e-05,
"loss": 0.0529,
"step": 2680
},
{
"epoch": 4.345718901453958,
"grad_norm": 0.3550907373428345,
"learning_rate": 6.476328400107171e-05,
"loss": 0.0488,
"step": 2690
},
{
"epoch": 4.361873990306947,
"grad_norm": 0.5453242659568787,
"learning_rate": 6.450784275382595e-05,
"loss": 0.0503,
"step": 2700
},
{
"epoch": 4.378029079159935,
"grad_norm": 0.4048435688018799,
"learning_rate": 6.425198736684655e-05,
"loss": 0.0474,
"step": 2710
},
{
"epoch": 4.394184168012924,
"grad_norm": 0.47286099195480347,
"learning_rate": 6.399572514376113e-05,
"loss": 0.0406,
"step": 2720
},
{
"epoch": 4.410339256865913,
"grad_norm": 0.28871116042137146,
"learning_rate": 6.373906339981092e-05,
"loss": 0.0465,
"step": 2730
},
{
"epoch": 4.426494345718901,
"grad_norm": 0.686854362487793,
"learning_rate": 6.348200946164178e-05,
"loss": 0.0477,
"step": 2740
},
{
"epoch": 4.44264943457189,
"grad_norm": 0.7823249101638794,
"learning_rate": 6.322457066709511e-05,
"loss": 0.0407,
"step": 2750
},
{
"epoch": 4.458804523424879,
"grad_norm": 0.4921092987060547,
"learning_rate": 6.296675436499844e-05,
"loss": 0.0408,
"step": 2760
},
{
"epoch": 4.474959612277868,
"grad_norm": 0.5457318425178528,
"learning_rate": 6.270856791495556e-05,
"loss": 0.0421,
"step": 2770
},
{
"epoch": 4.491114701130856,
"grad_norm": 0.7720049619674683,
"learning_rate": 6.245001868713649e-05,
"loss": 0.0495,
"step": 2780
},
{
"epoch": 4.5072697899838445,
"grad_norm": 0.4767976999282837,
"learning_rate": 6.219111406206707e-05,
"loss": 0.0446,
"step": 2790
},
{
"epoch": 4.523424878836834,
"grad_norm": 0.46596401929855347,
"learning_rate": 6.193186143041828e-05,
"loss": 0.044,
"step": 2800
},
{
"epoch": 4.539579967689822,
"grad_norm": 0.4272357225418091,
"learning_rate": 6.167226819279528e-05,
"loss": 0.043,
"step": 2810
},
{
"epoch": 4.555735056542811,
"grad_norm": 0.39680230617523193,
"learning_rate": 6.141234175952612e-05,
"loss": 0.0376,
"step": 2820
},
{
"epoch": 4.5718901453958,
"grad_norm": 0.42455387115478516,
"learning_rate": 6.115208955045025e-05,
"loss": 0.0415,
"step": 2830
},
{
"epoch": 4.5880452342487885,
"grad_norm": 0.4186107814311981,
"learning_rate": 6.089151899470668e-05,
"loss": 0.0394,
"step": 2840
},
{
"epoch": 4.604200323101777,
"grad_norm": 0.4375015199184418,
"learning_rate": 6.0630637530521905e-05,
"loss": 0.0392,
"step": 2850
},
{
"epoch": 4.620355411954765,
"grad_norm": 0.4540638327598572,
"learning_rate": 6.036945260499762e-05,
"loss": 0.0498,
"step": 2860
},
{
"epoch": 4.636510500807755,
"grad_norm": 0.33841922879219055,
"learning_rate": 6.010797167389808e-05,
"loss": 0.0403,
"step": 2870
},
{
"epoch": 4.652665589660743,
"grad_norm": 0.4046776592731476,
"learning_rate": 5.9846202201437285e-05,
"loss": 0.0394,
"step": 2880
},
{
"epoch": 4.668820678513732,
"grad_norm": 0.5421432852745056,
"learning_rate": 5.9584151660065946e-05,
"loss": 0.0433,
"step": 2890
},
{
"epoch": 4.684975767366721,
"grad_norm": 0.38528770208358765,
"learning_rate": 5.93218275302581e-05,
"loss": 0.0421,
"step": 2900
},
{
"epoch": 4.701130856219709,
"grad_norm": 0.4037356376647949,
"learning_rate": 5.9059237300297656e-05,
"loss": 0.0467,
"step": 2910
},
{
"epoch": 4.717285945072698,
"grad_norm": 0.3471173644065857,
"learning_rate": 5.879638846606459e-05,
"loss": 0.0395,
"step": 2920
},
{
"epoch": 4.733441033925686,
"grad_norm": 0.37581634521484375,
"learning_rate": 5.853328853082097e-05,
"loss": 0.0454,
"step": 2930
},
{
"epoch": 4.749596122778676,
"grad_norm": 0.3175153136253357,
"learning_rate": 5.826994500499675e-05,
"loss": 0.0438,
"step": 2940
},
{
"epoch": 4.765751211631664,
"grad_norm": 0.6848868131637573,
"learning_rate": 5.8006365405975436e-05,
"loss": 0.0408,
"step": 2950
},
{
"epoch": 4.7819063004846525,
"grad_norm": 0.5808501839637756,
"learning_rate": 5.774255725787946e-05,
"loss": 0.0469,
"step": 2960
},
{
"epoch": 4.798061389337642,
"grad_norm": 0.4114396870136261,
"learning_rate": 5.747852809135539e-05,
"loss": 0.0475,
"step": 2970
},
{
"epoch": 4.81421647819063,
"grad_norm": 0.4883790910243988,
"learning_rate": 5.721428544335893e-05,
"loss": 0.0427,
"step": 2980
},
{
"epoch": 4.830371567043619,
"grad_norm": 0.476870059967041,
"learning_rate": 5.694983685693988e-05,
"loss": 0.0375,
"step": 2990
},
{
"epoch": 4.846526655896607,
"grad_norm": 0.4612770974636078,
"learning_rate": 5.668518988102668e-05,
"loss": 0.0416,
"step": 3000
},
{
"epoch": 4.8626817447495965,
"grad_norm": 0.6491737961769104,
"learning_rate": 5.6420352070211016e-05,
"loss": 0.0372,
"step": 3010
},
{
"epoch": 4.878836833602585,
"grad_norm": 0.433662474155426,
"learning_rate": 5.615533098453215e-05,
"loss": 0.0467,
"step": 3020
},
{
"epoch": 4.894991922455573,
"grad_norm": 0.2737475037574768,
"learning_rate": 5.589013418926104e-05,
"loss": 0.0413,
"step": 3030
},
{
"epoch": 4.911147011308563,
"grad_norm": 0.388280987739563,
"learning_rate": 5.562476925468445e-05,
"loss": 0.0338,
"step": 3040
},
{
"epoch": 4.927302100161551,
"grad_norm": 0.4380597174167633,
"learning_rate": 5.535924375588887e-05,
"loss": 0.0415,
"step": 3050
},
{
"epoch": 4.94345718901454,
"grad_norm": 0.5273949503898621,
"learning_rate": 5.509356527254421e-05,
"loss": 0.0393,
"step": 3060
},
{
"epoch": 4.959612277867528,
"grad_norm": 0.9131794571876526,
"learning_rate": 5.482774138868749e-05,
"loss": 0.0459,
"step": 3070
},
{
"epoch": 4.975767366720517,
"grad_norm": 0.4145738482475281,
"learning_rate": 5.456177969250632e-05,
"loss": 0.038,
"step": 3080
},
{
"epoch": 4.991922455573506,
"grad_norm": 0.4649810791015625,
"learning_rate": 5.4295687776122236e-05,
"loss": 0.0451,
"step": 3090
},
{
"epoch": 5.008077544426494,
"grad_norm": 0.4478986859321594,
"learning_rate": 5.4029473235374106e-05,
"loss": 0.0439,
"step": 3100
},
{
"epoch": 5.024232633279483,
"grad_norm": 0.34594130516052246,
"learning_rate": 5.376314366960118e-05,
"loss": 0.0451,
"step": 3110
},
{
"epoch": 5.040387722132472,
"grad_norm": 0.5222188830375671,
"learning_rate": 5.3496706681426204e-05,
"loss": 0.0413,
"step": 3120
},
{
"epoch": 5.05654281098546,
"grad_norm": 0.5172345638275146,
"learning_rate": 5.323016987653842e-05,
"loss": 0.0452,
"step": 3130
},
{
"epoch": 5.072697899838449,
"grad_norm": 0.3387891352176666,
"learning_rate": 5.29635408634764e-05,
"loss": 0.042,
"step": 3140
},
{
"epoch": 5.088852988691438,
"grad_norm": 0.45280104875564575,
"learning_rate": 5.26968272534109e-05,
"loss": 0.039,
"step": 3150
},
{
"epoch": 5.105008077544427,
"grad_norm": 0.4317404329776764,
"learning_rate": 5.2430036659927573e-05,
"loss": 0.0377,
"step": 3160
},
{
"epoch": 5.121163166397415,
"grad_norm": 0.6537325978279114,
"learning_rate": 5.2163176698809645e-05,
"loss": 0.044,
"step": 3170
},
{
"epoch": 5.1373182552504035,
"grad_norm": 0.32357853651046753,
"learning_rate": 5.189625498782047e-05,
"loss": 0.0418,
"step": 3180
},
{
"epoch": 5.153473344103393,
"grad_norm": 0.5868157148361206,
"learning_rate": 5.1629279146486155e-05,
"loss": 0.0452,
"step": 3190
},
{
"epoch": 5.169628432956381,
"grad_norm": 0.46574723720550537,
"learning_rate": 5.136225679587797e-05,
"loss": 0.0432,
"step": 3200
},
{
"epoch": 5.18578352180937,
"grad_norm": 0.2721109390258789,
"learning_rate": 5.109519555839486e-05,
"loss": 0.0424,
"step": 3210
},
{
"epoch": 5.201938610662358,
"grad_norm": 0.3568851053714752,
"learning_rate": 5.082810305754583e-05,
"loss": 0.0391,
"step": 3220
},
{
"epoch": 5.2180936995153475,
"grad_norm": 0.484744131565094,
"learning_rate": 5.05609869177323e-05,
"loss": 0.0371,
"step": 3230
},
{
"epoch": 5.234248788368336,
"grad_norm": 0.29547053575515747,
"learning_rate": 5.029385476403051e-05,
"loss": 0.0311,
"step": 3240
},
{
"epoch": 5.250403877221324,
"grad_norm": 0.3213876783847809,
"learning_rate": 5.002671422197384e-05,
"loss": 0.0334,
"step": 3250
},
{
"epoch": 5.266558966074314,
"grad_norm": 0.3671923577785492,
"learning_rate": 4.9759572917335104e-05,
"loss": 0.0451,
"step": 3260
},
{
"epoch": 5.282714054927302,
"grad_norm": 0.34725895524024963,
"learning_rate": 4.949243847590887e-05,
"loss": 0.0375,
"step": 3270
},
{
"epoch": 5.298869143780291,
"grad_norm": 0.4185596704483032,
"learning_rate": 4.922531852329384e-05,
"loss": 0.0379,
"step": 3280
},
{
"epoch": 5.315024232633279,
"grad_norm": 0.5074782371520996,
"learning_rate": 4.895822068467505e-05,
"loss": 0.0402,
"step": 3290
},
{
"epoch": 5.331179321486268,
"grad_norm": 1.0807178020477295,
"learning_rate": 4.869115258460635e-05,
"loss": 0.0332,
"step": 3300
},
{
"epoch": 5.347334410339257,
"grad_norm": 0.40008237957954407,
"learning_rate": 4.8424121846792614e-05,
"loss": 0.0422,
"step": 3310
},
{
"epoch": 5.363489499192245,
"grad_norm": 0.3162868618965149,
"learning_rate": 4.8157136093872215e-05,
"loss": 0.0332,
"step": 3320
},
{
"epoch": 5.379644588045235,
"grad_norm": 0.5000666379928589,
"learning_rate": 4.789020294719933e-05,
"loss": 0.0359,
"step": 3330
},
{
"epoch": 5.395799676898223,
"grad_norm": 0.4171485900878906,
"learning_rate": 4.762333002662655e-05,
"loss": 0.0383,
"step": 3340
},
{
"epoch": 5.4119547657512115,
"grad_norm": 0.6137621402740479,
"learning_rate": 4.735652495028714e-05,
"loss": 0.0393,
"step": 3350
},
{
"epoch": 5.4281098546042,
"grad_norm": 0.6010169386863708,
"learning_rate": 4.708979533437778e-05,
"loss": 0.0401,
"step": 3360
},
{
"epoch": 5.444264943457189,
"grad_norm": 0.32663294672966003,
"learning_rate": 4.6823148792941e-05,
"loss": 0.0422,
"step": 3370
},
{
"epoch": 5.460420032310178,
"grad_norm": 0.3632521629333496,
"learning_rate": 4.655659293764793e-05,
"loss": 0.0426,
"step": 3380
},
{
"epoch": 5.476575121163166,
"grad_norm": 0.30977901816368103,
"learning_rate": 4.629013537758093e-05,
"loss": 0.0417,
"step": 3390
},
{
"epoch": 5.4927302100161555,
"grad_norm": 0.42319706082344055,
"learning_rate": 4.6023783719016526e-05,
"loss": 0.0431,
"step": 3400
},
{
"epoch": 5.508885298869144,
"grad_norm": 0.3542233109474182,
"learning_rate": 4.57575455652081e-05,
"loss": 0.0365,
"step": 3410
},
{
"epoch": 5.525040387722132,
"grad_norm": 0.4000030755996704,
"learning_rate": 4.5491428516168975e-05,
"loss": 0.0467,
"step": 3420
},
{
"epoch": 5.541195476575121,
"grad_norm": 0.3602658212184906,
"learning_rate": 4.52254401684554e-05,
"loss": 0.0339,
"step": 3430
},
{
"epoch": 5.55735056542811,
"grad_norm": 0.41686055064201355,
"learning_rate": 4.495958811494978e-05,
"loss": 0.0324,
"step": 3440
},
{
"epoch": 5.573505654281099,
"grad_norm": 0.3794184625148773,
"learning_rate": 4.469387994464381e-05,
"loss": 0.0393,
"step": 3450
},
{
"epoch": 5.589660743134087,
"grad_norm": 0.3583800196647644,
"learning_rate": 4.442832324242197e-05,
"loss": 0.0427,
"step": 3460
},
{
"epoch": 5.605815831987076,
"grad_norm": 0.27712520956993103,
"learning_rate": 4.416292558884489e-05,
"loss": 0.0355,
"step": 3470
},
{
"epoch": 5.621970920840065,
"grad_norm": 0.41386884450912476,
"learning_rate": 4.389769455993303e-05,
"loss": 0.0387,
"step": 3480
},
{
"epoch": 5.638126009693053,
"grad_norm": 0.38608935475349426,
"learning_rate": 4.3632637726950415e-05,
"loss": 0.0353,
"step": 3490
},
{
"epoch": 5.654281098546042,
"grad_norm": 0.5377467274665833,
"learning_rate": 4.336776265618844e-05,
"loss": 0.0388,
"step": 3500
},
{
"epoch": 5.670436187399031,
"grad_norm": 0.5470876693725586,
"learning_rate": 4.3103076908749996e-05,
"loss": 0.0407,
"step": 3510
},
{
"epoch": 5.686591276252019,
"grad_norm": 0.3438394367694855,
"learning_rate": 4.283858804033351e-05,
"loss": 0.0348,
"step": 3520
},
{
"epoch": 5.702746365105008,
"grad_norm": 0.38908761739730835,
"learning_rate": 4.257430360101734e-05,
"loss": 0.0342,
"step": 3530
},
{
"epoch": 5.718901453957997,
"grad_norm": 0.4778120219707489,
"learning_rate": 4.2310231135044196e-05,
"loss": 0.0421,
"step": 3540
},
{
"epoch": 5.735056542810986,
"grad_norm": 0.4164102375507355,
"learning_rate": 4.2046378180605894e-05,
"loss": 0.0395,
"step": 3550
},
{
"epoch": 5.751211631663974,
"grad_norm": 0.31713828444480896,
"learning_rate": 4.1782752269627986e-05,
"loss": 0.0378,
"step": 3560
},
{
"epoch": 5.7673667205169625,
"grad_norm": 0.36085453629493713,
"learning_rate": 4.1519360927554953e-05,
"loss": 0.0419,
"step": 3570
},
{
"epoch": 5.783521809369952,
"grad_norm": 0.3456893861293793,
"learning_rate": 4.125621167313519e-05,
"loss": 0.0408,
"step": 3580
},
{
"epoch": 5.79967689822294,
"grad_norm": 0.4086418151855469,
"learning_rate": 4.09933120182066e-05,
"loss": 0.0361,
"step": 3590
},
{
"epoch": 5.815831987075929,
"grad_norm": 0.3052937984466553,
"learning_rate": 4.073066946748192e-05,
"loss": 0.0372,
"step": 3600
},
{
"epoch": 5.831987075928918,
"grad_norm": 0.3931577801704407,
"learning_rate": 4.046829151833469e-05,
"loss": 0.0331,
"step": 3610
},
{
"epoch": 5.8481421647819065,
"grad_norm": 0.46110355854034424,
"learning_rate": 4.020618566058513e-05,
"loss": 0.0354,
"step": 3620
},
{
"epoch": 5.864297253634895,
"grad_norm": 0.39353641867637634,
"learning_rate": 3.994435937628636e-05,
"loss": 0.035,
"step": 3630
},
{
"epoch": 5.880452342487883,
"grad_norm": 0.4410620927810669,
"learning_rate": 3.968282013951079e-05,
"loss": 0.0374,
"step": 3640
},
{
"epoch": 5.896607431340873,
"grad_norm": 0.36808839440345764,
"learning_rate": 3.9421575416136866e-05,
"loss": 0.0381,
"step": 3650
},
{
"epoch": 5.912762520193861,
"grad_norm": 0.38404178619384766,
"learning_rate": 3.9160632663635786e-05,
"loss": 0.0348,
"step": 3660
},
{
"epoch": 5.92891760904685,
"grad_norm": 0.2804437577724457,
"learning_rate": 3.88999993308588e-05,
"loss": 0.0346,
"step": 3670
},
{
"epoch": 5.945072697899839,
"grad_norm": 0.4300249218940735,
"learning_rate": 3.86396828578244e-05,
"loss": 0.0378,
"step": 3680
},
{
"epoch": 5.961227786752827,
"grad_norm": 0.3507043421268463,
"learning_rate": 3.837969067550611e-05,
"loss": 0.0353,
"step": 3690
},
{
"epoch": 5.977382875605816,
"grad_norm": 0.3638635277748108,
"learning_rate": 3.812003020562022e-05,
"loss": 0.0314,
"step": 3700
},
{
"epoch": 5.993537964458804,
"grad_norm": 0.3249291181564331,
"learning_rate": 3.7860708860414005e-05,
"loss": 0.0421,
"step": 3710
},
{
"epoch": 6.009693053311794,
"grad_norm": 0.4243714511394501,
"learning_rate": 3.760173404245409e-05,
"loss": 0.034,
"step": 3720
},
{
"epoch": 6.025848142164782,
"grad_norm": 0.2857236862182617,
"learning_rate": 3.734311314441521e-05,
"loss": 0.0373,
"step": 3730
},
{
"epoch": 6.0420032310177705,
"grad_norm": 0.3825433850288391,
"learning_rate": 3.708485354886906e-05,
"loss": 0.0298,
"step": 3740
},
{
"epoch": 6.058158319870759,
"grad_norm": 0.347135990858078,
"learning_rate": 3.6826962628073705e-05,
"loss": 0.0348,
"step": 3750
},
{
"epoch": 6.074313408723748,
"grad_norm": 0.4767064154148102,
"learning_rate": 3.6569447743762986e-05,
"loss": 0.0341,
"step": 3760
},
{
"epoch": 6.090468497576737,
"grad_norm": 0.2834322154521942,
"learning_rate": 3.631231624693645e-05,
"loss": 0.0391,
"step": 3770
},
{
"epoch": 6.106623586429725,
"grad_norm": 0.635104775428772,
"learning_rate": 3.605557547764951e-05,
"loss": 0.0355,
"step": 3780
},
{
"epoch": 6.1227786752827145,
"grad_norm": 0.35917991399765015,
"learning_rate": 3.579923276480387e-05,
"loss": 0.0303,
"step": 3790
},
{
"epoch": 6.138933764135703,
"grad_norm": 0.40180811285972595,
"learning_rate": 3.5543295425938414e-05,
"loss": 0.036,
"step": 3800
},
{
"epoch": 6.155088852988691,
"grad_norm": 0.24985694885253906,
"learning_rate": 3.5287770767020164e-05,
"loss": 0.0291,
"step": 3810
},
{
"epoch": 6.17124394184168,
"grad_norm": 0.36490491032600403,
"learning_rate": 3.5032666082235896e-05,
"loss": 0.0397,
"step": 3820
},
{
"epoch": 6.187399030694669,
"grad_norm": 0.2522122263908386,
"learning_rate": 3.477798865378375e-05,
"loss": 0.0335,
"step": 3830
},
{
"epoch": 6.203554119547658,
"grad_norm": 0.2659394145011902,
"learning_rate": 3.4523745751665534e-05,
"loss": 0.0303,
"step": 3840
},
{
"epoch": 6.219709208400646,
"grad_norm": 0.2996593713760376,
"learning_rate": 3.426994463347902e-05,
"loss": 0.0327,
"step": 3850
},
{
"epoch": 6.2358642972536344,
"grad_norm": 0.429979145526886,
"learning_rate": 3.401659254421094e-05,
"loss": 0.0367,
"step": 3860
},
{
"epoch": 6.252019386106624,
"grad_norm": 0.3394151031970978,
"learning_rate": 3.3763696716029957e-05,
"loss": 0.0316,
"step": 3870
},
{
"epoch": 6.268174474959612,
"grad_norm": 0.5161323547363281,
"learning_rate": 3.351126436808048e-05,
"loss": 0.0365,
"step": 3880
},
{
"epoch": 6.284329563812601,
"grad_norm": 0.3514617681503296,
"learning_rate": 3.325930270627632e-05,
"loss": 0.0316,
"step": 3890
},
{
"epoch": 6.30048465266559,
"grad_norm": 0.4464913606643677,
"learning_rate": 3.300781892309523e-05,
"loss": 0.0351,
"step": 3900
},
{
"epoch": 6.316639741518578,
"grad_norm": 0.4298667311668396,
"learning_rate": 3.2756820197373394e-05,
"loss": 0.0347,
"step": 3910
},
{
"epoch": 6.332794830371567,
"grad_norm": 0.44100216031074524,
"learning_rate": 3.250631369410064e-05,
"loss": 0.0328,
"step": 3920
},
{
"epoch": 6.348949919224555,
"grad_norm": 0.3341505527496338,
"learning_rate": 3.2256306564215796e-05,
"loss": 0.0345,
"step": 3930
},
{
"epoch": 6.365105008077545,
"grad_norm": 0.41437703371047974,
"learning_rate": 3.20068059444027e-05,
"loss": 0.0334,
"step": 3940
},
{
"epoch": 6.381260096930533,
"grad_norm": 0.5106116533279419,
"learning_rate": 3.1757818956886295e-05,
"loss": 0.0357,
"step": 3950
},
{
"epoch": 6.3974151857835215,
"grad_norm": 0.2565278708934784,
"learning_rate": 3.150935270922951e-05,
"loss": 0.0286,
"step": 3960
},
{
"epoch": 6.413570274636511,
"grad_norm": 0.3734126389026642,
"learning_rate": 3.126141429413019e-05,
"loss": 0.0297,
"step": 3970
},
{
"epoch": 6.429725363489499,
"grad_norm": 0.34675681591033936,
"learning_rate": 3.101401078921878e-05,
"loss": 0.028,
"step": 3980
},
{
"epoch": 6.445880452342488,
"grad_norm": 0.42493683099746704,
"learning_rate": 3.076714925685617e-05,
"loss": 0.03,
"step": 3990
},
{
"epoch": 6.462035541195476,
"grad_norm": 0.30656370520591736,
"learning_rate": 3.052083674393221e-05,
"loss": 0.0312,
"step": 4000
},
{
"epoch": 6.4781906300484655,
"grad_norm": 0.36631324887275696,
"learning_rate": 3.0275080281664414e-05,
"loss": 0.0279,
"step": 4010
},
{
"epoch": 6.494345718901454,
"grad_norm": 0.5831628441810608,
"learning_rate": 3.0029886885397367e-05,
"loss": 0.0354,
"step": 4020
},
{
"epoch": 6.510500807754442,
"grad_norm": 0.3962215781211853,
"learning_rate": 2.9785263554402366e-05,
"loss": 0.0392,
"step": 4030
},
{
"epoch": 6.526655896607432,
"grad_norm": 0.45189252495765686,
"learning_rate": 2.9541217271677745e-05,
"loss": 0.0356,
"step": 4040
},
{
"epoch": 6.54281098546042,
"grad_norm": 0.4892602860927582,
"learning_rate": 2.9297755003749394e-05,
"loss": 0.0297,
"step": 4050
},
{
"epoch": 6.558966074313409,
"grad_norm": 0.32902640104293823,
"learning_rate": 2.9054883700471974e-05,
"loss": 0.0315,
"step": 4060
},
{
"epoch": 6.575121163166397,
"grad_norm": 0.3130761682987213,
"learning_rate": 2.8812610294830566e-05,
"loss": 0.0336,
"step": 4070
},
{
"epoch": 6.591276252019386,
"grad_norm": 0.5444455146789551,
"learning_rate": 2.8570941702742663e-05,
"loss": 0.0293,
"step": 4080
},
{
"epoch": 6.607431340872375,
"grad_norm": 0.5223131775856018,
"learning_rate": 2.832988482286081e-05,
"loss": 0.032,
"step": 4090
},
{
"epoch": 6.623586429725363,
"grad_norm": 0.5296066403388977,
"learning_rate": 2.808944653637564e-05,
"loss": 0.0336,
"step": 4100
},
{
"epoch": 6.639741518578353,
"grad_norm": 0.4030674397945404,
"learning_rate": 2.7849633706819533e-05,
"loss": 0.0355,
"step": 4110
},
{
"epoch": 6.655896607431341,
"grad_norm": 0.42938342690467834,
"learning_rate": 2.7610453179870554e-05,
"loss": 0.0291,
"step": 4120
},
{
"epoch": 6.6720516962843295,
"grad_norm": 0.4580219089984894,
"learning_rate": 2.7371911783157178e-05,
"loss": 0.0318,
"step": 4130
},
{
"epoch": 6.688206785137318,
"grad_norm": 0.30596330761909485,
"learning_rate": 2.7134016326063234e-05,
"loss": 0.034,
"step": 4140
},
{
"epoch": 6.704361873990307,
"grad_norm": 0.35359278321266174,
"learning_rate": 2.6896773599533694e-05,
"loss": 0.0299,
"step": 4150
},
{
"epoch": 6.720516962843296,
"grad_norm": 0.29407617449760437,
"learning_rate": 2.6660190375880657e-05,
"loss": 0.0266,
"step": 4160
},
{
"epoch": 6.736672051696284,
"grad_norm": 0.357388973236084,
"learning_rate": 2.6424273408590188e-05,
"loss": 0.0352,
"step": 4170
},
{
"epoch": 6.7528271405492735,
"grad_norm": 0.8390901684761047,
"learning_rate": 2.6189029432129385e-05,
"loss": 0.0377,
"step": 4180
},
{
"epoch": 6.768982229402262,
"grad_norm": 0.28982290625572205,
"learning_rate": 2.5954465161754227e-05,
"loss": 0.0315,
"step": 4190
},
{
"epoch": 6.78513731825525,
"grad_norm": 0.5228689908981323,
"learning_rate": 2.5720587293317826e-05,
"loss": 0.0283,
"step": 4200
},
{
"epoch": 6.801292407108239,
"grad_norm": 0.5332914590835571,
"learning_rate": 2.5487402503079395e-05,
"loss": 0.0314,
"step": 4210
},
{
"epoch": 6.817447495961228,
"grad_norm": 0.5198635458946228,
"learning_rate": 2.5254917447513504e-05,
"loss": 0.0298,
"step": 4220
},
{
"epoch": 6.833602584814217,
"grad_norm": 0.37016230821609497,
"learning_rate": 2.5023138763120217e-05,
"loss": 0.0281,
"step": 4230
},
{
"epoch": 6.849757673667205,
"grad_norm": 0.32923170924186707,
"learning_rate": 2.479207306623554e-05,
"loss": 0.0308,
"step": 4240
},
{
"epoch": 6.865912762520194,
"grad_norm": 0.2647690176963806,
"learning_rate": 2.456172695284263e-05,
"loss": 0.0336,
"step": 4250
},
{
"epoch": 6.882067851373183,
"grad_norm": 0.39588427543640137,
"learning_rate": 2.433210699838342e-05,
"loss": 0.0328,
"step": 4260
},
{
"epoch": 6.898222940226171,
"grad_norm": 0.28190135955810547,
"learning_rate": 2.4103219757571033e-05,
"loss": 0.0292,
"step": 4270
},
{
"epoch": 6.91437802907916,
"grad_norm": 0.4510742723941803,
"learning_rate": 2.3875071764202563e-05,
"loss": 0.0293,
"step": 4280
},
{
"epoch": 6.930533117932149,
"grad_norm": 0.35639435052871704,
"learning_rate": 2.36476695309726e-05,
"loss": 0.0274,
"step": 4290
},
{
"epoch": 6.946688206785137,
"grad_norm": 0.38058537244796753,
"learning_rate": 2.342101954928733e-05,
"loss": 0.0332,
"step": 4300
},
{
"epoch": 6.962843295638126,
"grad_norm": 0.5739650726318359,
"learning_rate": 2.3195128289079264e-05,
"loss": 0.0266,
"step": 4310
},
{
"epoch": 6.978998384491114,
"grad_norm": 0.5040541887283325,
"learning_rate": 2.2970002198622444e-05,
"loss": 0.0386,
"step": 4320
},
{
"epoch": 6.995153473344104,
"grad_norm": 0.2796167731285095,
"learning_rate": 2.2745647704348506e-05,
"loss": 0.0304,
"step": 4330
},
{
"epoch": 7.011308562197092,
"grad_norm": 0.8160725235939026,
"learning_rate": 2.2522071210663108e-05,
"loss": 0.0257,
"step": 4340
},
{
"epoch": 7.0274636510500805,
"grad_norm": 0.2881336510181427,
"learning_rate": 2.2299279099763176e-05,
"loss": 0.0291,
"step": 4350
},
{
"epoch": 7.04361873990307,
"grad_norm": 0.43697014451026917,
"learning_rate": 2.2077277731454743e-05,
"loss": 0.0302,
"step": 4360
},
{
"epoch": 7.059773828756058,
"grad_norm": 0.2801852822303772,
"learning_rate": 2.185607344297132e-05,
"loss": 0.0285,
"step": 4370
},
{
"epoch": 7.075928917609047,
"grad_norm": 0.4039601683616638,
"learning_rate": 2.1635672548793067e-05,
"loss": 0.0249,
"step": 4380
},
{
"epoch": 7.092084006462035,
"grad_norm": 0.312288761138916,
"learning_rate": 2.1416081340466477e-05,
"loss": 0.0289,
"step": 4390
},
{
"epoch": 7.1082390953150245,
"grad_norm": 0.3759534955024719,
"learning_rate": 2.119730608642489e-05,
"loss": 0.0343,
"step": 4400
},
{
"epoch": 7.124394184168013,
"grad_norm": 0.5132532119750977,
"learning_rate": 2.0979353031809383e-05,
"loss": 0.0346,
"step": 4410
},
{
"epoch": 7.140549273021001,
"grad_norm": 0.9728456139564514,
"learning_rate": 2.0762228398290697e-05,
"loss": 0.0339,
"step": 4420
},
{
"epoch": 7.156704361873991,
"grad_norm": 0.32944944500923157,
"learning_rate": 2.054593838389143e-05,
"loss": 0.0257,
"step": 4430
},
{
"epoch": 7.172859450726979,
"grad_norm": 0.22434404492378235,
"learning_rate": 2.033048916280928e-05,
"loss": 0.0317,
"step": 4440
},
{
"epoch": 7.189014539579968,
"grad_norm": 0.36417004466056824,
"learning_rate": 2.0115886885240682e-05,
"loss": 0.0264,
"step": 4450
},
{
"epoch": 7.205169628432956,
"grad_norm": 0.29687365889549255,
"learning_rate": 1.990213767720533e-05,
"loss": 0.03,
"step": 4460
},
{
"epoch": 7.221324717285945,
"grad_norm": 0.41539931297302246,
"learning_rate": 1.9689247640371223e-05,
"loss": 0.0294,
"step": 4470
},
{
"epoch": 7.237479806138934,
"grad_norm": 0.44902583956718445,
"learning_rate": 1.9477222851880545e-05,
"loss": 0.0282,
"step": 4480
},
{
"epoch": 7.253634894991922,
"grad_norm": 0.38103097677230835,
"learning_rate": 1.926606936417614e-05,
"loss": 0.0311,
"step": 4490
},
{
"epoch": 7.269789983844911,
"grad_norm": 0.42052754759788513,
"learning_rate": 1.9055793204828842e-05,
"loss": 0.0298,
"step": 4500
},
{
"epoch": 7.2859450726979,
"grad_norm": 0.645574688911438,
"learning_rate": 1.8846400376365253e-05,
"loss": 0.0291,
"step": 4510
},
{
"epoch": 7.3021001615508885,
"grad_norm": 0.4374733865261078,
"learning_rate": 1.8637896856096548e-05,
"loss": 0.0301,
"step": 4520
},
{
"epoch": 7.318255250403877,
"grad_norm": 0.46677830815315247,
"learning_rate": 1.843028859594772e-05,
"loss": 0.0283,
"step": 4530
},
{
"epoch": 7.334410339256866,
"grad_norm": 0.4820699393749237,
"learning_rate": 1.8223581522287807e-05,
"loss": 0.0264,
"step": 4540
},
{
"epoch": 7.350565428109855,
"grad_norm": 0.2922935485839844,
"learning_rate": 1.801778153576058e-05,
"loss": 0.0256,
"step": 4550
},
{
"epoch": 7.366720516962843,
"grad_norm": 0.6086759567260742,
"learning_rate": 1.7812894511116235e-05,
"loss": 0.0247,
"step": 4560
},
{
"epoch": 7.382875605815832,
"grad_norm": 0.2402912974357605,
"learning_rate": 1.7608926297043583e-05,
"loss": 0.0309,
"step": 4570
},
{
"epoch": 7.399030694668821,
"grad_norm": 0.25733429193496704,
"learning_rate": 1.7405882716003154e-05,
"loss": 0.0219,
"step": 4580
},
{
"epoch": 7.415185783521809,
"grad_norm": 0.4837753474712372,
"learning_rate": 1.7203769564060962e-05,
"loss": 0.0262,
"step": 4590
},
{
"epoch": 7.431340872374798,
"grad_norm": 0.31810057163238525,
"learning_rate": 1.700259261072312e-05,
"loss": 0.0234,
"step": 4600
},
{
"epoch": 7.447495961227787,
"grad_norm": 0.5520622134208679,
"learning_rate": 1.6802357598771012e-05,
"loss": 0.0274,
"step": 4610
},
{
"epoch": 7.463651050080776,
"grad_norm": 0.3220314085483551,
"learning_rate": 1.6603070244097523e-05,
"loss": 0.0277,
"step": 4620
},
{
"epoch": 7.479806138933764,
"grad_norm": 0.3330337703227997,
"learning_rate": 1.6404736235543705e-05,
"loss": 0.032,
"step": 4630
},
{
"epoch": 7.4959612277867524,
"grad_norm": 0.8944841027259827,
"learning_rate": 1.6207361234736533e-05,
"loss": 0.0257,
"step": 4640
},
{
"epoch": 7.512116316639742,
"grad_norm": 0.3682458698749542,
"learning_rate": 1.6010950875927182e-05,
"loss": 0.0268,
"step": 4650
},
{
"epoch": 7.52827140549273,
"grad_norm": 0.4134623408317566,
"learning_rate": 1.581551076583023e-05,
"loss": 0.0353,
"step": 4660
},
{
"epoch": 7.544426494345719,
"grad_norm": 0.2340182512998581,
"learning_rate": 1.5621046483463663e-05,
"loss": 0.0252,
"step": 4670
},
{
"epoch": 7.560581583198708,
"grad_norm": 0.6091485619544983,
"learning_rate": 1.5427563579989507e-05,
"loss": 0.0214,
"step": 4680
},
{
"epoch": 7.576736672051696,
"grad_norm": 0.7271833419799805,
"learning_rate": 1.523506757855545e-05,
"loss": 0.0305,
"step": 4690
},
{
"epoch": 7.592891760904685,
"grad_norm": 0.3721354007720947,
"learning_rate": 1.504356397413713e-05,
"loss": 0.032,
"step": 4700
},
{
"epoch": 7.609046849757673,
"grad_norm": 0.3686143755912781,
"learning_rate": 1.485305823338135e-05,
"loss": 0.0258,
"step": 4710
},
{
"epoch": 7.625201938610663,
"grad_norm": 0.3255109488964081,
"learning_rate": 1.4663555794449918e-05,
"loss": 0.0248,
"step": 4720
},
{
"epoch": 7.641357027463651,
"grad_norm": 0.35630714893341064,
"learning_rate": 1.4475062066864514e-05,
"loss": 0.031,
"step": 4730
},
{
"epoch": 7.6575121163166395,
"grad_norm": 0.2801692485809326,
"learning_rate": 1.4287582431352175e-05,
"loss": 0.0246,
"step": 4740
},
{
"epoch": 7.673667205169629,
"grad_norm": 0.3327733874320984,
"learning_rate": 1.41011222396918e-05,
"loss": 0.0251,
"step": 4750
},
{
"epoch": 7.689822294022617,
"grad_norm": 0.6043513417243958,
"learning_rate": 1.3915686814561285e-05,
"loss": 0.0288,
"step": 4760
},
{
"epoch": 7.705977382875606,
"grad_norm": 0.3464643657207489,
"learning_rate": 1.373128144938563e-05,
"loss": 0.0299,
"step": 4770
},
{
"epoch": 7.722132471728594,
"grad_norm": 0.21582302451133728,
"learning_rate": 1.354791140818582e-05,
"loss": 0.0337,
"step": 4780
},
{
"epoch": 7.7382875605815835,
"grad_norm": 0.4652714431285858,
"learning_rate": 1.3365581925428594e-05,
"loss": 0.0241,
"step": 4790
},
{
"epoch": 7.754442649434572,
"grad_norm": 0.2494271844625473,
"learning_rate": 1.3184298205876938e-05,
"loss": 0.0271,
"step": 4800
},
{
"epoch": 7.77059773828756,
"grad_norm": 0.44884902238845825,
"learning_rate": 1.3004065424441636e-05,
"loss": 0.0258,
"step": 4810
},
{
"epoch": 7.78675282714055,
"grad_norm": 0.3176079988479614,
"learning_rate": 1.282488872603339e-05,
"loss": 0.0226,
"step": 4820
},
{
"epoch": 7.802907915993538,
"grad_norm": 0.42613035440444946,
"learning_rate": 1.2646773225416132e-05,
"loss": 0.0283,
"step": 4830
},
{
"epoch": 7.819063004846527,
"grad_norm": 0.7398589849472046,
"learning_rate": 1.2469724007060835e-05,
"loss": 0.0377,
"step": 4840
},
{
"epoch": 7.835218093699515,
"grad_norm": 0.38897576928138733,
"learning_rate": 1.2293746125000538e-05,
"loss": 0.0257,
"step": 4850
},
{
"epoch": 7.851373182552504,
"grad_norm": 0.50649094581604,
"learning_rate": 1.2118844602685958e-05,
"loss": 0.0253,
"step": 4860
},
{
"epoch": 7.867528271405493,
"grad_norm": 0.28981852531433105,
"learning_rate": 1.1945024432842134e-05,
"loss": 0.0285,
"step": 4870
},
{
"epoch": 7.883683360258481,
"grad_norm": 0.406024307012558,
"learning_rate": 1.1772290577325895e-05,
"loss": 0.0306,
"step": 4880
},
{
"epoch": 7.899838449111471,
"grad_norm": 0.2725732922554016,
"learning_rate": 1.1600647966984274e-05,
"loss": 0.0246,
"step": 4890
},
{
"epoch": 7.915993537964459,
"grad_norm": 0.4214000999927521,
"learning_rate": 1.1430101501513634e-05,
"loss": 0.0281,
"step": 4900
},
{
"epoch": 7.9321486268174475,
"grad_norm": 0.2999376952648163,
"learning_rate": 1.1260656049319957e-05,
"loss": 0.024,
"step": 4910
},
{
"epoch": 7.948303715670436,
"grad_norm": 0.31904590129852295,
"learning_rate": 1.1092316447379692e-05,
"loss": 0.0212,
"step": 4920
},
{
"epoch": 7.964458804523425,
"grad_norm": 0.3466980755329132,
"learning_rate": 1.0925087501101872e-05,
"loss": 0.0293,
"step": 4930
},
{
"epoch": 7.980613893376414,
"grad_norm": 0.3411683440208435,
"learning_rate": 1.0758973984190762e-05,
"loss": 0.0219,
"step": 4940
},
{
"epoch": 7.996768982229402,
"grad_norm": 0.3246071934700012,
"learning_rate": 1.0593980638509693e-05,
"loss": 0.0295,
"step": 4950
},
{
"epoch": 8.012924071082391,
"grad_norm": 0.29273203015327454,
"learning_rate": 1.043011217394571e-05,
"loss": 0.0264,
"step": 4960
},
{
"epoch": 8.02907915993538,
"grad_norm": 0.36481159925460815,
"learning_rate": 1.0267373268275049e-05,
"loss": 0.0303,
"step": 4970
},
{
"epoch": 8.045234248788368,
"grad_norm": 0.26860660314559937,
"learning_rate": 1.0105768567029655e-05,
"loss": 0.0314,
"step": 4980
},
{
"epoch": 8.061389337641357,
"grad_norm": 0.3127424716949463,
"learning_rate": 9.945302683364566e-06,
"loss": 0.0224,
"step": 4990
},
{
"epoch": 8.077544426494345,
"grad_norm": 0.3091331422328949,
"learning_rate": 9.785980197926242e-06,
"loss": 0.0267,
"step": 5000
},
{
"epoch": 8.093699515347334,
"grad_norm": 0.3343771696090698,
"learning_rate": 9.627805658721756e-06,
"loss": 0.0311,
"step": 5010
},
{
"epoch": 8.109854604200324,
"grad_norm": 0.37236693501472473,
"learning_rate": 9.470783580989029e-06,
"loss": 0.0261,
"step": 5020
},
{
"epoch": 8.126009693053312,
"grad_norm": 0.28066885471343994,
"learning_rate": 9.314918447067878e-06,
"loss": 0.0256,
"step": 5030
},
{
"epoch": 8.1421647819063,
"grad_norm": 0.3097597360610962,
"learning_rate": 9.16021470627213e-06,
"loss": 0.0246,
"step": 5040
},
{
"epoch": 8.15831987075929,
"grad_norm": 0.2532176971435547,
"learning_rate": 9.006676774762535e-06,
"loss": 0.0238,
"step": 5050
},
{
"epoch": 8.174474959612278,
"grad_norm": 0.6101159453392029,
"learning_rate": 8.854309035420772e-06,
"loss": 0.0248,
"step": 5060
},
{
"epoch": 8.190630048465266,
"grad_norm": 0.36472347378730774,
"learning_rate": 8.703115837724274e-06,
"loss": 0.0215,
"step": 5070
},
{
"epoch": 8.206785137318255,
"grad_norm": 0.2975756525993347,
"learning_rate": 8.553101497622162e-06,
"loss": 0.0258,
"step": 5080
},
{
"epoch": 8.222940226171245,
"grad_norm": 0.25580790638923645,
"learning_rate": 8.404270297411904e-06,
"loss": 0.025,
"step": 5090
},
{
"epoch": 8.239095315024233,
"grad_norm": 0.3066563606262207,
"learning_rate": 8.256626485617219e-06,
"loss": 0.0303,
"step": 5100
},
{
"epoch": 8.255250403877222,
"grad_norm": 0.44430306553840637,
"learning_rate": 8.110174276866683e-06,
"loss": 0.0224,
"step": 5110
},
{
"epoch": 8.27140549273021,
"grad_norm": 0.3054925799369812,
"learning_rate": 7.964917851773496e-06,
"loss": 0.0278,
"step": 5120
},
{
"epoch": 8.287560581583199,
"grad_norm": 0.25573277473449707,
"learning_rate": 7.820861356816078e-06,
"loss": 0.0211,
"step": 5130
},
{
"epoch": 8.303715670436187,
"grad_norm": 0.3430786728858948,
"learning_rate": 7.678008904219786e-06,
"loss": 0.0237,
"step": 5140
},
{
"epoch": 8.319870759289175,
"grad_norm": 0.4758915603160858,
"learning_rate": 7.536364571839438e-06,
"loss": 0.0215,
"step": 5150
},
{
"epoch": 8.336025848142166,
"grad_norm": 0.3592261075973511,
"learning_rate": 7.3959324030429654e-06,
"loss": 0.0266,
"step": 5160
},
{
"epoch": 8.352180936995154,
"grad_norm": 0.33260300755500793,
"learning_rate": 7.256716406595948e-06,
"loss": 0.0189,
"step": 5170
},
{
"epoch": 8.368336025848143,
"grad_norm": 0.5978755950927734,
"learning_rate": 7.118720556547259e-06,
"loss": 0.0236,
"step": 5180
},
{
"epoch": 8.384491114701131,
"grad_norm": 0.18789972364902496,
"learning_rate": 6.9819487921155116e-06,
"loss": 0.0234,
"step": 5190
},
{
"epoch": 8.40064620355412,
"grad_norm": 0.31928345561027527,
"learning_rate": 6.846405017576718e-06,
"loss": 0.0281,
"step": 5200
},
{
"epoch": 8.416801292407108,
"grad_norm": 0.34838926792144775,
"learning_rate": 6.712093102152739e-06,
"loss": 0.0276,
"step": 5210
},
{
"epoch": 8.432956381260096,
"grad_norm": 0.37636154890060425,
"learning_rate": 6.579016879900924e-06,
"loss": 0.0251,
"step": 5220
},
{
"epoch": 8.449111470113085,
"grad_norm": 0.26992267370224,
"learning_rate": 6.447180149604603e-06,
"loss": 0.0298,
"step": 5230
},
{
"epoch": 8.465266558966075,
"grad_norm": 0.31432321667671204,
"learning_rate": 6.316586674664654e-06,
"loss": 0.0225,
"step": 5240
},
{
"epoch": 8.481421647819063,
"grad_norm": 0.4834333062171936,
"learning_rate": 6.187240182992126e-06,
"loss": 0.0211,
"step": 5250
},
{
"epoch": 8.497576736672052,
"grad_norm": 0.3510620594024658,
"learning_rate": 6.059144366901736e-06,
"loss": 0.0267,
"step": 5260
},
{
"epoch": 8.51373182552504,
"grad_norm": 0.24435961246490479,
"learning_rate": 5.932302883006546e-06,
"loss": 0.0264,
"step": 5270
},
{
"epoch": 8.529886914378029,
"grad_norm": 0.3356267809867859,
"learning_rate": 5.806719352113521e-06,
"loss": 0.0284,
"step": 5280
},
{
"epoch": 8.546042003231017,
"grad_norm": 0.46301284432411194,
"learning_rate": 5.682397359120245e-06,
"loss": 0.0232,
"step": 5290
},
{
"epoch": 8.562197092084006,
"grad_norm": 0.37262049317359924,
"learning_rate": 5.5593404529124875e-06,
"loss": 0.0226,
"step": 5300
},
{
"epoch": 8.578352180936996,
"grad_norm": 0.5886579155921936,
"learning_rate": 5.437552146263003e-06,
"loss": 0.0276,
"step": 5310
},
{
"epoch": 8.594507269789984,
"grad_norm": 0.3321017324924469,
"learning_rate": 5.3170359157311445e-06,
"loss": 0.0234,
"step": 5320
},
{
"epoch": 8.610662358642973,
"grad_norm": 0.5492444038391113,
"learning_rate": 5.197795201563743e-06,
"loss": 0.0242,
"step": 5330
},
{
"epoch": 8.626817447495961,
"grad_norm": 0.3654116690158844,
"learning_rate": 5.07983340759679e-06,
"loss": 0.0251,
"step": 5340
},
{
"epoch": 8.64297253634895,
"grad_norm": 0.3987561762332916,
"learning_rate": 4.963153901158352e-06,
"loss": 0.0219,
"step": 5350
},
{
"epoch": 8.659127625201938,
"grad_norm": 0.2271428108215332,
"learning_rate": 4.847760012972402e-06,
"loss": 0.0255,
"step": 5360
},
{
"epoch": 8.675282714054926,
"grad_norm": 0.3363126218318939,
"learning_rate": 4.733655037063761e-06,
"loss": 0.026,
"step": 5370
},
{
"epoch": 8.691437802907917,
"grad_norm": 0.4031514823436737,
"learning_rate": 4.620842230664052e-06,
"loss": 0.0263,
"step": 5380
},
{
"epoch": 8.707592891760905,
"grad_norm": 0.4956108033657074,
"learning_rate": 4.509324814118754e-06,
"loss": 0.0219,
"step": 5390
},
{
"epoch": 8.723747980613894,
"grad_norm": 0.3220359981060028,
"learning_rate": 4.39910597079522e-06,
"loss": 0.0241,
"step": 5400
},
{
"epoch": 8.739903069466882,
"grad_norm": 0.2299569994211197,
"learning_rate": 4.290188846991866e-06,
"loss": 0.0255,
"step": 5410
},
{
"epoch": 8.75605815831987,
"grad_norm": 0.2428327053785324,
"learning_rate": 4.182576551848283e-06,
"loss": 0.0212,
"step": 5420
},
{
"epoch": 8.772213247172859,
"grad_norm": 0.42381399869918823,
"learning_rate": 4.076272157256577e-06,
"loss": 0.0218,
"step": 5430
},
{
"epoch": 8.788368336025847,
"grad_norm": 0.4133065342903137,
"learning_rate": 3.971278697773584e-06,
"loss": 0.024,
"step": 5440
},
{
"epoch": 8.804523424878838,
"grad_norm": 0.5403354167938232,
"learning_rate": 3.86759917053432e-06,
"loss": 0.0275,
"step": 5450
},
{
"epoch": 8.820678513731826,
"grad_norm": 0.4583636224269867,
"learning_rate": 3.765236535166361e-06,
"loss": 0.0239,
"step": 5460
},
{
"epoch": 8.836833602584814,
"grad_norm": 0.28426027297973633,
"learning_rate": 3.6641937137054382e-06,
"loss": 0.0212,
"step": 5470
},
{
"epoch": 8.852988691437803,
"grad_norm": 0.3920319676399231,
"learning_rate": 3.564473590511941e-06,
"loss": 0.0168,
"step": 5480
},
{
"epoch": 8.869143780290791,
"grad_norm": 0.28189295530319214,
"learning_rate": 3.4660790121886387e-06,
"loss": 0.0246,
"step": 5490
},
{
"epoch": 8.88529886914378,
"grad_norm": 0.24748258292675018,
"learning_rate": 3.369012787499387e-06,
"loss": 0.0185,
"step": 5500
},
{
"epoch": 8.901453957996768,
"grad_norm": 0.6315116286277771,
"learning_rate": 3.273277687288978e-06,
"loss": 0.025,
"step": 5510
},
{
"epoch": 8.917609046849758,
"grad_norm": 0.34694400429725647,
"learning_rate": 3.178876444404022e-06,
"loss": 0.0188,
"step": 5520
},
{
"epoch": 8.933764135702747,
"grad_norm": 0.38088199496269226,
"learning_rate": 3.0858117536149365e-06,
"loss": 0.0278,
"step": 5530
},
{
"epoch": 8.949919224555735,
"grad_norm": 0.2522503435611725,
"learning_rate": 2.9940862715390485e-06,
"loss": 0.021,
"step": 5540
},
{
"epoch": 8.966074313408724,
"grad_norm": 0.20651240646839142,
"learning_rate": 2.9037026165647186e-06,
"loss": 0.02,
"step": 5550
},
{
"epoch": 8.982229402261712,
"grad_norm": 0.24644720554351807,
"learning_rate": 2.8146633687766267e-06,
"loss": 0.0196,
"step": 5560
},
{
"epoch": 8.9983844911147,
"grad_norm": 0.26605224609375,
"learning_rate": 2.7269710698821004e-06,
"loss": 0.0205,
"step": 5570
},
{
"epoch": 9.014539579967689,
"grad_norm": 0.3996153473854065,
"learning_rate": 2.640628223138597e-06,
"loss": 0.0206,
"step": 5580
},
{
"epoch": 9.03069466882068,
"grad_norm": 0.2248448133468628,
"learning_rate": 2.555637293282187e-06,
"loss": 0.0236,
"step": 5590
},
{
"epoch": 9.046849757673668,
"grad_norm": 0.32477447390556335,
"learning_rate": 2.4720007064572504e-06,
"loss": 0.0195,
"step": 5600
},
{
"epoch": 9.063004846526656,
"grad_norm": 0.395511656999588,
"learning_rate": 2.389720850147181e-06,
"loss": 0.0286,
"step": 5610
},
{
"epoch": 9.079159935379645,
"grad_norm": 0.3446315824985504,
"learning_rate": 2.308800073106282e-06,
"loss": 0.0249,
"step": 5620
},
{
"epoch": 9.095315024232633,
"grad_norm": 0.2906045615673065,
"learning_rate": 2.2292406852926383e-06,
"loss": 0.0199,
"step": 5630
},
{
"epoch": 9.111470113085621,
"grad_norm": 0.250783771276474,
"learning_rate": 2.1510449578022674e-06,
"loss": 0.0241,
"step": 5640
},
{
"epoch": 9.12762520193861,
"grad_norm": 0.2330540418624878,
"learning_rate": 2.074215122804235e-06,
"loss": 0.021,
"step": 5650
},
{
"epoch": 9.1437802907916,
"grad_norm": 0.4786008894443512,
"learning_rate": 1.998753373476936e-06,
"loss": 0.0214,
"step": 5660
},
{
"epoch": 9.159935379644589,
"grad_norm": 0.30990907549858093,
"learning_rate": 1.924661863945498e-06,
"loss": 0.0229,
"step": 5670
},
{
"epoch": 9.176090468497577,
"grad_norm": 0.3807198107242584,
"learning_rate": 1.851942709220328e-06,
"loss": 0.024,
"step": 5680
},
{
"epoch": 9.192245557350565,
"grad_norm": 0.2552647590637207,
"learning_rate": 1.7805979851366505e-06,
"loss": 0.0262,
"step": 5690
},
{
"epoch": 9.208400646203554,
"grad_norm": 0.2620588541030884,
"learning_rate": 1.7106297282953376e-06,
"loss": 0.021,
"step": 5700
},
{
"epoch": 9.224555735056542,
"grad_norm": 0.3594263792037964,
"learning_rate": 1.642039936004719e-06,
"loss": 0.0248,
"step": 5710
},
{
"epoch": 9.24071082390953,
"grad_norm": 0.2621122896671295,
"learning_rate": 1.5748305662236007e-06,
"loss": 0.0262,
"step": 5720
},
{
"epoch": 9.256865912762521,
"grad_norm": 0.2171621173620224,
"learning_rate": 1.5090035375053268e-06,
"loss": 0.0274,
"step": 5730
},
{
"epoch": 9.27302100161551,
"grad_norm": 0.2977316975593567,
"learning_rate": 1.4445607289430784e-06,
"loss": 0.0235,
"step": 5740
},
{
"epoch": 9.289176090468498,
"grad_norm": 0.32373809814453125,
"learning_rate": 1.3815039801161721e-06,
"loss": 0.0229,
"step": 5750
},
{
"epoch": 9.305331179321486,
"grad_norm": 0.5977046489715576,
"learning_rate": 1.31983509103758e-06,
"loss": 0.0222,
"step": 5760
},
{
"epoch": 9.321486268174475,
"grad_norm": 0.3709489405155182,
"learning_rate": 1.2595558221025372e-06,
"loss": 0.0268,
"step": 5770
},
{
"epoch": 9.337641357027463,
"grad_norm": 0.5009976029396057,
"learning_rate": 1.2006678940383098e-06,
"loss": 0.0223,
"step": 5780
},
{
"epoch": 9.353796445880452,
"grad_norm": 0.5957422256469727,
"learning_rate": 1.1431729878550235e-06,
"loss": 0.0245,
"step": 5790
},
{
"epoch": 9.369951534733442,
"grad_norm": 0.30659547448158264,
"learning_rate": 1.0870727447977402e-06,
"loss": 0.0191,
"step": 5800
},
{
"epoch": 9.38610662358643,
"grad_norm": 0.41769152879714966,
"learning_rate": 1.0323687662995685e-06,
"loss": 0.0237,
"step": 5810
},
{
"epoch": 9.402261712439419,
"grad_norm": 0.2959621846675873,
"learning_rate": 9.79062613935955e-07,
"loss": 0.0205,
"step": 5820
},
{
"epoch": 9.418416801292407,
"grad_norm": 0.2948278486728668,
"learning_rate": 9.271558093801202e-07,
"loss": 0.0217,
"step": 5830
},
{
"epoch": 9.434571890145396,
"grad_norm": 0.30144789814949036,
"learning_rate": 8.766498343596052e-07,
"loss": 0.0211,
"step": 5840
},
{
"epoch": 9.450726978998384,
"grad_norm": 0.23101864755153656,
"learning_rate": 8.275461306139876e-07,
"loss": 0.0265,
"step": 5850
},
{
"epoch": 9.466882067851373,
"grad_norm": 0.20476695895195007,
"learning_rate": 7.79846099853715e-07,
"loss": 0.0254,
"step": 5860
},
{
"epoch": 9.483037156704363,
"grad_norm": 0.16762420535087585,
"learning_rate": 7.335511037200982e-07,
"loss": 0.0198,
"step": 5870
},
{
"epoch": 9.499192245557351,
"grad_norm": 0.2673914134502411,
"learning_rate": 6.886624637464422e-07,
"loss": 0.0255,
"step": 5880
},
{
"epoch": 9.51534733441034,
"grad_norm": 0.20123161375522614,
"learning_rate": 6.451814613203211e-07,
"loss": 0.0227,
"step": 5890
},
{
"epoch": 9.531502423263328,
"grad_norm": 1.0308411121368408,
"learning_rate": 6.031093376469899e-07,
"loss": 0.0249,
"step": 5900
},
{
"epoch": 9.547657512116317,
"grad_norm": 0.38134875893592834,
"learning_rate": 5.624472937139802e-07,
"loss": 0.0192,
"step": 5910
},
{
"epoch": 9.563812600969305,
"grad_norm": 0.685291588306427,
"learning_rate": 5.231964902567721e-07,
"loss": 0.0261,
"step": 5920
},
{
"epoch": 9.579967689822293,
"grad_norm": 0.33756789565086365,
"learning_rate": 4.853580477257203e-07,
"loss": 0.0211,
"step": 5930
},
{
"epoch": 9.596122778675284,
"grad_norm": 0.21888208389282227,
"learning_rate": 4.489330462540076e-07,
"loss": 0.0228,
"step": 5940
},
{
"epoch": 9.612277867528272,
"grad_norm": 0.2490539699792862,
"learning_rate": 4.139225256268475e-07,
"loss": 0.0216,
"step": 5950
},
{
"epoch": 9.62843295638126,
"grad_norm": 0.4339372515678406,
"learning_rate": 3.8032748525179685e-07,
"loss": 0.017,
"step": 5960
},
{
"epoch": 9.644588045234249,
"grad_norm": 0.29505810141563416,
"learning_rate": 3.481488841302283e-07,
"loss": 0.0231,
"step": 5970
},
{
"epoch": 9.660743134087237,
"grad_norm": 0.6434254050254822,
"learning_rate": 3.17387640829947e-07,
"loss": 0.0197,
"step": 5980
},
{
"epoch": 9.676898222940226,
"grad_norm": 0.42414963245391846,
"learning_rate": 2.880446334589837e-07,
"loss": 0.023,
"step": 5990
},
{
"epoch": 9.693053311793214,
"grad_norm": 0.35125264525413513,
"learning_rate": 2.601206996404981e-07,
"loss": 0.0241,
"step": 6000
},
{
"epoch": 9.709208400646204,
"grad_norm": 0.2720375657081604,
"learning_rate": 2.336166364889092e-07,
"loss": 0.0194,
"step": 6010
},
{
"epoch": 9.725363489499193,
"grad_norm": 0.36113637685775757,
"learning_rate": 2.0853320058710214e-07,
"loss": 0.0235,
"step": 6020
},
{
"epoch": 9.741518578352181,
"grad_norm": 0.43229228258132935,
"learning_rate": 1.848711079648624e-07,
"loss": 0.0202,
"step": 6030
},
{
"epoch": 9.75767366720517,
"grad_norm": 0.3741385042667389,
"learning_rate": 1.626310340784143e-07,
"loss": 0.0235,
"step": 6040
},
{
"epoch": 9.773828756058158,
"grad_norm": 0.6551222801208496,
"learning_rate": 1.4181361379115855e-07,
"loss": 0.021,
"step": 6050
},
{
"epoch": 9.789983844911147,
"grad_norm": 0.22424794733524323,
"learning_rate": 1.2241944135552574e-07,
"loss": 0.0203,
"step": 6060
},
{
"epoch": 9.806138933764135,
"grad_norm": 0.40934470295906067,
"learning_rate": 1.044490703960288e-07,
"loss": 0.0198,
"step": 6070
},
{
"epoch": 9.822294022617124,
"grad_norm": 0.4585173428058624,
"learning_rate": 8.79030138934589e-08,
"loss": 0.0232,
"step": 6080
},
{
"epoch": 9.838449111470114,
"grad_norm": 0.2896837592124939,
"learning_rate": 7.278174417024164e-08,
"loss": 0.0207,
"step": 6090
},
{
"epoch": 9.854604200323102,
"grad_norm": 0.2858301103115082,
"learning_rate": 5.908569287694787e-08,
"loss": 0.0168,
"step": 6100
},
{
"epoch": 9.87075928917609,
"grad_norm": 0.37577348947525024,
"learning_rate": 4.6815250979970195e-08,
"loss": 0.022,
"step": 6110
},
{
"epoch": 9.88691437802908,
"grad_norm": 0.18759319186210632,
"learning_rate": 3.5970768750387405e-08,
"loss": 0.0218,
"step": 6120
},
{
"epoch": 9.903069466882068,
"grad_norm": 0.32479435205459595,
"learning_rate": 2.6552555753917017e-08,
"loss": 0.0237,
"step": 6130
},
{
"epoch": 9.919224555735056,
"grad_norm": 0.24623480439186096,
"learning_rate": 1.8560880842133366e-08,
"loss": 0.0184,
"step": 6140
},
{
"epoch": 9.935379644588044,
"grad_norm": 0.22339710593223572,
"learning_rate": 1.1995972144757116e-08,
"loss": 0.0186,
"step": 6150
},
{
"epoch": 9.951534733441035,
"grad_norm": 0.3188174366950989,
"learning_rate": 6.858017063149369e-09,
"loss": 0.0235,
"step": 6160
},
{
"epoch": 9.967689822294023,
"grad_norm": 0.31466570496559143,
"learning_rate": 3.1471622649714703e-09,
"loss": 0.0245,
"step": 6170
},
{
"epoch": 9.983844911147012,
"grad_norm": 0.3918437659740448,
"learning_rate": 8.635136799939325e-10,
"loss": 0.0212,
"step": 6180
},
{
"epoch": 10.0,
"grad_norm": 0.3111129403114319,
"learning_rate": 7.136497065518555e-12,
"loss": 0.0254,
"step": 6190
},
{
"epoch": 10.0,
"step": 6190,
"total_flos": 0.0,
"train_loss": 0.05139205720341841,
"train_runtime": 6075.3943,
"train_samples_per_second": 32.592,
"train_steps_per_second": 1.019
}
],
"logging_steps": 10,
"max_steps": 6190,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}