{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 6190, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01615508885298869, "grad_norm": 13.517727851867676, "learning_rate": 2.903225806451613e-06, "loss": 1.0539, "step": 10 }, { "epoch": 0.03231017770597738, "grad_norm": 2.3650765419006348, "learning_rate": 6.129032258064516e-06, "loss": 0.5986, "step": 20 }, { "epoch": 0.048465266558966075, "grad_norm": 3.7127814292907715, "learning_rate": 9.35483870967742e-06, "loss": 0.325, "step": 30 }, { "epoch": 0.06462035541195477, "grad_norm": 2.470418691635132, "learning_rate": 1.2580645161290322e-05, "loss": 0.2792, "step": 40 }, { "epoch": 0.08077544426494346, "grad_norm": 1.4894506931304932, "learning_rate": 1.5806451612903226e-05, "loss": 0.2738, "step": 50 }, { "epoch": 0.09693053311793215, "grad_norm": 1.6425580978393555, "learning_rate": 1.9032258064516127e-05, "loss": 0.2348, "step": 60 }, { "epoch": 0.11308562197092084, "grad_norm": 9.821566581726074, "learning_rate": 2.2258064516129034e-05, "loss": 0.2049, "step": 70 }, { "epoch": 0.12924071082390953, "grad_norm": 2.8060154914855957, "learning_rate": 2.5483870967741935e-05, "loss": 0.2037, "step": 80 }, { "epoch": 0.14539579967689822, "grad_norm": 1.5872341394424438, "learning_rate": 2.8709677419354843e-05, "loss": 0.1972, "step": 90 }, { "epoch": 0.16155088852988692, "grad_norm": 5.6780219078063965, "learning_rate": 3.193548387096774e-05, "loss": 0.1868, "step": 100 }, { "epoch": 0.1777059773828756, "grad_norm": 0.8519928455352783, "learning_rate": 3.516129032258065e-05, "loss": 0.1799, "step": 110 }, { "epoch": 0.1938610662358643, "grad_norm": 0.983458399772644, "learning_rate": 3.838709677419355e-05, "loss": 0.1686, "step": 120 }, { "epoch": 0.210016155088853, "grad_norm": 1.8987292051315308, "learning_rate": 4.161290322580645e-05, "loss": 0.1778, "step": 130 }, { "epoch": 0.22617124394184168, "grad_norm": 1.6562193632125854, "learning_rate": 4.4838709677419356e-05, "loss": 0.1735, "step": 140 }, { "epoch": 0.24232633279483037, "grad_norm": 1.770867109298706, "learning_rate": 4.806451612903226e-05, "loss": 0.171, "step": 150 }, { "epoch": 0.25848142164781907, "grad_norm": 1.1404958963394165, "learning_rate": 5.1290322580645164e-05, "loss": 0.1575, "step": 160 }, { "epoch": 0.27463651050080773, "grad_norm": 1.507441759109497, "learning_rate": 5.451612903225807e-05, "loss": 0.1526, "step": 170 }, { "epoch": 0.29079159935379645, "grad_norm": 1.0781203508377075, "learning_rate": 5.7741935483870965e-05, "loss": 0.1508, "step": 180 }, { "epoch": 0.3069466882067851, "grad_norm": 1.5736271142959595, "learning_rate": 6.096774193548387e-05, "loss": 0.1368, "step": 190 }, { "epoch": 0.32310177705977383, "grad_norm": 1.4114209413528442, "learning_rate": 6.419354838709679e-05, "loss": 0.1326, "step": 200 }, { "epoch": 0.3392568659127625, "grad_norm": 1.3289586305618286, "learning_rate": 6.741935483870968e-05, "loss": 0.1338, "step": 210 }, { "epoch": 0.3554119547657512, "grad_norm": 0.8117440342903137, "learning_rate": 7.064516129032258e-05, "loss": 0.1326, "step": 220 }, { "epoch": 0.3715670436187399, "grad_norm": 1.1739834547042847, "learning_rate": 7.387096774193549e-05, "loss": 0.1102, "step": 230 }, { "epoch": 0.3877221324717286, "grad_norm": 1.4124845266342163, "learning_rate": 7.709677419354839e-05, "loss": 0.1204, "step": 240 }, { "epoch": 0.40387722132471726, "grad_norm": 0.8694249987602234, "learning_rate": 8.03225806451613e-05, "loss": 0.1075, "step": 250 }, { "epoch": 0.420032310177706, "grad_norm": 1.367783546447754, "learning_rate": 8.35483870967742e-05, "loss": 0.1086, "step": 260 }, { "epoch": 0.43618739903069464, "grad_norm": 1.4387221336364746, "learning_rate": 8.677419354838711e-05, "loss": 0.1285, "step": 270 }, { "epoch": 0.45234248788368336, "grad_norm": 0.9508649706840515, "learning_rate": 9e-05, "loss": 0.1077, "step": 280 }, { "epoch": 0.46849757673667203, "grad_norm": 0.9184303283691406, "learning_rate": 9.32258064516129e-05, "loss": 0.1263, "step": 290 }, { "epoch": 0.48465266558966075, "grad_norm": 1.1463005542755127, "learning_rate": 9.645161290322581e-05, "loss": 0.1027, "step": 300 }, { "epoch": 0.5008077544426495, "grad_norm": 1.938699722290039, "learning_rate": 9.967741935483872e-05, "loss": 0.1061, "step": 310 }, { "epoch": 0.5169628432956381, "grad_norm": 0.9912849068641663, "learning_rate": 9.999942194483773e-05, "loss": 0.1036, "step": 320 }, { "epoch": 0.5331179321486268, "grad_norm": 1.1873068809509277, "learning_rate": 9.999742374662181e-05, "loss": 0.0954, "step": 330 }, { "epoch": 0.5492730210016155, "grad_norm": 1.0425370931625366, "learning_rate": 9.999399832589556e-05, "loss": 0.0923, "step": 340 }, { "epoch": 0.5654281098546042, "grad_norm": 1.1135231256484985, "learning_rate": 9.998914578044079e-05, "loss": 0.0958, "step": 350 }, { "epoch": 0.5815831987075929, "grad_norm": 0.9654638767242432, "learning_rate": 9.998286624877786e-05, "loss": 0.1026, "step": 360 }, { "epoch": 0.5977382875605816, "grad_norm": 1.106973648071289, "learning_rate": 9.99751599101618e-05, "loss": 0.0945, "step": 370 }, { "epoch": 0.6138933764135702, "grad_norm": 1.0972684621810913, "learning_rate": 9.996602698457715e-05, "loss": 0.0857, "step": 380 }, { "epoch": 0.630048465266559, "grad_norm": 0.9330363869667053, "learning_rate": 9.995546773273166e-05, "loss": 0.0908, "step": 390 }, { "epoch": 0.6462035541195477, "grad_norm": 0.9228382706642151, "learning_rate": 9.994348245604892e-05, "loss": 0.0929, "step": 400 }, { "epoch": 0.6623586429725363, "grad_norm": 1.4199814796447754, "learning_rate": 9.993007149665967e-05, "loss": 0.1023, "step": 410 }, { "epoch": 0.678513731825525, "grad_norm": 1.0425035953521729, "learning_rate": 9.991523523739211e-05, "loss": 0.0924, "step": 420 }, { "epoch": 0.6946688206785138, "grad_norm": 0.9444074034690857, "learning_rate": 9.989897410176093e-05, "loss": 0.0961, "step": 430 }, { "epoch": 0.7108239095315024, "grad_norm": 0.8055535554885864, "learning_rate": 9.988128855395523e-05, "loss": 0.0891, "step": 440 }, { "epoch": 0.7269789983844911, "grad_norm": 1.0856647491455078, "learning_rate": 9.986217909882522e-05, "loss": 0.0849, "step": 450 }, { "epoch": 0.7431340872374798, "grad_norm": 0.8828626275062561, "learning_rate": 9.984164628186796e-05, "loss": 0.0893, "step": 460 }, { "epoch": 0.7592891760904685, "grad_norm": 0.7011072039604187, "learning_rate": 9.981969068921158e-05, "loss": 0.0951, "step": 470 }, { "epoch": 0.7754442649434572, "grad_norm": 0.6536422967910767, "learning_rate": 9.979631294759871e-05, "loss": 0.0805, "step": 480 }, { "epoch": 0.7915993537964459, "grad_norm": 0.6991639733314514, "learning_rate": 9.97715137243685e-05, "loss": 0.0809, "step": 490 }, { "epoch": 0.8077544426494345, "grad_norm": 0.9698547124862671, "learning_rate": 9.974529372743761e-05, "loss": 0.0875, "step": 500 }, { "epoch": 0.8239095315024233, "grad_norm": 0.7342029809951782, "learning_rate": 9.971765370528006e-05, "loss": 0.0821, "step": 510 }, { "epoch": 0.840064620355412, "grad_norm": 0.5005660057067871, "learning_rate": 9.968859444690567e-05, "loss": 0.0748, "step": 520 }, { "epoch": 0.8562197092084006, "grad_norm": 0.5115198493003845, "learning_rate": 9.965811678183777e-05, "loss": 0.0804, "step": 530 }, { "epoch": 0.8723747980613893, "grad_norm": 0.7139051556587219, "learning_rate": 9.962622158008938e-05, "loss": 0.0686, "step": 540 }, { "epoch": 0.8885298869143781, "grad_norm": 0.5260514616966248, "learning_rate": 9.959290975213841e-05, "loss": 0.0831, "step": 550 }, { "epoch": 0.9046849757673667, "grad_norm": 0.5752175450325012, "learning_rate": 9.955818224890165e-05, "loss": 0.0656, "step": 560 }, { "epoch": 0.9208400646203554, "grad_norm": 0.6161171197891235, "learning_rate": 9.952204006170771e-05, "loss": 0.0697, "step": 570 }, { "epoch": 0.9369951534733441, "grad_norm": 0.935058057308197, "learning_rate": 9.948448422226856e-05, "loss": 0.0774, "step": 580 }, { "epoch": 0.9531502423263328, "grad_norm": 1.006998062133789, "learning_rate": 9.944551580265026e-05, "loss": 0.0788, "step": 590 }, { "epoch": 0.9693053311793215, "grad_norm": 0.9937463998794556, "learning_rate": 9.940513591524222e-05, "loss": 0.075, "step": 600 }, { "epoch": 0.9854604200323102, "grad_norm": 0.840084433555603, "learning_rate": 9.936334571272554e-05, "loss": 0.0805, "step": 610 }, { "epoch": 1.001615508885299, "grad_norm": 0.9836556315422058, "learning_rate": 9.932014638804001e-05, "loss": 0.0753, "step": 620 }, { "epoch": 1.0177705977382876, "grad_norm": 0.7406233549118042, "learning_rate": 9.927553917435017e-05, "loss": 0.0695, "step": 630 }, { "epoch": 1.0339256865912763, "grad_norm": 0.8061002492904663, "learning_rate": 9.922952534501002e-05, "loss": 0.0682, "step": 640 }, { "epoch": 1.050080775444265, "grad_norm": 0.6358613967895508, "learning_rate": 9.918210621352668e-05, "loss": 0.077, "step": 650 }, { "epoch": 1.0662358642972536, "grad_norm": 0.6549187898635864, "learning_rate": 9.913328313352292e-05, "loss": 0.0739, "step": 660 }, { "epoch": 1.0823909531502423, "grad_norm": 0.8390158414840698, "learning_rate": 9.908305749869858e-05, "loss": 0.0883, "step": 670 }, { "epoch": 1.098546042003231, "grad_norm": 0.942304253578186, "learning_rate": 9.90314307427906e-05, "loss": 0.0788, "step": 680 }, { "epoch": 1.1147011308562198, "grad_norm": 1.1538914442062378, "learning_rate": 9.897840433953234e-05, "loss": 0.0766, "step": 690 }, { "epoch": 1.1308562197092085, "grad_norm": 0.6114380359649658, "learning_rate": 9.892397980261128e-05, "loss": 0.0754, "step": 700 }, { "epoch": 1.1470113085621971, "grad_norm": 0.9622769355773926, "learning_rate": 9.886815868562596e-05, "loss": 0.0824, "step": 710 }, { "epoch": 1.1631663974151858, "grad_norm": 0.6100155115127563, "learning_rate": 9.88109425820416e-05, "loss": 0.067, "step": 720 }, { "epoch": 1.1793214862681745, "grad_norm": 0.5996105670928955, "learning_rate": 9.875233312514454e-05, "loss": 0.0663, "step": 730 }, { "epoch": 1.1954765751211631, "grad_norm": 0.5155414342880249, "learning_rate": 9.869233198799572e-05, "loss": 0.0629, "step": 740 }, { "epoch": 1.2116316639741518, "grad_norm": 0.6942029595375061, "learning_rate": 9.863094088338288e-05, "loss": 0.0764, "step": 750 }, { "epoch": 1.2277867528271407, "grad_norm": 0.5376043319702148, "learning_rate": 9.856816156377163e-05, "loss": 0.0675, "step": 760 }, { "epoch": 1.2439418416801293, "grad_norm": 0.6686906814575195, "learning_rate": 9.850399582125548e-05, "loss": 0.0767, "step": 770 }, { "epoch": 1.260096930533118, "grad_norm": 0.768054723739624, "learning_rate": 9.843844548750464e-05, "loss": 0.0716, "step": 780 }, { "epoch": 1.2762520193861067, "grad_norm": 0.6787708401679993, "learning_rate": 9.837151243371376e-05, "loss": 0.0672, "step": 790 }, { "epoch": 1.2924071082390953, "grad_norm": 0.500952959060669, "learning_rate": 9.830319857054852e-05, "loss": 0.0702, "step": 800 }, { "epoch": 1.308562197092084, "grad_norm": 0.6068538427352905, "learning_rate": 9.823350584809105e-05, "loss": 0.0738, "step": 810 }, { "epoch": 1.3247172859450727, "grad_norm": 0.6218283176422119, "learning_rate": 9.816243625578432e-05, "loss": 0.0756, "step": 820 }, { "epoch": 1.3408723747980613, "grad_norm": 0.7377462983131409, "learning_rate": 9.808999182237528e-05, "loss": 0.0692, "step": 830 }, { "epoch": 1.35702746365105, "grad_norm": 0.5580537915229797, "learning_rate": 9.8016174615857e-05, "loss": 0.0633, "step": 840 }, { "epoch": 1.3731825525040389, "grad_norm": 0.615639328956604, "learning_rate": 9.794098674340965e-05, "loss": 0.0718, "step": 850 }, { "epoch": 1.3893376413570275, "grad_norm": 0.7121309041976929, "learning_rate": 9.78644303513403e-05, "loss": 0.0633, "step": 860 }, { "epoch": 1.4054927302100162, "grad_norm": 0.5688542127609253, "learning_rate": 9.778650762502166e-05, "loss": 0.0678, "step": 870 }, { "epoch": 1.4216478190630049, "grad_norm": 0.5155729651451111, "learning_rate": 9.770722078882973e-05, "loss": 0.0665, "step": 880 }, { "epoch": 1.4378029079159935, "grad_norm": 0.48947158455848694, "learning_rate": 9.762657210608029e-05, "loss": 0.0657, "step": 890 }, { "epoch": 1.4539579967689822, "grad_norm": 0.7648037075996399, "learning_rate": 9.754456387896422e-05, "loss": 0.0707, "step": 900 }, { "epoch": 1.4701130856219708, "grad_norm": 0.8872023224830627, "learning_rate": 9.746119844848195e-05, "loss": 0.062, "step": 910 }, { "epoch": 1.4862681744749597, "grad_norm": 1.083450436592102, "learning_rate": 9.737647819437645e-05, "loss": 0.0728, "step": 920 }, { "epoch": 1.5024232633279482, "grad_norm": 0.6818684339523315, "learning_rate": 9.729040553506539e-05, "loss": 0.0637, "step": 930 }, { "epoch": 1.518578352180937, "grad_norm": 0.7897723913192749, "learning_rate": 9.720298292757215e-05, "loss": 0.0682, "step": 940 }, { "epoch": 1.5347334410339257, "grad_norm": 0.46110132336616516, "learning_rate": 9.711421286745555e-05, "loss": 0.0726, "step": 950 }, { "epoch": 1.5508885298869144, "grad_norm": 0.4637523293495178, "learning_rate": 9.70240978887387e-05, "loss": 0.0622, "step": 960 }, { "epoch": 1.567043618739903, "grad_norm": 0.7092505693435669, "learning_rate": 9.69326405638367e-05, "loss": 0.0592, "step": 970 }, { "epoch": 1.5831987075928917, "grad_norm": 0.5965023040771484, "learning_rate": 9.683984350348312e-05, "loss": 0.0697, "step": 980 }, { "epoch": 1.5993537964458806, "grad_norm": 0.5201593041419983, "learning_rate": 9.67457093566555e-05, "loss": 0.0706, "step": 990 }, { "epoch": 1.615508885298869, "grad_norm": 0.6693015098571777, "learning_rate": 9.665024081049977e-05, "loss": 0.0653, "step": 1000 }, { "epoch": 1.631663974151858, "grad_norm": 0.7377516627311707, "learning_rate": 9.655344059025351e-05, "loss": 0.061, "step": 1010 }, { "epoch": 1.6478190630048464, "grad_norm": 0.6737310290336609, "learning_rate": 9.645531145916817e-05, "loss": 0.0552, "step": 1020 }, { "epoch": 1.6639741518578353, "grad_norm": 0.6933907866477966, "learning_rate": 9.635585621843018e-05, "loss": 0.0671, "step": 1030 }, { "epoch": 1.680129240710824, "grad_norm": 0.6938374638557434, "learning_rate": 9.625507770708097e-05, "loss": 0.068, "step": 1040 }, { "epoch": 1.6962843295638126, "grad_norm": 0.5320965051651001, "learning_rate": 9.615297880193598e-05, "loss": 0.0632, "step": 1050 }, { "epoch": 1.7124394184168013, "grad_norm": 0.6312500238418579, "learning_rate": 9.60495624175025e-05, "loss": 0.0706, "step": 1060 }, { "epoch": 1.72859450726979, "grad_norm": 0.5120170712471008, "learning_rate": 9.594483150589646e-05, "loss": 0.0706, "step": 1070 }, { "epoch": 1.7447495961227788, "grad_norm": 0.6575292348861694, "learning_rate": 9.58387890567582e-05, "loss": 0.066, "step": 1080 }, { "epoch": 1.7609046849757672, "grad_norm": 0.8916189670562744, "learning_rate": 9.573143809716711e-05, "loss": 0.0572, "step": 1090 }, { "epoch": 1.7770597738287561, "grad_norm": 0.7182980179786682, "learning_rate": 9.562278169155518e-05, "loss": 0.061, "step": 1100 }, { "epoch": 1.7932148626817448, "grad_norm": 0.5273639559745789, "learning_rate": 9.551282294161962e-05, "loss": 0.0564, "step": 1110 }, { "epoch": 1.8093699515347335, "grad_norm": 0.5014919638633728, "learning_rate": 9.540156498623418e-05, "loss": 0.0674, "step": 1120 }, { "epoch": 1.8255250403877221, "grad_norm": 0.49997884035110474, "learning_rate": 9.528901100135971e-05, "loss": 0.0719, "step": 1130 }, { "epoch": 1.8416801292407108, "grad_norm": 0.5391654968261719, "learning_rate": 9.517516419995335e-05, "loss": 0.0634, "step": 1140 }, { "epoch": 1.8578352180936997, "grad_norm": 0.5763863921165466, "learning_rate": 9.506002783187691e-05, "loss": 0.0622, "step": 1150 }, { "epoch": 1.8739903069466881, "grad_norm": 0.5951936841011047, "learning_rate": 9.494360518380405e-05, "loss": 0.066, "step": 1160 }, { "epoch": 1.890145395799677, "grad_norm": 0.7027397751808167, "learning_rate": 9.482589957912651e-05, "loss": 0.0623, "step": 1170 }, { "epoch": 1.9063004846526654, "grad_norm": 0.6542057991027832, "learning_rate": 9.470691437785918e-05, "loss": 0.0635, "step": 1180 }, { "epoch": 1.9224555735056543, "grad_norm": 0.39720842242240906, "learning_rate": 9.45866529765442e-05, "loss": 0.064, "step": 1190 }, { "epoch": 1.938610662358643, "grad_norm": 0.47394442558288574, "learning_rate": 9.446511880815407e-05, "loss": 0.0595, "step": 1200 }, { "epoch": 1.9547657512116317, "grad_norm": 0.3531631529331207, "learning_rate": 9.434231534199356e-05, "loss": 0.0583, "step": 1210 }, { "epoch": 1.9709208400646203, "grad_norm": 0.6005557775497437, "learning_rate": 9.421824608360068e-05, "loss": 0.0599, "step": 1220 }, { "epoch": 1.987075928917609, "grad_norm": 0.5101392269134521, "learning_rate": 9.409291457464672e-05, "loss": 0.0617, "step": 1230 }, { "epoch": 2.003231017770598, "grad_norm": 0.42682531476020813, "learning_rate": 9.396632439283501e-05, "loss": 0.0554, "step": 1240 }, { "epoch": 2.0193861066235863, "grad_norm": 0.6450132727622986, "learning_rate": 9.383847915179892e-05, "loss": 0.0677, "step": 1250 }, { "epoch": 2.035541195476575, "grad_norm": 0.399069607257843, "learning_rate": 9.370938250099857e-05, "loss": 0.0618, "step": 1260 }, { "epoch": 2.0516962843295636, "grad_norm": 0.4468577802181244, "learning_rate": 9.357903812561679e-05, "loss": 0.0685, "step": 1270 }, { "epoch": 2.0678513731825525, "grad_norm": 0.559262752532959, "learning_rate": 9.344744974645381e-05, "loss": 0.0637, "step": 1280 }, { "epoch": 2.0840064620355414, "grad_norm": 0.5825755596160889, "learning_rate": 9.33146211198211e-05, "loss": 0.0625, "step": 1290 }, { "epoch": 2.10016155088853, "grad_norm": 1.090774655342102, "learning_rate": 9.318055603743418e-05, "loss": 0.0691, "step": 1300 }, { "epoch": 2.1163166397415187, "grad_norm": 0.3790472149848938, "learning_rate": 9.304525832630426e-05, "loss": 0.0572, "step": 1310 }, { "epoch": 2.132471728594507, "grad_norm": 0.46555569767951965, "learning_rate": 9.290873184862917e-05, "loss": 0.0611, "step": 1320 }, { "epoch": 2.148626817447496, "grad_norm": 0.5333315134048462, "learning_rate": 9.277098050168293e-05, "loss": 0.0554, "step": 1330 }, { "epoch": 2.1647819063004845, "grad_norm": 0.5820637345314026, "learning_rate": 9.263200821770461e-05, "loss": 0.0593, "step": 1340 }, { "epoch": 2.1809369951534734, "grad_norm": 0.5108340978622437, "learning_rate": 9.249181896378607e-05, "loss": 0.0561, "step": 1350 }, { "epoch": 2.197092084006462, "grad_norm": 0.44887450337409973, "learning_rate": 9.235041674175868e-05, "loss": 0.0608, "step": 1360 }, { "epoch": 2.2132471728594507, "grad_norm": 0.462615042924881, "learning_rate": 9.22078055880791e-05, "loss": 0.0495, "step": 1370 }, { "epoch": 2.2294022617124396, "grad_norm": 0.48509976267814636, "learning_rate": 9.206398957371406e-05, "loss": 0.0589, "step": 1380 }, { "epoch": 2.245557350565428, "grad_norm": 0.48090824484825134, "learning_rate": 9.191897280402415e-05, "loss": 0.0521, "step": 1390 }, { "epoch": 2.261712439418417, "grad_norm": 0.5474804043769836, "learning_rate": 9.177275941864662e-05, "loss": 0.0591, "step": 1400 }, { "epoch": 2.2778675282714054, "grad_norm": 0.6736873984336853, "learning_rate": 9.162535359137725e-05, "loss": 0.0532, "step": 1410 }, { "epoch": 2.2940226171243943, "grad_norm": 0.4108855426311493, "learning_rate": 9.147675953005112e-05, "loss": 0.0608, "step": 1420 }, { "epoch": 2.3101777059773827, "grad_norm": 0.6929683685302734, "learning_rate": 9.132698147642258e-05, "loss": 0.0572, "step": 1430 }, { "epoch": 2.3263327948303716, "grad_norm": 0.662588357925415, "learning_rate": 9.117602370604412e-05, "loss": 0.0606, "step": 1440 }, { "epoch": 2.3424878836833605, "grad_norm": 0.6598329544067383, "learning_rate": 9.102389052814435e-05, "loss": 0.0617, "step": 1450 }, { "epoch": 2.358642972536349, "grad_norm": 0.6328267455101013, "learning_rate": 9.087058628550492e-05, "loss": 0.0635, "step": 1460 }, { "epoch": 2.374798061389338, "grad_norm": 0.7304327487945557, "learning_rate": 9.071611535433665e-05, "loss": 0.0636, "step": 1470 }, { "epoch": 2.3909531502423262, "grad_norm": 0.7994436621665955, "learning_rate": 9.056048214415456e-05, "loss": 0.0682, "step": 1480 }, { "epoch": 2.407108239095315, "grad_norm": 0.5563200116157532, "learning_rate": 9.040369109765196e-05, "loss": 0.0602, "step": 1490 }, { "epoch": 2.4232633279483036, "grad_norm": 0.862169623374939, "learning_rate": 9.024574669057368e-05, "loss": 0.0694, "step": 1500 }, { "epoch": 2.4394184168012925, "grad_norm": 0.5530250668525696, "learning_rate": 9.00866534315883e-05, "loss": 0.0621, "step": 1510 }, { "epoch": 2.4555735056542813, "grad_norm": 0.5109930634498596, "learning_rate": 8.992641586215944e-05, "loss": 0.0568, "step": 1520 }, { "epoch": 2.47172859450727, "grad_norm": 0.772769570350647, "learning_rate": 8.97650385564161e-05, "loss": 0.0634, "step": 1530 }, { "epoch": 2.4878836833602587, "grad_norm": 0.7233314514160156, "learning_rate": 8.960252612102209e-05, "loss": 0.0682, "step": 1540 }, { "epoch": 2.504038772213247, "grad_norm": 0.9270318746566772, "learning_rate": 8.943888319504457e-05, "loss": 0.0616, "step": 1550 }, { "epoch": 2.520193861066236, "grad_norm": 1.1452592611312866, "learning_rate": 8.927411444982157e-05, "loss": 0.0536, "step": 1560 }, { "epoch": 2.5363489499192244, "grad_norm": 0.8335738778114319, "learning_rate": 8.91082245888287e-05, "loss": 0.0588, "step": 1570 }, { "epoch": 2.5525040387722133, "grad_norm": 0.5370670557022095, "learning_rate": 8.894121834754481e-05, "loss": 0.0593, "step": 1580 }, { "epoch": 2.568659127625202, "grad_norm": 0.5532761216163635, "learning_rate": 8.877310049331691e-05, "loss": 0.0601, "step": 1590 }, { "epoch": 2.5848142164781907, "grad_norm": 0.6287941932678223, "learning_rate": 8.860387582522397e-05, "loss": 0.0627, "step": 1600 }, { "epoch": 2.600969305331179, "grad_norm": 0.6329537034034729, "learning_rate": 8.843354917394e-05, "loss": 0.0572, "step": 1610 }, { "epoch": 2.617124394184168, "grad_norm": 0.4902884364128113, "learning_rate": 8.826212540159615e-05, "loss": 0.0528, "step": 1620 }, { "epoch": 2.633279483037157, "grad_norm": 0.4139235019683838, "learning_rate": 8.808960940164188e-05, "loss": 0.0591, "step": 1630 }, { "epoch": 2.6494345718901453, "grad_norm": 0.481642484664917, "learning_rate": 8.79160060987053e-05, "loss": 0.063, "step": 1640 }, { "epoch": 2.665589660743134, "grad_norm": 0.575612485408783, "learning_rate": 8.77413204484526e-05, "loss": 0.0682, "step": 1650 }, { "epoch": 2.6817447495961226, "grad_norm": 0.7415863871574402, "learning_rate": 8.756555743744655e-05, "loss": 0.0488, "step": 1660 }, { "epoch": 2.6978998384491115, "grad_norm": 0.614101767539978, "learning_rate": 8.738872208300417e-05, "loss": 0.0627, "step": 1670 }, { "epoch": 2.7140549273021, "grad_norm": 0.5911862850189209, "learning_rate": 8.721081943305356e-05, "loss": 0.0622, "step": 1680 }, { "epoch": 2.730210016155089, "grad_norm": 0.5863639116287231, "learning_rate": 8.703185456598968e-05, "loss": 0.0598, "step": 1690 }, { "epoch": 2.7463651050080777, "grad_norm": 0.6773284077644348, "learning_rate": 8.685183259052952e-05, "loss": 0.0591, "step": 1700 }, { "epoch": 2.762520193861066, "grad_norm": 0.48102864623069763, "learning_rate": 8.667075864556615e-05, "loss": 0.0554, "step": 1710 }, { "epoch": 2.778675282714055, "grad_norm": 0.6997978687286377, "learning_rate": 8.648863790002213e-05, "loss": 0.0605, "step": 1720 }, { "epoch": 2.7948303715670435, "grad_norm": 0.6587175130844116, "learning_rate": 8.630547555270188e-05, "loss": 0.064, "step": 1730 }, { "epoch": 2.8109854604200324, "grad_norm": 0.8421849608421326, "learning_rate": 8.612127683214329e-05, "loss": 0.0523, "step": 1740 }, { "epoch": 2.827140549273021, "grad_norm": 0.3728615939617157, "learning_rate": 8.59360469964685e-05, "loss": 0.057, "step": 1750 }, { "epoch": 2.8432956381260097, "grad_norm": 0.6552137732505798, "learning_rate": 8.574979133323377e-05, "loss": 0.0605, "step": 1760 }, { "epoch": 2.8594507269789986, "grad_norm": 0.7351179718971252, "learning_rate": 8.556251515927855e-05, "loss": 0.0566, "step": 1770 }, { "epoch": 2.875605815831987, "grad_norm": 0.5557317733764648, "learning_rate": 8.537422382057374e-05, "loss": 0.0531, "step": 1780 }, { "epoch": 2.891760904684976, "grad_norm": 0.5497432351112366, "learning_rate": 8.518492269206899e-05, "loss": 0.0588, "step": 1790 }, { "epoch": 2.9079159935379644, "grad_norm": 0.6453426480293274, "learning_rate": 8.499461717753939e-05, "loss": 0.0589, "step": 1800 }, { "epoch": 2.9240710823909533, "grad_norm": 0.5362476706504822, "learning_rate": 8.480331270943111e-05, "loss": 0.0626, "step": 1810 }, { "epoch": 2.9402261712439417, "grad_norm": 0.42626962065696716, "learning_rate": 8.461101474870641e-05, "loss": 0.0495, "step": 1820 }, { "epoch": 2.9563812600969306, "grad_norm": 0.5444236397743225, "learning_rate": 8.44177287846877e-05, "loss": 0.0558, "step": 1830 }, { "epoch": 2.9725363489499195, "grad_norm": 0.5531013607978821, "learning_rate": 8.422346033490082e-05, "loss": 0.0497, "step": 1840 }, { "epoch": 2.988691437802908, "grad_norm": 0.5683811902999878, "learning_rate": 8.402821494491762e-05, "loss": 0.0528, "step": 1850 }, { "epoch": 3.004846526655897, "grad_norm": 0.5049775838851929, "learning_rate": 8.383199818819758e-05, "loss": 0.0616, "step": 1860 }, { "epoch": 3.0210016155088852, "grad_norm": 0.38788193464279175, "learning_rate": 8.363481566592874e-05, "loss": 0.0549, "step": 1870 }, { "epoch": 3.037156704361874, "grad_norm": 0.543121337890625, "learning_rate": 8.34366730068678e-05, "loss": 0.0561, "step": 1880 }, { "epoch": 3.0533117932148626, "grad_norm": 0.48212480545043945, "learning_rate": 8.323757586717947e-05, "loss": 0.0473, "step": 1890 }, { "epoch": 3.0694668820678515, "grad_norm": 0.7454380393028259, "learning_rate": 8.303752993027498e-05, "loss": 0.0564, "step": 1900 }, { "epoch": 3.08562197092084, "grad_norm": 0.5166053175926208, "learning_rate": 8.283654090664985e-05, "loss": 0.0571, "step": 1910 }, { "epoch": 3.101777059773829, "grad_norm": 0.5176417231559753, "learning_rate": 8.263461453372086e-05, "loss": 0.0593, "step": 1920 }, { "epoch": 3.1179321486268172, "grad_norm": 0.6009415984153748, "learning_rate": 8.243175657566233e-05, "loss": 0.0518, "step": 1930 }, { "epoch": 3.134087237479806, "grad_norm": 0.4920412302017212, "learning_rate": 8.222797282324152e-05, "loss": 0.0517, "step": 1940 }, { "epoch": 3.150242326332795, "grad_norm": 0.5730708241462708, "learning_rate": 8.20232690936533e-05, "loss": 0.055, "step": 1950 }, { "epoch": 3.1663974151857834, "grad_norm": 0.5689309239387512, "learning_rate": 8.18176512303542e-05, "loss": 0.0462, "step": 1960 }, { "epoch": 3.1825525040387723, "grad_norm": 0.3386596143245697, "learning_rate": 8.161112510289549e-05, "loss": 0.0593, "step": 1970 }, { "epoch": 3.1987075928917608, "grad_norm": 0.5641984939575195, "learning_rate": 8.140369660675571e-05, "loss": 0.0561, "step": 1980 }, { "epoch": 3.2148626817447497, "grad_norm": 0.3364955186843872, "learning_rate": 8.119537166317232e-05, "loss": 0.0488, "step": 1990 }, { "epoch": 3.231017770597738, "grad_norm": 0.5797820687294006, "learning_rate": 8.098615621897272e-05, "loss": 0.0471, "step": 2000 }, { "epoch": 3.247172859450727, "grad_norm": 0.6893600225448608, "learning_rate": 8.077605624640448e-05, "loss": 0.0489, "step": 2010 }, { "epoch": 3.263327948303716, "grad_norm": 0.6242002844810486, "learning_rate": 8.056507774296477e-05, "loss": 0.0502, "step": 2020 }, { "epoch": 3.2794830371567043, "grad_norm": 0.29608842730522156, "learning_rate": 8.035322673122934e-05, "loss": 0.0574, "step": 2030 }, { "epoch": 3.295638126009693, "grad_norm": 0.39050793647766113, "learning_rate": 8.014050925868042e-05, "loss": 0.0553, "step": 2040 }, { "epoch": 3.3117932148626816, "grad_norm": 0.7243764400482178, "learning_rate": 7.99269313975342e-05, "loss": 0.0496, "step": 2050 }, { "epoch": 3.3279483037156705, "grad_norm": 0.6739727258682251, "learning_rate": 7.971249924456742e-05, "loss": 0.0486, "step": 2060 }, { "epoch": 3.344103392568659, "grad_norm": 0.4816618263721466, "learning_rate": 7.94972189209434e-05, "loss": 0.0455, "step": 2070 }, { "epoch": 3.360258481421648, "grad_norm": 0.5240322351455688, "learning_rate": 7.928109657203725e-05, "loss": 0.0573, "step": 2080 }, { "epoch": 3.3764135702746367, "grad_norm": 0.3253321051597595, "learning_rate": 7.906413836726048e-05, "loss": 0.0467, "step": 2090 }, { "epoch": 3.392568659127625, "grad_norm": 0.5213293433189392, "learning_rate": 7.884635049988488e-05, "loss": 0.0488, "step": 2100 }, { "epoch": 3.408723747980614, "grad_norm": 0.4129197895526886, "learning_rate": 7.86277391868657e-05, "loss": 0.0483, "step": 2110 }, { "epoch": 3.4248788368336025, "grad_norm": 0.5131278038024902, "learning_rate": 7.840831066866423e-05, "loss": 0.0429, "step": 2120 }, { "epoch": 3.4410339256865914, "grad_norm": 0.529063880443573, "learning_rate": 7.818807120906964e-05, "loss": 0.0536, "step": 2130 }, { "epoch": 3.45718901453958, "grad_norm": 0.6816816926002502, "learning_rate": 7.796702709502012e-05, "loss": 0.0514, "step": 2140 }, { "epoch": 3.4733441033925687, "grad_norm": 0.3989129066467285, "learning_rate": 7.774518463642351e-05, "loss": 0.0613, "step": 2150 }, { "epoch": 3.489499192245557, "grad_norm": 0.4334792494773865, "learning_rate": 7.75225501659771e-05, "loss": 0.0483, "step": 2160 }, { "epoch": 3.505654281098546, "grad_norm": 0.46373841166496277, "learning_rate": 7.729913003898694e-05, "loss": 0.0443, "step": 2170 }, { "epoch": 3.5218093699515345, "grad_norm": 0.3799467980861664, "learning_rate": 7.707493063318629e-05, "loss": 0.0511, "step": 2180 }, { "epoch": 3.5379644588045234, "grad_norm": 0.4075853228569031, "learning_rate": 7.684995834855372e-05, "loss": 0.0478, "step": 2190 }, { "epoch": 3.5541195476575123, "grad_norm": 0.39337170124053955, "learning_rate": 7.662421960713028e-05, "loss": 0.0484, "step": 2200 }, { "epoch": 3.5702746365105007, "grad_norm": 0.30496665835380554, "learning_rate": 7.639772085283628e-05, "loss": 0.0446, "step": 2210 }, { "epoch": 3.5864297253634896, "grad_norm": 0.36177757382392883, "learning_rate": 7.617046855128724e-05, "loss": 0.0469, "step": 2220 }, { "epoch": 3.602584814216478, "grad_norm": 0.39714500308036804, "learning_rate": 7.594246918960946e-05, "loss": 0.0433, "step": 2230 }, { "epoch": 3.618739903069467, "grad_norm": 0.40002134442329407, "learning_rate": 7.571372927625469e-05, "loss": 0.0518, "step": 2240 }, { "epoch": 3.6348949919224554, "grad_norm": 0.6046271324157715, "learning_rate": 7.548425534081442e-05, "loss": 0.052, "step": 2250 }, { "epoch": 3.6510500807754442, "grad_norm": 0.43297943472862244, "learning_rate": 7.525405393383351e-05, "loss": 0.0462, "step": 2260 }, { "epoch": 3.667205169628433, "grad_norm": 0.4702610671520233, "learning_rate": 7.502313162662315e-05, "loss": 0.0543, "step": 2270 }, { "epoch": 3.6833602584814216, "grad_norm": 0.3743409216403961, "learning_rate": 7.479149501107328e-05, "loss": 0.0472, "step": 2280 }, { "epoch": 3.6995153473344105, "grad_norm": 0.3397691249847412, "learning_rate": 7.455915069946444e-05, "loss": 0.045, "step": 2290 }, { "epoch": 3.715670436187399, "grad_norm": 0.39391201734542847, "learning_rate": 7.4326105324279e-05, "loss": 0.0407, "step": 2300 }, { "epoch": 3.731825525040388, "grad_norm": 0.5775906443595886, "learning_rate": 7.409236553801183e-05, "loss": 0.0511, "step": 2310 }, { "epoch": 3.7479806138933762, "grad_norm": 0.5497547388076782, "learning_rate": 7.385793801298042e-05, "loss": 0.0426, "step": 2320 }, { "epoch": 3.764135702746365, "grad_norm": 0.4124547243118286, "learning_rate": 7.36228294411344e-05, "loss": 0.05, "step": 2330 }, { "epoch": 3.780290791599354, "grad_norm": 0.4284408390522003, "learning_rate": 7.338704653386448e-05, "loss": 0.0498, "step": 2340 }, { "epoch": 3.7964458804523424, "grad_norm": 0.47924646735191345, "learning_rate": 7.315059602181092e-05, "loss": 0.0491, "step": 2350 }, { "epoch": 3.8126009693053313, "grad_norm": 0.34164971113204956, "learning_rate": 7.291348465467136e-05, "loss": 0.0503, "step": 2360 }, { "epoch": 3.8287560581583198, "grad_norm": 0.4297367334365845, "learning_rate": 7.267571920100816e-05, "loss": 0.0505, "step": 2370 }, { "epoch": 3.8449111470113086, "grad_norm": 0.45141902565956116, "learning_rate": 7.24373064480552e-05, "loss": 0.0442, "step": 2380 }, { "epoch": 3.861066235864297, "grad_norm": 0.4785975217819214, "learning_rate": 7.219825320152411e-05, "loss": 0.0538, "step": 2390 }, { "epoch": 3.877221324717286, "grad_norm": 0.3574664890766144, "learning_rate": 7.195856628540995e-05, "loss": 0.0499, "step": 2400 }, { "epoch": 3.893376413570275, "grad_norm": 0.40025898814201355, "learning_rate": 7.171825254179654e-05, "loss": 0.0429, "step": 2410 }, { "epoch": 3.9095315024232633, "grad_norm": 0.3183038830757141, "learning_rate": 7.1477318830661e-05, "loss": 0.0466, "step": 2420 }, { "epoch": 3.9256865912762517, "grad_norm": 0.4639292061328888, "learning_rate": 7.123577202967805e-05, "loss": 0.0446, "step": 2430 }, { "epoch": 3.9418416801292406, "grad_norm": 0.5322105884552002, "learning_rate": 7.099361903402359e-05, "loss": 0.0495, "step": 2440 }, { "epoch": 3.9579967689822295, "grad_norm": 0.3138383626937866, "learning_rate": 7.075086675617788e-05, "loss": 0.0444, "step": 2450 }, { "epoch": 3.974151857835218, "grad_norm": 0.544747531414032, "learning_rate": 7.050752212572831e-05, "loss": 0.0541, "step": 2460 }, { "epoch": 3.990306946688207, "grad_norm": 0.4654453694820404, "learning_rate": 7.026359208917148e-05, "loss": 0.0504, "step": 2470 }, { "epoch": 4.006462035541196, "grad_norm": 0.31848329305648804, "learning_rate": 7.001908360971494e-05, "loss": 0.0451, "step": 2480 }, { "epoch": 4.022617124394184, "grad_norm": 0.43173283338546753, "learning_rate": 6.977400366707847e-05, "loss": 0.0467, "step": 2490 }, { "epoch": 4.038772213247173, "grad_norm": 0.5474691390991211, "learning_rate": 6.952835925729472e-05, "loss": 0.0479, "step": 2500 }, { "epoch": 4.054927302100162, "grad_norm": 0.4897683560848236, "learning_rate": 6.928215739250963e-05, "loss": 0.0505, "step": 2510 }, { "epoch": 4.07108239095315, "grad_norm": 0.31264185905456543, "learning_rate": 6.903540510078219e-05, "loss": 0.0457, "step": 2520 }, { "epoch": 4.087237479806139, "grad_norm": 0.4703519642353058, "learning_rate": 6.878810942588383e-05, "loss": 0.0451, "step": 2530 }, { "epoch": 4.103392568659127, "grad_norm": 0.3018874228000641, "learning_rate": 6.85402774270974e-05, "loss": 0.0449, "step": 2540 }, { "epoch": 4.119547657512117, "grad_norm": 0.3613886535167694, "learning_rate": 6.829191617901551e-05, "loss": 0.0481, "step": 2550 }, { "epoch": 4.135702746365105, "grad_norm": 0.34348440170288086, "learning_rate": 6.804303277133877e-05, "loss": 0.0396, "step": 2560 }, { "epoch": 4.1518578352180935, "grad_norm": 0.44307631254196167, "learning_rate": 6.779363430867326e-05, "loss": 0.0459, "step": 2570 }, { "epoch": 4.168012924071083, "grad_norm": 0.5705850124359131, "learning_rate": 6.754372791032783e-05, "loss": 0.0468, "step": 2580 }, { "epoch": 4.184168012924071, "grad_norm": 0.3443628251552582, "learning_rate": 6.729332071011077e-05, "loss": 0.0452, "step": 2590 }, { "epoch": 4.20032310177706, "grad_norm": 0.4537239372730255, "learning_rate": 6.704241985612625e-05, "loss": 0.0446, "step": 2600 }, { "epoch": 4.216478190630048, "grad_norm": 0.3705506920814514, "learning_rate": 6.679103251057024e-05, "loss": 0.0384, "step": 2610 }, { "epoch": 4.2326332794830375, "grad_norm": 0.5850950479507446, "learning_rate": 6.653916584952607e-05, "loss": 0.0483, "step": 2620 }, { "epoch": 4.248788368336026, "grad_norm": 0.7132898569107056, "learning_rate": 6.628682706275953e-05, "loss": 0.0432, "step": 2630 }, { "epoch": 4.264943457189014, "grad_norm": 0.3713912069797516, "learning_rate": 6.603402335351371e-05, "loss": 0.0382, "step": 2640 }, { "epoch": 4.281098546042003, "grad_norm": 0.6300288438796997, "learning_rate": 6.578076193830335e-05, "loss": 0.0444, "step": 2650 }, { "epoch": 4.297253634894992, "grad_norm": 0.5276614427566528, "learning_rate": 6.55270500467088e-05, "loss": 0.0554, "step": 2660 }, { "epoch": 4.313408723747981, "grad_norm": 0.38638073205947876, "learning_rate": 6.527289492116968e-05, "loss": 0.054, "step": 2670 }, { "epoch": 4.329563812600969, "grad_norm": 0.7961811423301697, "learning_rate": 6.501830381677813e-05, "loss": 0.0529, "step": 2680 }, { "epoch": 4.345718901453958, "grad_norm": 0.3550907373428345, "learning_rate": 6.476328400107171e-05, "loss": 0.0488, "step": 2690 }, { "epoch": 4.361873990306947, "grad_norm": 0.5453242659568787, "learning_rate": 6.450784275382595e-05, "loss": 0.0503, "step": 2700 }, { "epoch": 4.378029079159935, "grad_norm": 0.4048435688018799, "learning_rate": 6.425198736684655e-05, "loss": 0.0474, "step": 2710 }, { "epoch": 4.394184168012924, "grad_norm": 0.47286099195480347, "learning_rate": 6.399572514376113e-05, "loss": 0.0406, "step": 2720 }, { "epoch": 4.410339256865913, "grad_norm": 0.28871116042137146, "learning_rate": 6.373906339981092e-05, "loss": 0.0465, "step": 2730 }, { "epoch": 4.426494345718901, "grad_norm": 0.686854362487793, "learning_rate": 6.348200946164178e-05, "loss": 0.0477, "step": 2740 }, { "epoch": 4.44264943457189, "grad_norm": 0.7823249101638794, "learning_rate": 6.322457066709511e-05, "loss": 0.0407, "step": 2750 }, { "epoch": 4.458804523424879, "grad_norm": 0.4921092987060547, "learning_rate": 6.296675436499844e-05, "loss": 0.0408, "step": 2760 }, { "epoch": 4.474959612277868, "grad_norm": 0.5457318425178528, "learning_rate": 6.270856791495556e-05, "loss": 0.0421, "step": 2770 }, { "epoch": 4.491114701130856, "grad_norm": 0.7720049619674683, "learning_rate": 6.245001868713649e-05, "loss": 0.0495, "step": 2780 }, { "epoch": 4.5072697899838445, "grad_norm": 0.4767976999282837, "learning_rate": 6.219111406206707e-05, "loss": 0.0446, "step": 2790 }, { "epoch": 4.523424878836834, "grad_norm": 0.46596401929855347, "learning_rate": 6.193186143041828e-05, "loss": 0.044, "step": 2800 }, { "epoch": 4.539579967689822, "grad_norm": 0.4272357225418091, "learning_rate": 6.167226819279528e-05, "loss": 0.043, "step": 2810 }, { "epoch": 4.555735056542811, "grad_norm": 0.39680230617523193, "learning_rate": 6.141234175952612e-05, "loss": 0.0376, "step": 2820 }, { "epoch": 4.5718901453958, "grad_norm": 0.42455387115478516, "learning_rate": 6.115208955045025e-05, "loss": 0.0415, "step": 2830 }, { "epoch": 4.5880452342487885, "grad_norm": 0.4186107814311981, "learning_rate": 6.089151899470668e-05, "loss": 0.0394, "step": 2840 }, { "epoch": 4.604200323101777, "grad_norm": 0.4375015199184418, "learning_rate": 6.0630637530521905e-05, "loss": 0.0392, "step": 2850 }, { "epoch": 4.620355411954765, "grad_norm": 0.4540638327598572, "learning_rate": 6.036945260499762e-05, "loss": 0.0498, "step": 2860 }, { "epoch": 4.636510500807755, "grad_norm": 0.33841922879219055, "learning_rate": 6.010797167389808e-05, "loss": 0.0403, "step": 2870 }, { "epoch": 4.652665589660743, "grad_norm": 0.4046776592731476, "learning_rate": 5.9846202201437285e-05, "loss": 0.0394, "step": 2880 }, { "epoch": 4.668820678513732, "grad_norm": 0.5421432852745056, "learning_rate": 5.9584151660065946e-05, "loss": 0.0433, "step": 2890 }, { "epoch": 4.684975767366721, "grad_norm": 0.38528770208358765, "learning_rate": 5.93218275302581e-05, "loss": 0.0421, "step": 2900 }, { "epoch": 4.701130856219709, "grad_norm": 0.4037356376647949, "learning_rate": 5.9059237300297656e-05, "loss": 0.0467, "step": 2910 }, { "epoch": 4.717285945072698, "grad_norm": 0.3471173644065857, "learning_rate": 5.879638846606459e-05, "loss": 0.0395, "step": 2920 }, { "epoch": 4.733441033925686, "grad_norm": 0.37581634521484375, "learning_rate": 5.853328853082097e-05, "loss": 0.0454, "step": 2930 }, { "epoch": 4.749596122778676, "grad_norm": 0.3175153136253357, "learning_rate": 5.826994500499675e-05, "loss": 0.0438, "step": 2940 }, { "epoch": 4.765751211631664, "grad_norm": 0.6848868131637573, "learning_rate": 5.8006365405975436e-05, "loss": 0.0408, "step": 2950 }, { "epoch": 4.7819063004846525, "grad_norm": 0.5808501839637756, "learning_rate": 5.774255725787946e-05, "loss": 0.0469, "step": 2960 }, { "epoch": 4.798061389337642, "grad_norm": 0.4114396870136261, "learning_rate": 5.747852809135539e-05, "loss": 0.0475, "step": 2970 }, { "epoch": 4.81421647819063, "grad_norm": 0.4883790910243988, "learning_rate": 5.721428544335893e-05, "loss": 0.0427, "step": 2980 }, { "epoch": 4.830371567043619, "grad_norm": 0.476870059967041, "learning_rate": 5.694983685693988e-05, "loss": 0.0375, "step": 2990 }, { "epoch": 4.846526655896607, "grad_norm": 0.4612770974636078, "learning_rate": 5.668518988102668e-05, "loss": 0.0416, "step": 3000 }, { "epoch": 4.8626817447495965, "grad_norm": 0.6491737961769104, "learning_rate": 5.6420352070211016e-05, "loss": 0.0372, "step": 3010 }, { "epoch": 4.878836833602585, "grad_norm": 0.433662474155426, "learning_rate": 5.615533098453215e-05, "loss": 0.0467, "step": 3020 }, { "epoch": 4.894991922455573, "grad_norm": 0.2737475037574768, "learning_rate": 5.589013418926104e-05, "loss": 0.0413, "step": 3030 }, { "epoch": 4.911147011308563, "grad_norm": 0.388280987739563, "learning_rate": 5.562476925468445e-05, "loss": 0.0338, "step": 3040 }, { "epoch": 4.927302100161551, "grad_norm": 0.4380597174167633, "learning_rate": 5.535924375588887e-05, "loss": 0.0415, "step": 3050 }, { "epoch": 4.94345718901454, "grad_norm": 0.5273949503898621, "learning_rate": 5.509356527254421e-05, "loss": 0.0393, "step": 3060 }, { "epoch": 4.959612277867528, "grad_norm": 0.9131794571876526, "learning_rate": 5.482774138868749e-05, "loss": 0.0459, "step": 3070 }, { "epoch": 4.975767366720517, "grad_norm": 0.4145738482475281, "learning_rate": 5.456177969250632e-05, "loss": 0.038, "step": 3080 }, { "epoch": 4.991922455573506, "grad_norm": 0.4649810791015625, "learning_rate": 5.4295687776122236e-05, "loss": 0.0451, "step": 3090 }, { "epoch": 5.008077544426494, "grad_norm": 0.4478986859321594, "learning_rate": 5.4029473235374106e-05, "loss": 0.0439, "step": 3100 }, { "epoch": 5.024232633279483, "grad_norm": 0.34594130516052246, "learning_rate": 5.376314366960118e-05, "loss": 0.0451, "step": 3110 }, { "epoch": 5.040387722132472, "grad_norm": 0.5222188830375671, "learning_rate": 5.3496706681426204e-05, "loss": 0.0413, "step": 3120 }, { "epoch": 5.05654281098546, "grad_norm": 0.5172345638275146, "learning_rate": 5.323016987653842e-05, "loss": 0.0452, "step": 3130 }, { "epoch": 5.072697899838449, "grad_norm": 0.3387891352176666, "learning_rate": 5.29635408634764e-05, "loss": 0.042, "step": 3140 }, { "epoch": 5.088852988691438, "grad_norm": 0.45280104875564575, "learning_rate": 5.26968272534109e-05, "loss": 0.039, "step": 3150 }, { "epoch": 5.105008077544427, "grad_norm": 0.4317404329776764, "learning_rate": 5.2430036659927573e-05, "loss": 0.0377, "step": 3160 }, { "epoch": 5.121163166397415, "grad_norm": 0.6537325978279114, "learning_rate": 5.2163176698809645e-05, "loss": 0.044, "step": 3170 }, { "epoch": 5.1373182552504035, "grad_norm": 0.32357853651046753, "learning_rate": 5.189625498782047e-05, "loss": 0.0418, "step": 3180 }, { "epoch": 5.153473344103393, "grad_norm": 0.5868157148361206, "learning_rate": 5.1629279146486155e-05, "loss": 0.0452, "step": 3190 }, { "epoch": 5.169628432956381, "grad_norm": 0.46574723720550537, "learning_rate": 5.136225679587797e-05, "loss": 0.0432, "step": 3200 }, { "epoch": 5.18578352180937, "grad_norm": 0.2721109390258789, "learning_rate": 5.109519555839486e-05, "loss": 0.0424, "step": 3210 }, { "epoch": 5.201938610662358, "grad_norm": 0.3568851053714752, "learning_rate": 5.082810305754583e-05, "loss": 0.0391, "step": 3220 }, { "epoch": 5.2180936995153475, "grad_norm": 0.484744131565094, "learning_rate": 5.05609869177323e-05, "loss": 0.0371, "step": 3230 }, { "epoch": 5.234248788368336, "grad_norm": 0.29547053575515747, "learning_rate": 5.029385476403051e-05, "loss": 0.0311, "step": 3240 }, { "epoch": 5.250403877221324, "grad_norm": 0.3213876783847809, "learning_rate": 5.002671422197384e-05, "loss": 0.0334, "step": 3250 }, { "epoch": 5.266558966074314, "grad_norm": 0.3671923577785492, "learning_rate": 4.9759572917335104e-05, "loss": 0.0451, "step": 3260 }, { "epoch": 5.282714054927302, "grad_norm": 0.34725895524024963, "learning_rate": 4.949243847590887e-05, "loss": 0.0375, "step": 3270 }, { "epoch": 5.298869143780291, "grad_norm": 0.4185596704483032, "learning_rate": 4.922531852329384e-05, "loss": 0.0379, "step": 3280 }, { "epoch": 5.315024232633279, "grad_norm": 0.5074782371520996, "learning_rate": 4.895822068467505e-05, "loss": 0.0402, "step": 3290 }, { "epoch": 5.331179321486268, "grad_norm": 1.0807178020477295, "learning_rate": 4.869115258460635e-05, "loss": 0.0332, "step": 3300 }, { "epoch": 5.347334410339257, "grad_norm": 0.40008237957954407, "learning_rate": 4.8424121846792614e-05, "loss": 0.0422, "step": 3310 }, { "epoch": 5.363489499192245, "grad_norm": 0.3162868618965149, "learning_rate": 4.8157136093872215e-05, "loss": 0.0332, "step": 3320 }, { "epoch": 5.379644588045235, "grad_norm": 0.5000666379928589, "learning_rate": 4.789020294719933e-05, "loss": 0.0359, "step": 3330 }, { "epoch": 5.395799676898223, "grad_norm": 0.4171485900878906, "learning_rate": 4.762333002662655e-05, "loss": 0.0383, "step": 3340 }, { "epoch": 5.4119547657512115, "grad_norm": 0.6137621402740479, "learning_rate": 4.735652495028714e-05, "loss": 0.0393, "step": 3350 }, { "epoch": 5.4281098546042, "grad_norm": 0.6010169386863708, "learning_rate": 4.708979533437778e-05, "loss": 0.0401, "step": 3360 }, { "epoch": 5.444264943457189, "grad_norm": 0.32663294672966003, "learning_rate": 4.6823148792941e-05, "loss": 0.0422, "step": 3370 }, { "epoch": 5.460420032310178, "grad_norm": 0.3632521629333496, "learning_rate": 4.655659293764793e-05, "loss": 0.0426, "step": 3380 }, { "epoch": 5.476575121163166, "grad_norm": 0.30977901816368103, "learning_rate": 4.629013537758093e-05, "loss": 0.0417, "step": 3390 }, { "epoch": 5.4927302100161555, "grad_norm": 0.42319706082344055, "learning_rate": 4.6023783719016526e-05, "loss": 0.0431, "step": 3400 }, { "epoch": 5.508885298869144, "grad_norm": 0.3542233109474182, "learning_rate": 4.57575455652081e-05, "loss": 0.0365, "step": 3410 }, { "epoch": 5.525040387722132, "grad_norm": 0.4000030755996704, "learning_rate": 4.5491428516168975e-05, "loss": 0.0467, "step": 3420 }, { "epoch": 5.541195476575121, "grad_norm": 0.3602658212184906, "learning_rate": 4.52254401684554e-05, "loss": 0.0339, "step": 3430 }, { "epoch": 5.55735056542811, "grad_norm": 0.41686055064201355, "learning_rate": 4.495958811494978e-05, "loss": 0.0324, "step": 3440 }, { "epoch": 5.573505654281099, "grad_norm": 0.3794184625148773, "learning_rate": 4.469387994464381e-05, "loss": 0.0393, "step": 3450 }, { "epoch": 5.589660743134087, "grad_norm": 0.3583800196647644, "learning_rate": 4.442832324242197e-05, "loss": 0.0427, "step": 3460 }, { "epoch": 5.605815831987076, "grad_norm": 0.27712520956993103, "learning_rate": 4.416292558884489e-05, "loss": 0.0355, "step": 3470 }, { "epoch": 5.621970920840065, "grad_norm": 0.41386884450912476, "learning_rate": 4.389769455993303e-05, "loss": 0.0387, "step": 3480 }, { "epoch": 5.638126009693053, "grad_norm": 0.38608935475349426, "learning_rate": 4.3632637726950415e-05, "loss": 0.0353, "step": 3490 }, { "epoch": 5.654281098546042, "grad_norm": 0.5377467274665833, "learning_rate": 4.336776265618844e-05, "loss": 0.0388, "step": 3500 }, { "epoch": 5.670436187399031, "grad_norm": 0.5470876693725586, "learning_rate": 4.3103076908749996e-05, "loss": 0.0407, "step": 3510 }, { "epoch": 5.686591276252019, "grad_norm": 0.3438394367694855, "learning_rate": 4.283858804033351e-05, "loss": 0.0348, "step": 3520 }, { "epoch": 5.702746365105008, "grad_norm": 0.38908761739730835, "learning_rate": 4.257430360101734e-05, "loss": 0.0342, "step": 3530 }, { "epoch": 5.718901453957997, "grad_norm": 0.4778120219707489, "learning_rate": 4.2310231135044196e-05, "loss": 0.0421, "step": 3540 }, { "epoch": 5.735056542810986, "grad_norm": 0.4164102375507355, "learning_rate": 4.2046378180605894e-05, "loss": 0.0395, "step": 3550 }, { "epoch": 5.751211631663974, "grad_norm": 0.31713828444480896, "learning_rate": 4.1782752269627986e-05, "loss": 0.0378, "step": 3560 }, { "epoch": 5.7673667205169625, "grad_norm": 0.36085453629493713, "learning_rate": 4.1519360927554953e-05, "loss": 0.0419, "step": 3570 }, { "epoch": 5.783521809369952, "grad_norm": 0.3456893861293793, "learning_rate": 4.125621167313519e-05, "loss": 0.0408, "step": 3580 }, { "epoch": 5.79967689822294, "grad_norm": 0.4086418151855469, "learning_rate": 4.09933120182066e-05, "loss": 0.0361, "step": 3590 }, { "epoch": 5.815831987075929, "grad_norm": 0.3052937984466553, "learning_rate": 4.073066946748192e-05, "loss": 0.0372, "step": 3600 }, { "epoch": 5.831987075928918, "grad_norm": 0.3931577801704407, "learning_rate": 4.046829151833469e-05, "loss": 0.0331, "step": 3610 }, { "epoch": 5.8481421647819065, "grad_norm": 0.46110355854034424, "learning_rate": 4.020618566058513e-05, "loss": 0.0354, "step": 3620 }, { "epoch": 5.864297253634895, "grad_norm": 0.39353641867637634, "learning_rate": 3.994435937628636e-05, "loss": 0.035, "step": 3630 }, { "epoch": 5.880452342487883, "grad_norm": 0.4410620927810669, "learning_rate": 3.968282013951079e-05, "loss": 0.0374, "step": 3640 }, { "epoch": 5.896607431340873, "grad_norm": 0.36808839440345764, "learning_rate": 3.9421575416136866e-05, "loss": 0.0381, "step": 3650 }, { "epoch": 5.912762520193861, "grad_norm": 0.38404178619384766, "learning_rate": 3.9160632663635786e-05, "loss": 0.0348, "step": 3660 }, { "epoch": 5.92891760904685, "grad_norm": 0.2804437577724457, "learning_rate": 3.88999993308588e-05, "loss": 0.0346, "step": 3670 }, { "epoch": 5.945072697899839, "grad_norm": 0.4300249218940735, "learning_rate": 3.86396828578244e-05, "loss": 0.0378, "step": 3680 }, { "epoch": 5.961227786752827, "grad_norm": 0.3507043421268463, "learning_rate": 3.837969067550611e-05, "loss": 0.0353, "step": 3690 }, { "epoch": 5.977382875605816, "grad_norm": 0.3638635277748108, "learning_rate": 3.812003020562022e-05, "loss": 0.0314, "step": 3700 }, { "epoch": 5.993537964458804, "grad_norm": 0.3249291181564331, "learning_rate": 3.7860708860414005e-05, "loss": 0.0421, "step": 3710 }, { "epoch": 6.009693053311794, "grad_norm": 0.4243714511394501, "learning_rate": 3.760173404245409e-05, "loss": 0.034, "step": 3720 }, { "epoch": 6.025848142164782, "grad_norm": 0.2857236862182617, "learning_rate": 3.734311314441521e-05, "loss": 0.0373, "step": 3730 }, { "epoch": 6.0420032310177705, "grad_norm": 0.3825433850288391, "learning_rate": 3.708485354886906e-05, "loss": 0.0298, "step": 3740 }, { "epoch": 6.058158319870759, "grad_norm": 0.347135990858078, "learning_rate": 3.6826962628073705e-05, "loss": 0.0348, "step": 3750 }, { "epoch": 6.074313408723748, "grad_norm": 0.4767064154148102, "learning_rate": 3.6569447743762986e-05, "loss": 0.0341, "step": 3760 }, { "epoch": 6.090468497576737, "grad_norm": 0.2834322154521942, "learning_rate": 3.631231624693645e-05, "loss": 0.0391, "step": 3770 }, { "epoch": 6.106623586429725, "grad_norm": 0.635104775428772, "learning_rate": 3.605557547764951e-05, "loss": 0.0355, "step": 3780 }, { "epoch": 6.1227786752827145, "grad_norm": 0.35917991399765015, "learning_rate": 3.579923276480387e-05, "loss": 0.0303, "step": 3790 }, { "epoch": 6.138933764135703, "grad_norm": 0.40180811285972595, "learning_rate": 3.5543295425938414e-05, "loss": 0.036, "step": 3800 }, { "epoch": 6.155088852988691, "grad_norm": 0.24985694885253906, "learning_rate": 3.5287770767020164e-05, "loss": 0.0291, "step": 3810 }, { "epoch": 6.17124394184168, "grad_norm": 0.36490491032600403, "learning_rate": 3.5032666082235896e-05, "loss": 0.0397, "step": 3820 }, { "epoch": 6.187399030694669, "grad_norm": 0.2522122263908386, "learning_rate": 3.477798865378375e-05, "loss": 0.0335, "step": 3830 }, { "epoch": 6.203554119547658, "grad_norm": 0.2659394145011902, "learning_rate": 3.4523745751665534e-05, "loss": 0.0303, "step": 3840 }, { "epoch": 6.219709208400646, "grad_norm": 0.2996593713760376, "learning_rate": 3.426994463347902e-05, "loss": 0.0327, "step": 3850 }, { "epoch": 6.2358642972536344, "grad_norm": 0.429979145526886, "learning_rate": 3.401659254421094e-05, "loss": 0.0367, "step": 3860 }, { "epoch": 6.252019386106624, "grad_norm": 0.3394151031970978, "learning_rate": 3.3763696716029957e-05, "loss": 0.0316, "step": 3870 }, { "epoch": 6.268174474959612, "grad_norm": 0.5161323547363281, "learning_rate": 3.351126436808048e-05, "loss": 0.0365, "step": 3880 }, { "epoch": 6.284329563812601, "grad_norm": 0.3514617681503296, "learning_rate": 3.325930270627632e-05, "loss": 0.0316, "step": 3890 }, { "epoch": 6.30048465266559, "grad_norm": 0.4464913606643677, "learning_rate": 3.300781892309523e-05, "loss": 0.0351, "step": 3900 }, { "epoch": 6.316639741518578, "grad_norm": 0.4298667311668396, "learning_rate": 3.2756820197373394e-05, "loss": 0.0347, "step": 3910 }, { "epoch": 6.332794830371567, "grad_norm": 0.44100216031074524, "learning_rate": 3.250631369410064e-05, "loss": 0.0328, "step": 3920 }, { "epoch": 6.348949919224555, "grad_norm": 0.3341505527496338, "learning_rate": 3.2256306564215796e-05, "loss": 0.0345, "step": 3930 }, { "epoch": 6.365105008077545, "grad_norm": 0.41437703371047974, "learning_rate": 3.20068059444027e-05, "loss": 0.0334, "step": 3940 }, { "epoch": 6.381260096930533, "grad_norm": 0.5106116533279419, "learning_rate": 3.1757818956886295e-05, "loss": 0.0357, "step": 3950 }, { "epoch": 6.3974151857835215, "grad_norm": 0.2565278708934784, "learning_rate": 3.150935270922951e-05, "loss": 0.0286, "step": 3960 }, { "epoch": 6.413570274636511, "grad_norm": 0.3734126389026642, "learning_rate": 3.126141429413019e-05, "loss": 0.0297, "step": 3970 }, { "epoch": 6.429725363489499, "grad_norm": 0.34675681591033936, "learning_rate": 3.101401078921878e-05, "loss": 0.028, "step": 3980 }, { "epoch": 6.445880452342488, "grad_norm": 0.42493683099746704, "learning_rate": 3.076714925685617e-05, "loss": 0.03, "step": 3990 }, { "epoch": 6.462035541195476, "grad_norm": 0.30656370520591736, "learning_rate": 3.052083674393221e-05, "loss": 0.0312, "step": 4000 }, { "epoch": 6.4781906300484655, "grad_norm": 0.36631324887275696, "learning_rate": 3.0275080281664414e-05, "loss": 0.0279, "step": 4010 }, { "epoch": 6.494345718901454, "grad_norm": 0.5831628441810608, "learning_rate": 3.0029886885397367e-05, "loss": 0.0354, "step": 4020 }, { "epoch": 6.510500807754442, "grad_norm": 0.3962215781211853, "learning_rate": 2.9785263554402366e-05, "loss": 0.0392, "step": 4030 }, { "epoch": 6.526655896607432, "grad_norm": 0.45189252495765686, "learning_rate": 2.9541217271677745e-05, "loss": 0.0356, "step": 4040 }, { "epoch": 6.54281098546042, "grad_norm": 0.4892602860927582, "learning_rate": 2.9297755003749394e-05, "loss": 0.0297, "step": 4050 }, { "epoch": 6.558966074313409, "grad_norm": 0.32902640104293823, "learning_rate": 2.9054883700471974e-05, "loss": 0.0315, "step": 4060 }, { "epoch": 6.575121163166397, "grad_norm": 0.3130761682987213, "learning_rate": 2.8812610294830566e-05, "loss": 0.0336, "step": 4070 }, { "epoch": 6.591276252019386, "grad_norm": 0.5444455146789551, "learning_rate": 2.8570941702742663e-05, "loss": 0.0293, "step": 4080 }, { "epoch": 6.607431340872375, "grad_norm": 0.5223131775856018, "learning_rate": 2.832988482286081e-05, "loss": 0.032, "step": 4090 }, { "epoch": 6.623586429725363, "grad_norm": 0.5296066403388977, "learning_rate": 2.808944653637564e-05, "loss": 0.0336, "step": 4100 }, { "epoch": 6.639741518578353, "grad_norm": 0.4030674397945404, "learning_rate": 2.7849633706819533e-05, "loss": 0.0355, "step": 4110 }, { "epoch": 6.655896607431341, "grad_norm": 0.42938342690467834, "learning_rate": 2.7610453179870554e-05, "loss": 0.0291, "step": 4120 }, { "epoch": 6.6720516962843295, "grad_norm": 0.4580219089984894, "learning_rate": 2.7371911783157178e-05, "loss": 0.0318, "step": 4130 }, { "epoch": 6.688206785137318, "grad_norm": 0.30596330761909485, "learning_rate": 2.7134016326063234e-05, "loss": 0.034, "step": 4140 }, { "epoch": 6.704361873990307, "grad_norm": 0.35359278321266174, "learning_rate": 2.6896773599533694e-05, "loss": 0.0299, "step": 4150 }, { "epoch": 6.720516962843296, "grad_norm": 0.29407617449760437, "learning_rate": 2.6660190375880657e-05, "loss": 0.0266, "step": 4160 }, { "epoch": 6.736672051696284, "grad_norm": 0.357388973236084, "learning_rate": 2.6424273408590188e-05, "loss": 0.0352, "step": 4170 }, { "epoch": 6.7528271405492735, "grad_norm": 0.8390901684761047, "learning_rate": 2.6189029432129385e-05, "loss": 0.0377, "step": 4180 }, { "epoch": 6.768982229402262, "grad_norm": 0.28982290625572205, "learning_rate": 2.5954465161754227e-05, "loss": 0.0315, "step": 4190 }, { "epoch": 6.78513731825525, "grad_norm": 0.5228689908981323, "learning_rate": 2.5720587293317826e-05, "loss": 0.0283, "step": 4200 }, { "epoch": 6.801292407108239, "grad_norm": 0.5332914590835571, "learning_rate": 2.5487402503079395e-05, "loss": 0.0314, "step": 4210 }, { "epoch": 6.817447495961228, "grad_norm": 0.5198635458946228, "learning_rate": 2.5254917447513504e-05, "loss": 0.0298, "step": 4220 }, { "epoch": 6.833602584814217, "grad_norm": 0.37016230821609497, "learning_rate": 2.5023138763120217e-05, "loss": 0.0281, "step": 4230 }, { "epoch": 6.849757673667205, "grad_norm": 0.32923170924186707, "learning_rate": 2.479207306623554e-05, "loss": 0.0308, "step": 4240 }, { "epoch": 6.865912762520194, "grad_norm": 0.2647690176963806, "learning_rate": 2.456172695284263e-05, "loss": 0.0336, "step": 4250 }, { "epoch": 6.882067851373183, "grad_norm": 0.39588427543640137, "learning_rate": 2.433210699838342e-05, "loss": 0.0328, "step": 4260 }, { "epoch": 6.898222940226171, "grad_norm": 0.28190135955810547, "learning_rate": 2.4103219757571033e-05, "loss": 0.0292, "step": 4270 }, { "epoch": 6.91437802907916, "grad_norm": 0.4510742723941803, "learning_rate": 2.3875071764202563e-05, "loss": 0.0293, "step": 4280 }, { "epoch": 6.930533117932149, "grad_norm": 0.35639435052871704, "learning_rate": 2.36476695309726e-05, "loss": 0.0274, "step": 4290 }, { "epoch": 6.946688206785137, "grad_norm": 0.38058537244796753, "learning_rate": 2.342101954928733e-05, "loss": 0.0332, "step": 4300 }, { "epoch": 6.962843295638126, "grad_norm": 0.5739650726318359, "learning_rate": 2.3195128289079264e-05, "loss": 0.0266, "step": 4310 }, { "epoch": 6.978998384491114, "grad_norm": 0.5040541887283325, "learning_rate": 2.2970002198622444e-05, "loss": 0.0386, "step": 4320 }, { "epoch": 6.995153473344104, "grad_norm": 0.2796167731285095, "learning_rate": 2.2745647704348506e-05, "loss": 0.0304, "step": 4330 }, { "epoch": 7.011308562197092, "grad_norm": 0.8160725235939026, "learning_rate": 2.2522071210663108e-05, "loss": 0.0257, "step": 4340 }, { "epoch": 7.0274636510500805, "grad_norm": 0.2881336510181427, "learning_rate": 2.2299279099763176e-05, "loss": 0.0291, "step": 4350 }, { "epoch": 7.04361873990307, "grad_norm": 0.43697014451026917, "learning_rate": 2.2077277731454743e-05, "loss": 0.0302, "step": 4360 }, { "epoch": 7.059773828756058, "grad_norm": 0.2801852822303772, "learning_rate": 2.185607344297132e-05, "loss": 0.0285, "step": 4370 }, { "epoch": 7.075928917609047, "grad_norm": 0.4039601683616638, "learning_rate": 2.1635672548793067e-05, "loss": 0.0249, "step": 4380 }, { "epoch": 7.092084006462035, "grad_norm": 0.312288761138916, "learning_rate": 2.1416081340466477e-05, "loss": 0.0289, "step": 4390 }, { "epoch": 7.1082390953150245, "grad_norm": 0.3759534955024719, "learning_rate": 2.119730608642489e-05, "loss": 0.0343, "step": 4400 }, { "epoch": 7.124394184168013, "grad_norm": 0.5132532119750977, "learning_rate": 2.0979353031809383e-05, "loss": 0.0346, "step": 4410 }, { "epoch": 7.140549273021001, "grad_norm": 0.9728456139564514, "learning_rate": 2.0762228398290697e-05, "loss": 0.0339, "step": 4420 }, { "epoch": 7.156704361873991, "grad_norm": 0.32944944500923157, "learning_rate": 2.054593838389143e-05, "loss": 0.0257, "step": 4430 }, { "epoch": 7.172859450726979, "grad_norm": 0.22434404492378235, "learning_rate": 2.033048916280928e-05, "loss": 0.0317, "step": 4440 }, { "epoch": 7.189014539579968, "grad_norm": 0.36417004466056824, "learning_rate": 2.0115886885240682e-05, "loss": 0.0264, "step": 4450 }, { "epoch": 7.205169628432956, "grad_norm": 0.29687365889549255, "learning_rate": 1.990213767720533e-05, "loss": 0.03, "step": 4460 }, { "epoch": 7.221324717285945, "grad_norm": 0.41539931297302246, "learning_rate": 1.9689247640371223e-05, "loss": 0.0294, "step": 4470 }, { "epoch": 7.237479806138934, "grad_norm": 0.44902583956718445, "learning_rate": 1.9477222851880545e-05, "loss": 0.0282, "step": 4480 }, { "epoch": 7.253634894991922, "grad_norm": 0.38103097677230835, "learning_rate": 1.926606936417614e-05, "loss": 0.0311, "step": 4490 }, { "epoch": 7.269789983844911, "grad_norm": 0.42052754759788513, "learning_rate": 1.9055793204828842e-05, "loss": 0.0298, "step": 4500 }, { "epoch": 7.2859450726979, "grad_norm": 0.645574688911438, "learning_rate": 1.8846400376365253e-05, "loss": 0.0291, "step": 4510 }, { "epoch": 7.3021001615508885, "grad_norm": 0.4374733865261078, "learning_rate": 1.8637896856096548e-05, "loss": 0.0301, "step": 4520 }, { "epoch": 7.318255250403877, "grad_norm": 0.46677830815315247, "learning_rate": 1.843028859594772e-05, "loss": 0.0283, "step": 4530 }, { "epoch": 7.334410339256866, "grad_norm": 0.4820699393749237, "learning_rate": 1.8223581522287807e-05, "loss": 0.0264, "step": 4540 }, { "epoch": 7.350565428109855, "grad_norm": 0.2922935485839844, "learning_rate": 1.801778153576058e-05, "loss": 0.0256, "step": 4550 }, { "epoch": 7.366720516962843, "grad_norm": 0.6086759567260742, "learning_rate": 1.7812894511116235e-05, "loss": 0.0247, "step": 4560 }, { "epoch": 7.382875605815832, "grad_norm": 0.2402912974357605, "learning_rate": 1.7608926297043583e-05, "loss": 0.0309, "step": 4570 }, { "epoch": 7.399030694668821, "grad_norm": 0.25733429193496704, "learning_rate": 1.7405882716003154e-05, "loss": 0.0219, "step": 4580 }, { "epoch": 7.415185783521809, "grad_norm": 0.4837753474712372, "learning_rate": 1.7203769564060962e-05, "loss": 0.0262, "step": 4590 }, { "epoch": 7.431340872374798, "grad_norm": 0.31810057163238525, "learning_rate": 1.700259261072312e-05, "loss": 0.0234, "step": 4600 }, { "epoch": 7.447495961227787, "grad_norm": 0.5520622134208679, "learning_rate": 1.6802357598771012e-05, "loss": 0.0274, "step": 4610 }, { "epoch": 7.463651050080776, "grad_norm": 0.3220314085483551, "learning_rate": 1.6603070244097523e-05, "loss": 0.0277, "step": 4620 }, { "epoch": 7.479806138933764, "grad_norm": 0.3330337703227997, "learning_rate": 1.6404736235543705e-05, "loss": 0.032, "step": 4630 }, { "epoch": 7.4959612277867524, "grad_norm": 0.8944841027259827, "learning_rate": 1.6207361234736533e-05, "loss": 0.0257, "step": 4640 }, { "epoch": 7.512116316639742, "grad_norm": 0.3682458698749542, "learning_rate": 1.6010950875927182e-05, "loss": 0.0268, "step": 4650 }, { "epoch": 7.52827140549273, "grad_norm": 0.4134623408317566, "learning_rate": 1.581551076583023e-05, "loss": 0.0353, "step": 4660 }, { "epoch": 7.544426494345719, "grad_norm": 0.2340182512998581, "learning_rate": 1.5621046483463663e-05, "loss": 0.0252, "step": 4670 }, { "epoch": 7.560581583198708, "grad_norm": 0.6091485619544983, "learning_rate": 1.5427563579989507e-05, "loss": 0.0214, "step": 4680 }, { "epoch": 7.576736672051696, "grad_norm": 0.7271833419799805, "learning_rate": 1.523506757855545e-05, "loss": 0.0305, "step": 4690 }, { "epoch": 7.592891760904685, "grad_norm": 0.3721354007720947, "learning_rate": 1.504356397413713e-05, "loss": 0.032, "step": 4700 }, { "epoch": 7.609046849757673, "grad_norm": 0.3686143755912781, "learning_rate": 1.485305823338135e-05, "loss": 0.0258, "step": 4710 }, { "epoch": 7.625201938610663, "grad_norm": 0.3255109488964081, "learning_rate": 1.4663555794449918e-05, "loss": 0.0248, "step": 4720 }, { "epoch": 7.641357027463651, "grad_norm": 0.35630714893341064, "learning_rate": 1.4475062066864514e-05, "loss": 0.031, "step": 4730 }, { "epoch": 7.6575121163166395, "grad_norm": 0.2801692485809326, "learning_rate": 1.4287582431352175e-05, "loss": 0.0246, "step": 4740 }, { "epoch": 7.673667205169629, "grad_norm": 0.3327733874320984, "learning_rate": 1.41011222396918e-05, "loss": 0.0251, "step": 4750 }, { "epoch": 7.689822294022617, "grad_norm": 0.6043513417243958, "learning_rate": 1.3915686814561285e-05, "loss": 0.0288, "step": 4760 }, { "epoch": 7.705977382875606, "grad_norm": 0.3464643657207489, "learning_rate": 1.373128144938563e-05, "loss": 0.0299, "step": 4770 }, { "epoch": 7.722132471728594, "grad_norm": 0.21582302451133728, "learning_rate": 1.354791140818582e-05, "loss": 0.0337, "step": 4780 }, { "epoch": 7.7382875605815835, "grad_norm": 0.4652714431285858, "learning_rate": 1.3365581925428594e-05, "loss": 0.0241, "step": 4790 }, { "epoch": 7.754442649434572, "grad_norm": 0.2494271844625473, "learning_rate": 1.3184298205876938e-05, "loss": 0.0271, "step": 4800 }, { "epoch": 7.77059773828756, "grad_norm": 0.44884902238845825, "learning_rate": 1.3004065424441636e-05, "loss": 0.0258, "step": 4810 }, { "epoch": 7.78675282714055, "grad_norm": 0.3176079988479614, "learning_rate": 1.282488872603339e-05, "loss": 0.0226, "step": 4820 }, { "epoch": 7.802907915993538, "grad_norm": 0.42613035440444946, "learning_rate": 1.2646773225416132e-05, "loss": 0.0283, "step": 4830 }, { "epoch": 7.819063004846527, "grad_norm": 0.7398589849472046, "learning_rate": 1.2469724007060835e-05, "loss": 0.0377, "step": 4840 }, { "epoch": 7.835218093699515, "grad_norm": 0.38897576928138733, "learning_rate": 1.2293746125000538e-05, "loss": 0.0257, "step": 4850 }, { "epoch": 7.851373182552504, "grad_norm": 0.50649094581604, "learning_rate": 1.2118844602685958e-05, "loss": 0.0253, "step": 4860 }, { "epoch": 7.867528271405493, "grad_norm": 0.28981852531433105, "learning_rate": 1.1945024432842134e-05, "loss": 0.0285, "step": 4870 }, { "epoch": 7.883683360258481, "grad_norm": 0.406024307012558, "learning_rate": 1.1772290577325895e-05, "loss": 0.0306, "step": 4880 }, { "epoch": 7.899838449111471, "grad_norm": 0.2725732922554016, "learning_rate": 1.1600647966984274e-05, "loss": 0.0246, "step": 4890 }, { "epoch": 7.915993537964459, "grad_norm": 0.4214000999927521, "learning_rate": 1.1430101501513634e-05, "loss": 0.0281, "step": 4900 }, { "epoch": 7.9321486268174475, "grad_norm": 0.2999376952648163, "learning_rate": 1.1260656049319957e-05, "loss": 0.024, "step": 4910 }, { "epoch": 7.948303715670436, "grad_norm": 0.31904590129852295, "learning_rate": 1.1092316447379692e-05, "loss": 0.0212, "step": 4920 }, { "epoch": 7.964458804523425, "grad_norm": 0.3466980755329132, "learning_rate": 1.0925087501101872e-05, "loss": 0.0293, "step": 4930 }, { "epoch": 7.980613893376414, "grad_norm": 0.3411683440208435, "learning_rate": 1.0758973984190762e-05, "loss": 0.0219, "step": 4940 }, { "epoch": 7.996768982229402, "grad_norm": 0.3246071934700012, "learning_rate": 1.0593980638509693e-05, "loss": 0.0295, "step": 4950 }, { "epoch": 8.012924071082391, "grad_norm": 0.29273203015327454, "learning_rate": 1.043011217394571e-05, "loss": 0.0264, "step": 4960 }, { "epoch": 8.02907915993538, "grad_norm": 0.36481159925460815, "learning_rate": 1.0267373268275049e-05, "loss": 0.0303, "step": 4970 }, { "epoch": 8.045234248788368, "grad_norm": 0.26860660314559937, "learning_rate": 1.0105768567029655e-05, "loss": 0.0314, "step": 4980 }, { "epoch": 8.061389337641357, "grad_norm": 0.3127424716949463, "learning_rate": 9.945302683364566e-06, "loss": 0.0224, "step": 4990 }, { "epoch": 8.077544426494345, "grad_norm": 0.3091331422328949, "learning_rate": 9.785980197926242e-06, "loss": 0.0267, "step": 5000 }, { "epoch": 8.093699515347334, "grad_norm": 0.3343771696090698, "learning_rate": 9.627805658721756e-06, "loss": 0.0311, "step": 5010 }, { "epoch": 8.109854604200324, "grad_norm": 0.37236693501472473, "learning_rate": 9.470783580989029e-06, "loss": 0.0261, "step": 5020 }, { "epoch": 8.126009693053312, "grad_norm": 0.28066885471343994, "learning_rate": 9.314918447067878e-06, "loss": 0.0256, "step": 5030 }, { "epoch": 8.1421647819063, "grad_norm": 0.3097597360610962, "learning_rate": 9.16021470627213e-06, "loss": 0.0246, "step": 5040 }, { "epoch": 8.15831987075929, "grad_norm": 0.2532176971435547, "learning_rate": 9.006676774762535e-06, "loss": 0.0238, "step": 5050 }, { "epoch": 8.174474959612278, "grad_norm": 0.6101159453392029, "learning_rate": 8.854309035420772e-06, "loss": 0.0248, "step": 5060 }, { "epoch": 8.190630048465266, "grad_norm": 0.36472347378730774, "learning_rate": 8.703115837724274e-06, "loss": 0.0215, "step": 5070 }, { "epoch": 8.206785137318255, "grad_norm": 0.2975756525993347, "learning_rate": 8.553101497622162e-06, "loss": 0.0258, "step": 5080 }, { "epoch": 8.222940226171245, "grad_norm": 0.25580790638923645, "learning_rate": 8.404270297411904e-06, "loss": 0.025, "step": 5090 }, { "epoch": 8.239095315024233, "grad_norm": 0.3066563606262207, "learning_rate": 8.256626485617219e-06, "loss": 0.0303, "step": 5100 }, { "epoch": 8.255250403877222, "grad_norm": 0.44430306553840637, "learning_rate": 8.110174276866683e-06, "loss": 0.0224, "step": 5110 }, { "epoch": 8.27140549273021, "grad_norm": 0.3054925799369812, "learning_rate": 7.964917851773496e-06, "loss": 0.0278, "step": 5120 }, { "epoch": 8.287560581583199, "grad_norm": 0.25573277473449707, "learning_rate": 7.820861356816078e-06, "loss": 0.0211, "step": 5130 }, { "epoch": 8.303715670436187, "grad_norm": 0.3430786728858948, "learning_rate": 7.678008904219786e-06, "loss": 0.0237, "step": 5140 }, { "epoch": 8.319870759289175, "grad_norm": 0.4758915603160858, "learning_rate": 7.536364571839438e-06, "loss": 0.0215, "step": 5150 }, { "epoch": 8.336025848142166, "grad_norm": 0.3592261075973511, "learning_rate": 7.3959324030429654e-06, "loss": 0.0266, "step": 5160 }, { "epoch": 8.352180936995154, "grad_norm": 0.33260300755500793, "learning_rate": 7.256716406595948e-06, "loss": 0.0189, "step": 5170 }, { "epoch": 8.368336025848143, "grad_norm": 0.5978755950927734, "learning_rate": 7.118720556547259e-06, "loss": 0.0236, "step": 5180 }, { "epoch": 8.384491114701131, "grad_norm": 0.18789972364902496, "learning_rate": 6.9819487921155116e-06, "loss": 0.0234, "step": 5190 }, { "epoch": 8.40064620355412, "grad_norm": 0.31928345561027527, "learning_rate": 6.846405017576718e-06, "loss": 0.0281, "step": 5200 }, { "epoch": 8.416801292407108, "grad_norm": 0.34838926792144775, "learning_rate": 6.712093102152739e-06, "loss": 0.0276, "step": 5210 }, { "epoch": 8.432956381260096, "grad_norm": 0.37636154890060425, "learning_rate": 6.579016879900924e-06, "loss": 0.0251, "step": 5220 }, { "epoch": 8.449111470113085, "grad_norm": 0.26992267370224, "learning_rate": 6.447180149604603e-06, "loss": 0.0298, "step": 5230 }, { "epoch": 8.465266558966075, "grad_norm": 0.31432321667671204, "learning_rate": 6.316586674664654e-06, "loss": 0.0225, "step": 5240 }, { "epoch": 8.481421647819063, "grad_norm": 0.4834333062171936, "learning_rate": 6.187240182992126e-06, "loss": 0.0211, "step": 5250 }, { "epoch": 8.497576736672052, "grad_norm": 0.3510620594024658, "learning_rate": 6.059144366901736e-06, "loss": 0.0267, "step": 5260 }, { "epoch": 8.51373182552504, "grad_norm": 0.24435961246490479, "learning_rate": 5.932302883006546e-06, "loss": 0.0264, "step": 5270 }, { "epoch": 8.529886914378029, "grad_norm": 0.3356267809867859, "learning_rate": 5.806719352113521e-06, "loss": 0.0284, "step": 5280 }, { "epoch": 8.546042003231017, "grad_norm": 0.46301284432411194, "learning_rate": 5.682397359120245e-06, "loss": 0.0232, "step": 5290 }, { "epoch": 8.562197092084006, "grad_norm": 0.37262049317359924, "learning_rate": 5.5593404529124875e-06, "loss": 0.0226, "step": 5300 }, { "epoch": 8.578352180936996, "grad_norm": 0.5886579155921936, "learning_rate": 5.437552146263003e-06, "loss": 0.0276, "step": 5310 }, { "epoch": 8.594507269789984, "grad_norm": 0.3321017324924469, "learning_rate": 5.3170359157311445e-06, "loss": 0.0234, "step": 5320 }, { "epoch": 8.610662358642973, "grad_norm": 0.5492444038391113, "learning_rate": 5.197795201563743e-06, "loss": 0.0242, "step": 5330 }, { "epoch": 8.626817447495961, "grad_norm": 0.3654116690158844, "learning_rate": 5.07983340759679e-06, "loss": 0.0251, "step": 5340 }, { "epoch": 8.64297253634895, "grad_norm": 0.3987561762332916, "learning_rate": 4.963153901158352e-06, "loss": 0.0219, "step": 5350 }, { "epoch": 8.659127625201938, "grad_norm": 0.2271428108215332, "learning_rate": 4.847760012972402e-06, "loss": 0.0255, "step": 5360 }, { "epoch": 8.675282714054926, "grad_norm": 0.3363126218318939, "learning_rate": 4.733655037063761e-06, "loss": 0.026, "step": 5370 }, { "epoch": 8.691437802907917, "grad_norm": 0.4031514823436737, "learning_rate": 4.620842230664052e-06, "loss": 0.0263, "step": 5380 }, { "epoch": 8.707592891760905, "grad_norm": 0.4956108033657074, "learning_rate": 4.509324814118754e-06, "loss": 0.0219, "step": 5390 }, { "epoch": 8.723747980613894, "grad_norm": 0.3220359981060028, "learning_rate": 4.39910597079522e-06, "loss": 0.0241, "step": 5400 }, { "epoch": 8.739903069466882, "grad_norm": 0.2299569994211197, "learning_rate": 4.290188846991866e-06, "loss": 0.0255, "step": 5410 }, { "epoch": 8.75605815831987, "grad_norm": 0.2428327053785324, "learning_rate": 4.182576551848283e-06, "loss": 0.0212, "step": 5420 }, { "epoch": 8.772213247172859, "grad_norm": 0.42381399869918823, "learning_rate": 4.076272157256577e-06, "loss": 0.0218, "step": 5430 }, { "epoch": 8.788368336025847, "grad_norm": 0.4133065342903137, "learning_rate": 3.971278697773584e-06, "loss": 0.024, "step": 5440 }, { "epoch": 8.804523424878838, "grad_norm": 0.5403354167938232, "learning_rate": 3.86759917053432e-06, "loss": 0.0275, "step": 5450 }, { "epoch": 8.820678513731826, "grad_norm": 0.4583636224269867, "learning_rate": 3.765236535166361e-06, "loss": 0.0239, "step": 5460 }, { "epoch": 8.836833602584814, "grad_norm": 0.28426027297973633, "learning_rate": 3.6641937137054382e-06, "loss": 0.0212, "step": 5470 }, { "epoch": 8.852988691437803, "grad_norm": 0.3920319676399231, "learning_rate": 3.564473590511941e-06, "loss": 0.0168, "step": 5480 }, { "epoch": 8.869143780290791, "grad_norm": 0.28189295530319214, "learning_rate": 3.4660790121886387e-06, "loss": 0.0246, "step": 5490 }, { "epoch": 8.88529886914378, "grad_norm": 0.24748258292675018, "learning_rate": 3.369012787499387e-06, "loss": 0.0185, "step": 5500 }, { "epoch": 8.901453957996768, "grad_norm": 0.6315116286277771, "learning_rate": 3.273277687288978e-06, "loss": 0.025, "step": 5510 }, { "epoch": 8.917609046849758, "grad_norm": 0.34694400429725647, "learning_rate": 3.178876444404022e-06, "loss": 0.0188, "step": 5520 }, { "epoch": 8.933764135702747, "grad_norm": 0.38088199496269226, "learning_rate": 3.0858117536149365e-06, "loss": 0.0278, "step": 5530 }, { "epoch": 8.949919224555735, "grad_norm": 0.2522503435611725, "learning_rate": 2.9940862715390485e-06, "loss": 0.021, "step": 5540 }, { "epoch": 8.966074313408724, "grad_norm": 0.20651240646839142, "learning_rate": 2.9037026165647186e-06, "loss": 0.02, "step": 5550 }, { "epoch": 8.982229402261712, "grad_norm": 0.24644720554351807, "learning_rate": 2.8146633687766267e-06, "loss": 0.0196, "step": 5560 }, { "epoch": 8.9983844911147, "grad_norm": 0.26605224609375, "learning_rate": 2.7269710698821004e-06, "loss": 0.0205, "step": 5570 }, { "epoch": 9.014539579967689, "grad_norm": 0.3996153473854065, "learning_rate": 2.640628223138597e-06, "loss": 0.0206, "step": 5580 }, { "epoch": 9.03069466882068, "grad_norm": 0.2248448133468628, "learning_rate": 2.555637293282187e-06, "loss": 0.0236, "step": 5590 }, { "epoch": 9.046849757673668, "grad_norm": 0.32477447390556335, "learning_rate": 2.4720007064572504e-06, "loss": 0.0195, "step": 5600 }, { "epoch": 9.063004846526656, "grad_norm": 0.395511656999588, "learning_rate": 2.389720850147181e-06, "loss": 0.0286, "step": 5610 }, { "epoch": 9.079159935379645, "grad_norm": 0.3446315824985504, "learning_rate": 2.308800073106282e-06, "loss": 0.0249, "step": 5620 }, { "epoch": 9.095315024232633, "grad_norm": 0.2906045615673065, "learning_rate": 2.2292406852926383e-06, "loss": 0.0199, "step": 5630 }, { "epoch": 9.111470113085621, "grad_norm": 0.250783771276474, "learning_rate": 2.1510449578022674e-06, "loss": 0.0241, "step": 5640 }, { "epoch": 9.12762520193861, "grad_norm": 0.2330540418624878, "learning_rate": 2.074215122804235e-06, "loss": 0.021, "step": 5650 }, { "epoch": 9.1437802907916, "grad_norm": 0.4786008894443512, "learning_rate": 1.998753373476936e-06, "loss": 0.0214, "step": 5660 }, { "epoch": 9.159935379644589, "grad_norm": 0.30990907549858093, "learning_rate": 1.924661863945498e-06, "loss": 0.0229, "step": 5670 }, { "epoch": 9.176090468497577, "grad_norm": 0.3807198107242584, "learning_rate": 1.851942709220328e-06, "loss": 0.024, "step": 5680 }, { "epoch": 9.192245557350565, "grad_norm": 0.2552647590637207, "learning_rate": 1.7805979851366505e-06, "loss": 0.0262, "step": 5690 }, { "epoch": 9.208400646203554, "grad_norm": 0.2620588541030884, "learning_rate": 1.7106297282953376e-06, "loss": 0.021, "step": 5700 }, { "epoch": 9.224555735056542, "grad_norm": 0.3594263792037964, "learning_rate": 1.642039936004719e-06, "loss": 0.0248, "step": 5710 }, { "epoch": 9.24071082390953, "grad_norm": 0.2621122896671295, "learning_rate": 1.5748305662236007e-06, "loss": 0.0262, "step": 5720 }, { "epoch": 9.256865912762521, "grad_norm": 0.2171621173620224, "learning_rate": 1.5090035375053268e-06, "loss": 0.0274, "step": 5730 }, { "epoch": 9.27302100161551, "grad_norm": 0.2977316975593567, "learning_rate": 1.4445607289430784e-06, "loss": 0.0235, "step": 5740 }, { "epoch": 9.289176090468498, "grad_norm": 0.32373809814453125, "learning_rate": 1.3815039801161721e-06, "loss": 0.0229, "step": 5750 }, { "epoch": 9.305331179321486, "grad_norm": 0.5977046489715576, "learning_rate": 1.31983509103758e-06, "loss": 0.0222, "step": 5760 }, { "epoch": 9.321486268174475, "grad_norm": 0.3709489405155182, "learning_rate": 1.2595558221025372e-06, "loss": 0.0268, "step": 5770 }, { "epoch": 9.337641357027463, "grad_norm": 0.5009976029396057, "learning_rate": 1.2006678940383098e-06, "loss": 0.0223, "step": 5780 }, { "epoch": 9.353796445880452, "grad_norm": 0.5957422256469727, "learning_rate": 1.1431729878550235e-06, "loss": 0.0245, "step": 5790 }, { "epoch": 9.369951534733442, "grad_norm": 0.30659547448158264, "learning_rate": 1.0870727447977402e-06, "loss": 0.0191, "step": 5800 }, { "epoch": 9.38610662358643, "grad_norm": 0.41769152879714966, "learning_rate": 1.0323687662995685e-06, "loss": 0.0237, "step": 5810 }, { "epoch": 9.402261712439419, "grad_norm": 0.2959621846675873, "learning_rate": 9.79062613935955e-07, "loss": 0.0205, "step": 5820 }, { "epoch": 9.418416801292407, "grad_norm": 0.2948278486728668, "learning_rate": 9.271558093801202e-07, "loss": 0.0217, "step": 5830 }, { "epoch": 9.434571890145396, "grad_norm": 0.30144789814949036, "learning_rate": 8.766498343596052e-07, "loss": 0.0211, "step": 5840 }, { "epoch": 9.450726978998384, "grad_norm": 0.23101864755153656, "learning_rate": 8.275461306139876e-07, "loss": 0.0265, "step": 5850 }, { "epoch": 9.466882067851373, "grad_norm": 0.20476695895195007, "learning_rate": 7.79846099853715e-07, "loss": 0.0254, "step": 5860 }, { "epoch": 9.483037156704363, "grad_norm": 0.16762420535087585, "learning_rate": 7.335511037200982e-07, "loss": 0.0198, "step": 5870 }, { "epoch": 9.499192245557351, "grad_norm": 0.2673914134502411, "learning_rate": 6.886624637464422e-07, "loss": 0.0255, "step": 5880 }, { "epoch": 9.51534733441034, "grad_norm": 0.20123161375522614, "learning_rate": 6.451814613203211e-07, "loss": 0.0227, "step": 5890 }, { "epoch": 9.531502423263328, "grad_norm": 1.0308411121368408, "learning_rate": 6.031093376469899e-07, "loss": 0.0249, "step": 5900 }, { "epoch": 9.547657512116317, "grad_norm": 0.38134875893592834, "learning_rate": 5.624472937139802e-07, "loss": 0.0192, "step": 5910 }, { "epoch": 9.563812600969305, "grad_norm": 0.685291588306427, "learning_rate": 5.231964902567721e-07, "loss": 0.0261, "step": 5920 }, { "epoch": 9.579967689822293, "grad_norm": 0.33756789565086365, "learning_rate": 4.853580477257203e-07, "loss": 0.0211, "step": 5930 }, { "epoch": 9.596122778675284, "grad_norm": 0.21888208389282227, "learning_rate": 4.489330462540076e-07, "loss": 0.0228, "step": 5940 }, { "epoch": 9.612277867528272, "grad_norm": 0.2490539699792862, "learning_rate": 4.139225256268475e-07, "loss": 0.0216, "step": 5950 }, { "epoch": 9.62843295638126, "grad_norm": 0.4339372515678406, "learning_rate": 3.8032748525179685e-07, "loss": 0.017, "step": 5960 }, { "epoch": 9.644588045234249, "grad_norm": 0.29505810141563416, "learning_rate": 3.481488841302283e-07, "loss": 0.0231, "step": 5970 }, { "epoch": 9.660743134087237, "grad_norm": 0.6434254050254822, "learning_rate": 3.17387640829947e-07, "loss": 0.0197, "step": 5980 }, { "epoch": 9.676898222940226, "grad_norm": 0.42414963245391846, "learning_rate": 2.880446334589837e-07, "loss": 0.023, "step": 5990 }, { "epoch": 9.693053311793214, "grad_norm": 0.35125264525413513, "learning_rate": 2.601206996404981e-07, "loss": 0.0241, "step": 6000 }, { "epoch": 9.709208400646204, "grad_norm": 0.2720375657081604, "learning_rate": 2.336166364889092e-07, "loss": 0.0194, "step": 6010 }, { "epoch": 9.725363489499193, "grad_norm": 0.36113637685775757, "learning_rate": 2.0853320058710214e-07, "loss": 0.0235, "step": 6020 }, { "epoch": 9.741518578352181, "grad_norm": 0.43229228258132935, "learning_rate": 1.848711079648624e-07, "loss": 0.0202, "step": 6030 }, { "epoch": 9.75767366720517, "grad_norm": 0.3741385042667389, "learning_rate": 1.626310340784143e-07, "loss": 0.0235, "step": 6040 }, { "epoch": 9.773828756058158, "grad_norm": 0.6551222801208496, "learning_rate": 1.4181361379115855e-07, "loss": 0.021, "step": 6050 }, { "epoch": 9.789983844911147, "grad_norm": 0.22424794733524323, "learning_rate": 1.2241944135552574e-07, "loss": 0.0203, "step": 6060 }, { "epoch": 9.806138933764135, "grad_norm": 0.40934470295906067, "learning_rate": 1.044490703960288e-07, "loss": 0.0198, "step": 6070 }, { "epoch": 9.822294022617124, "grad_norm": 0.4585173428058624, "learning_rate": 8.79030138934589e-08, "loss": 0.0232, "step": 6080 }, { "epoch": 9.838449111470114, "grad_norm": 0.2896837592124939, "learning_rate": 7.278174417024164e-08, "loss": 0.0207, "step": 6090 }, { "epoch": 9.854604200323102, "grad_norm": 0.2858301103115082, "learning_rate": 5.908569287694787e-08, "loss": 0.0168, "step": 6100 }, { "epoch": 9.87075928917609, "grad_norm": 0.37577348947525024, "learning_rate": 4.6815250979970195e-08, "loss": 0.022, "step": 6110 }, { "epoch": 9.88691437802908, "grad_norm": 0.18759319186210632, "learning_rate": 3.5970768750387405e-08, "loss": 0.0218, "step": 6120 }, { "epoch": 9.903069466882068, "grad_norm": 0.32479435205459595, "learning_rate": 2.6552555753917017e-08, "loss": 0.0237, "step": 6130 }, { "epoch": 9.919224555735056, "grad_norm": 0.24623480439186096, "learning_rate": 1.8560880842133366e-08, "loss": 0.0184, "step": 6140 }, { "epoch": 9.935379644588044, "grad_norm": 0.22339710593223572, "learning_rate": 1.1995972144757116e-08, "loss": 0.0186, "step": 6150 }, { "epoch": 9.951534733441035, "grad_norm": 0.3188174366950989, "learning_rate": 6.858017063149369e-09, "loss": 0.0235, "step": 6160 }, { "epoch": 9.967689822294023, "grad_norm": 0.31466570496559143, "learning_rate": 3.1471622649714703e-09, "loss": 0.0245, "step": 6170 }, { "epoch": 9.983844911147012, "grad_norm": 0.3918437659740448, "learning_rate": 8.635136799939325e-10, "loss": 0.0212, "step": 6180 }, { "epoch": 10.0, "grad_norm": 0.3111129403114319, "learning_rate": 7.136497065518555e-12, "loss": 0.0254, "step": 6190 }, { "epoch": 10.0, "step": 6190, "total_flos": 0.0, "train_loss": 0.05139205720341841, "train_runtime": 6075.3943, "train_samples_per_second": 32.592, "train_steps_per_second": 1.019 } ], "logging_steps": 10, "max_steps": 6190, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }