{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 397, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025188916876574307, "grad_norm": 1.420465350151062, "learning_rate": 1e-05, "loss": 2.5777, "step": 1 }, { "epoch": 0.005037783375314861, "grad_norm": 1.4124211072921753, "learning_rate": 9.974811083123427e-06, "loss": 2.5574, "step": 2 }, { "epoch": 0.007556675062972292, "grad_norm": 1.444077730178833, "learning_rate": 9.949622166246852e-06, "loss": 2.7149, "step": 3 }, { "epoch": 0.010075566750629723, "grad_norm": 1.2692691087722778, "learning_rate": 9.924433249370277e-06, "loss": 2.4942, "step": 4 }, { "epoch": 0.012594458438287154, "grad_norm": 1.2546937465667725, "learning_rate": 9.899244332493704e-06, "loss": 2.5284, "step": 5 }, { "epoch": 0.015113350125944584, "grad_norm": 1.2006076574325562, "learning_rate": 9.87405541561713e-06, "loss": 2.5203, "step": 6 }, { "epoch": 0.017632241813602016, "grad_norm": 1.1375973224639893, "learning_rate": 9.848866498740555e-06, "loss": 2.4494, "step": 7 }, { "epoch": 0.020151133501259445, "grad_norm": 1.0649913549423218, "learning_rate": 9.82367758186398e-06, "loss": 2.4138, "step": 8 }, { "epoch": 0.022670025188916875, "grad_norm": 1.0274866819381714, "learning_rate": 9.798488664987406e-06, "loss": 2.3557, "step": 9 }, { "epoch": 0.02518891687657431, "grad_norm": 1.0478529930114746, "learning_rate": 9.773299748110831e-06, "loss": 2.4614, "step": 10 }, { "epoch": 0.027707808564231738, "grad_norm": 0.9700673818588257, "learning_rate": 9.748110831234258e-06, "loss": 2.4212, "step": 11 }, { "epoch": 0.030226700251889168, "grad_norm": 0.8414812684059143, "learning_rate": 9.722921914357684e-06, "loss": 2.2299, "step": 12 }, { "epoch": 0.0327455919395466, "grad_norm": 0.8956544399261475, "learning_rate": 9.69773299748111e-06, "loss": 2.3443, "step": 13 }, { "epoch": 0.03526448362720403, "grad_norm": 0.9195625185966492, "learning_rate": 9.672544080604534e-06, "loss": 2.2813, "step": 14 }, { "epoch": 0.037783375314861464, "grad_norm": 0.8406645655632019, "learning_rate": 9.64735516372796e-06, "loss": 2.2909, "step": 15 }, { "epoch": 0.04030226700251889, "grad_norm": 0.8406001925468445, "learning_rate": 9.622166246851387e-06, "loss": 2.3022, "step": 16 }, { "epoch": 0.042821158690176324, "grad_norm": 0.8053434491157532, "learning_rate": 9.596977329974812e-06, "loss": 2.2592, "step": 17 }, { "epoch": 0.04534005037783375, "grad_norm": 0.8638896346092224, "learning_rate": 9.571788413098237e-06, "loss": 2.3171, "step": 18 }, { "epoch": 0.04785894206549118, "grad_norm": 0.8893205523490906, "learning_rate": 9.546599496221664e-06, "loss": 2.2565, "step": 19 }, { "epoch": 0.05037783375314862, "grad_norm": 0.7514384984970093, "learning_rate": 9.521410579345088e-06, "loss": 2.1959, "step": 20 }, { "epoch": 0.05289672544080604, "grad_norm": 0.7832961678504944, "learning_rate": 9.496221662468515e-06, "loss": 2.2186, "step": 21 }, { "epoch": 0.055415617128463476, "grad_norm": 0.7781046628952026, "learning_rate": 9.47103274559194e-06, "loss": 2.207, "step": 22 }, { "epoch": 0.05793450881612091, "grad_norm": 0.7359276413917542, "learning_rate": 9.445843828715366e-06, "loss": 2.1479, "step": 23 }, { "epoch": 0.060453400503778336, "grad_norm": 0.7263805866241455, "learning_rate": 9.420654911838791e-06, "loss": 2.1799, "step": 24 }, { "epoch": 0.06297229219143577, "grad_norm": 0.6834078431129456, "learning_rate": 9.395465994962218e-06, "loss": 2.1216, "step": 25 }, { "epoch": 0.0654911838790932, "grad_norm": 0.6694800853729248, "learning_rate": 9.370277078085643e-06, "loss": 2.0769, "step": 26 }, { "epoch": 0.06801007556675064, "grad_norm": 0.6812991499900818, "learning_rate": 9.345088161209067e-06, "loss": 2.146, "step": 27 }, { "epoch": 0.07052896725440806, "grad_norm": 0.6379550695419312, "learning_rate": 9.319899244332494e-06, "loss": 2.0901, "step": 28 }, { "epoch": 0.07304785894206549, "grad_norm": 0.6825947761535645, "learning_rate": 9.29471032745592e-06, "loss": 2.1533, "step": 29 }, { "epoch": 0.07556675062972293, "grad_norm": 0.7910833954811096, "learning_rate": 9.269521410579347e-06, "loss": 2.1828, "step": 30 }, { "epoch": 0.07808564231738035, "grad_norm": 0.6861229538917542, "learning_rate": 9.244332493702772e-06, "loss": 2.1502, "step": 31 }, { "epoch": 0.08060453400503778, "grad_norm": 0.6285768747329712, "learning_rate": 9.219143576826197e-06, "loss": 2.1031, "step": 32 }, { "epoch": 0.08312342569269521, "grad_norm": 0.6474770903587341, "learning_rate": 9.193954659949623e-06, "loss": 2.087, "step": 33 }, { "epoch": 0.08564231738035265, "grad_norm": 0.5884003043174744, "learning_rate": 9.168765743073048e-06, "loss": 2.0418, "step": 34 }, { "epoch": 0.08816120906801007, "grad_norm": 0.5800574421882629, "learning_rate": 9.143576826196475e-06, "loss": 2.0484, "step": 35 }, { "epoch": 0.0906801007556675, "grad_norm": 0.5606217980384827, "learning_rate": 9.1183879093199e-06, "loss": 2.0026, "step": 36 }, { "epoch": 0.09319899244332494, "grad_norm": 0.6527896523475647, "learning_rate": 9.093198992443326e-06, "loss": 1.9611, "step": 37 }, { "epoch": 0.09571788413098237, "grad_norm": 0.5732287764549255, "learning_rate": 9.068010075566751e-06, "loss": 2.0348, "step": 38 }, { "epoch": 0.0982367758186398, "grad_norm": 0.5753059387207031, "learning_rate": 9.042821158690178e-06, "loss": 2.0062, "step": 39 }, { "epoch": 0.10075566750629723, "grad_norm": 0.5425299406051636, "learning_rate": 9.017632241813602e-06, "loss": 1.9781, "step": 40 }, { "epoch": 0.10327455919395466, "grad_norm": 0.5520154237747192, "learning_rate": 8.992443324937027e-06, "loss": 1.9927, "step": 41 }, { "epoch": 0.10579345088161209, "grad_norm": 0.5321075320243835, "learning_rate": 8.967254408060454e-06, "loss": 1.9715, "step": 42 }, { "epoch": 0.10831234256926953, "grad_norm": 0.5192540287971497, "learning_rate": 8.94206549118388e-06, "loss": 1.9771, "step": 43 }, { "epoch": 0.11083123425692695, "grad_norm": 0.5216296315193176, "learning_rate": 8.916876574307305e-06, "loss": 1.9554, "step": 44 }, { "epoch": 0.11335012594458438, "grad_norm": 0.5138005614280701, "learning_rate": 8.89168765743073e-06, "loss": 1.9501, "step": 45 }, { "epoch": 0.11586901763224182, "grad_norm": 0.5473687052726746, "learning_rate": 8.866498740554157e-06, "loss": 1.9943, "step": 46 }, { "epoch": 0.11838790931989925, "grad_norm": 0.5291565656661987, "learning_rate": 8.841309823677583e-06, "loss": 1.9401, "step": 47 }, { "epoch": 0.12090680100755667, "grad_norm": 0.5129333734512329, "learning_rate": 8.816120906801008e-06, "loss": 1.9557, "step": 48 }, { "epoch": 0.12342569269521411, "grad_norm": 0.5359098315238953, "learning_rate": 8.790931989924435e-06, "loss": 1.9787, "step": 49 }, { "epoch": 0.12594458438287154, "grad_norm": 0.4913354814052582, "learning_rate": 8.76574307304786e-06, "loss": 1.9198, "step": 50 }, { "epoch": 0.12846347607052896, "grad_norm": 0.4875161647796631, "learning_rate": 8.740554156171286e-06, "loss": 1.9497, "step": 51 }, { "epoch": 0.1309823677581864, "grad_norm": 0.47248420119285583, "learning_rate": 8.715365239294711e-06, "loss": 1.8747, "step": 52 }, { "epoch": 0.13350125944584382, "grad_norm": 0.48350995779037476, "learning_rate": 8.690176322418138e-06, "loss": 1.8919, "step": 53 }, { "epoch": 0.13602015113350127, "grad_norm": 0.48570191860198975, "learning_rate": 8.664987405541562e-06, "loss": 1.8958, "step": 54 }, { "epoch": 0.1385390428211587, "grad_norm": 0.47888582944869995, "learning_rate": 8.639798488664987e-06, "loss": 1.8924, "step": 55 }, { "epoch": 0.14105793450881612, "grad_norm": 0.4759175479412079, "learning_rate": 8.614609571788414e-06, "loss": 1.8559, "step": 56 }, { "epoch": 0.14357682619647355, "grad_norm": 0.47866225242614746, "learning_rate": 8.58942065491184e-06, "loss": 1.8297, "step": 57 }, { "epoch": 0.14609571788413098, "grad_norm": 0.47261252999305725, "learning_rate": 8.564231738035265e-06, "loss": 1.9205, "step": 58 }, { "epoch": 0.1486146095717884, "grad_norm": 0.4570164978504181, "learning_rate": 8.53904282115869e-06, "loss": 1.8286, "step": 59 }, { "epoch": 0.15113350125944586, "grad_norm": 0.45629221200942993, "learning_rate": 8.513853904282117e-06, "loss": 1.8307, "step": 60 }, { "epoch": 0.15365239294710328, "grad_norm": 0.4506438374519348, "learning_rate": 8.488664987405543e-06, "loss": 1.8264, "step": 61 }, { "epoch": 0.1561712846347607, "grad_norm": 0.46889957785606384, "learning_rate": 8.463476070528968e-06, "loss": 1.8619, "step": 62 }, { "epoch": 0.15869017632241814, "grad_norm": 0.4415088891983032, "learning_rate": 8.438287153652393e-06, "loss": 1.8088, "step": 63 }, { "epoch": 0.16120906801007556, "grad_norm": 0.6827300786972046, "learning_rate": 8.41309823677582e-06, "loss": 1.8091, "step": 64 }, { "epoch": 0.163727959697733, "grad_norm": 0.4396965503692627, "learning_rate": 8.387909319899244e-06, "loss": 1.8235, "step": 65 }, { "epoch": 0.16624685138539042, "grad_norm": 0.4572596549987793, "learning_rate": 8.36272040302267e-06, "loss": 1.8473, "step": 66 }, { "epoch": 0.16876574307304787, "grad_norm": 0.47288888692855835, "learning_rate": 8.337531486146096e-06, "loss": 1.7847, "step": 67 }, { "epoch": 0.1712846347607053, "grad_norm": 0.42984092235565186, "learning_rate": 8.312342569269522e-06, "loss": 1.8235, "step": 68 }, { "epoch": 0.17380352644836272, "grad_norm": 0.4297022521495819, "learning_rate": 8.287153652392947e-06, "loss": 1.7944, "step": 69 }, { "epoch": 0.17632241813602015, "grad_norm": 0.44730344414711, "learning_rate": 8.261964735516374e-06, "loss": 1.8026, "step": 70 }, { "epoch": 0.17884130982367757, "grad_norm": 0.45562756061553955, "learning_rate": 8.2367758186398e-06, "loss": 1.8084, "step": 71 }, { "epoch": 0.181360201511335, "grad_norm": 0.43180692195892334, "learning_rate": 8.211586901763225e-06, "loss": 1.804, "step": 72 }, { "epoch": 0.18387909319899245, "grad_norm": 0.4151434302330017, "learning_rate": 8.18639798488665e-06, "loss": 1.745, "step": 73 }, { "epoch": 0.18639798488664988, "grad_norm": 0.42020657658576965, "learning_rate": 8.161209068010076e-06, "loss": 1.729, "step": 74 }, { "epoch": 0.1889168765743073, "grad_norm": 0.4290010631084442, "learning_rate": 8.136020151133503e-06, "loss": 1.7916, "step": 75 }, { "epoch": 0.19143576826196473, "grad_norm": 0.4147432744503021, "learning_rate": 8.110831234256928e-06, "loss": 1.7515, "step": 76 }, { "epoch": 0.19395465994962216, "grad_norm": 0.4140765964984894, "learning_rate": 8.085642317380353e-06, "loss": 1.7494, "step": 77 }, { "epoch": 0.1964735516372796, "grad_norm": 0.42620202898979187, "learning_rate": 8.06045340050378e-06, "loss": 1.7654, "step": 78 }, { "epoch": 0.19899244332493704, "grad_norm": 0.46347954869270325, "learning_rate": 8.035264483627204e-06, "loss": 1.7229, "step": 79 }, { "epoch": 0.20151133501259447, "grad_norm": 0.44873306155204773, "learning_rate": 8.01007556675063e-06, "loss": 1.7153, "step": 80 }, { "epoch": 0.2040302267002519, "grad_norm": 0.46032124757766724, "learning_rate": 7.984886649874056e-06, "loss": 1.7187, "step": 81 }, { "epoch": 0.20654911838790932, "grad_norm": 0.40681278705596924, "learning_rate": 7.959697732997482e-06, "loss": 1.7562, "step": 82 }, { "epoch": 0.20906801007556675, "grad_norm": 0.4236059784889221, "learning_rate": 7.934508816120907e-06, "loss": 1.7158, "step": 83 }, { "epoch": 0.21158690176322417, "grad_norm": 0.42215806245803833, "learning_rate": 7.909319899244334e-06, "loss": 1.7113, "step": 84 }, { "epoch": 0.2141057934508816, "grad_norm": 0.4768604040145874, "learning_rate": 7.884130982367758e-06, "loss": 1.7382, "step": 85 }, { "epoch": 0.21662468513853905, "grad_norm": 0.5776296854019165, "learning_rate": 7.858942065491185e-06, "loss": 1.7144, "step": 86 }, { "epoch": 0.21914357682619648, "grad_norm": 0.42450252175331116, "learning_rate": 7.83375314861461e-06, "loss": 1.7112, "step": 87 }, { "epoch": 0.2216624685138539, "grad_norm": 0.5352158546447754, "learning_rate": 7.808564231738036e-06, "loss": 1.7534, "step": 88 }, { "epoch": 0.22418136020151133, "grad_norm": 0.42181944847106934, "learning_rate": 7.783375314861463e-06, "loss": 1.6889, "step": 89 }, { "epoch": 0.22670025188916876, "grad_norm": 0.41291770339012146, "learning_rate": 7.758186397984888e-06, "loss": 1.7026, "step": 90 }, { "epoch": 0.22921914357682618, "grad_norm": 0.4190363883972168, "learning_rate": 7.732997481108313e-06, "loss": 1.736, "step": 91 }, { "epoch": 0.23173803526448364, "grad_norm": 0.48135876655578613, "learning_rate": 7.70780856423174e-06, "loss": 1.732, "step": 92 }, { "epoch": 0.23425692695214106, "grad_norm": 0.4111890494823456, "learning_rate": 7.682619647355164e-06, "loss": 1.7054, "step": 93 }, { "epoch": 0.2367758186397985, "grad_norm": 0.5191890597343445, "learning_rate": 7.65743073047859e-06, "loss": 1.7147, "step": 94 }, { "epoch": 0.23929471032745592, "grad_norm": 0.4190960228443146, "learning_rate": 7.632241813602015e-06, "loss": 1.7128, "step": 95 }, { "epoch": 0.24181360201511334, "grad_norm": 0.4258028566837311, "learning_rate": 7.607052896725441e-06, "loss": 1.7264, "step": 96 }, { "epoch": 0.24433249370277077, "grad_norm": 0.4177513122558594, "learning_rate": 7.581863979848867e-06, "loss": 1.6779, "step": 97 }, { "epoch": 0.24685138539042822, "grad_norm": 0.4666061997413635, "learning_rate": 7.5566750629722926e-06, "loss": 1.6453, "step": 98 }, { "epoch": 0.24937027707808565, "grad_norm": 0.4100574553012848, "learning_rate": 7.531486146095719e-06, "loss": 1.6963, "step": 99 }, { "epoch": 0.2518891687657431, "grad_norm": 0.4570634067058563, "learning_rate": 7.506297229219144e-06, "loss": 1.7032, "step": 100 }, { "epoch": 0.25440806045340053, "grad_norm": 0.42653965950012207, "learning_rate": 7.48110831234257e-06, "loss": 1.6624, "step": 101 }, { "epoch": 0.25692695214105793, "grad_norm": 0.4480111300945282, "learning_rate": 7.455919395465996e-06, "loss": 1.7011, "step": 102 }, { "epoch": 0.2594458438287154, "grad_norm": 0.4271489977836609, "learning_rate": 7.430730478589421e-06, "loss": 1.6716, "step": 103 }, { "epoch": 0.2619647355163728, "grad_norm": 0.41798117756843567, "learning_rate": 7.405541561712847e-06, "loss": 1.6759, "step": 104 }, { "epoch": 0.26448362720403024, "grad_norm": 0.40775346755981445, "learning_rate": 7.3803526448362725e-06, "loss": 1.6778, "step": 105 }, { "epoch": 0.26700251889168763, "grad_norm": 0.41410258412361145, "learning_rate": 7.355163727959699e-06, "loss": 1.6924, "step": 106 }, { "epoch": 0.2695214105793451, "grad_norm": 0.42046648263931274, "learning_rate": 7.329974811083124e-06, "loss": 1.6468, "step": 107 }, { "epoch": 0.27204030226700254, "grad_norm": 0.4312984347343445, "learning_rate": 7.30478589420655e-06, "loss": 1.7088, "step": 108 }, { "epoch": 0.27455919395465994, "grad_norm": 0.4318784475326538, "learning_rate": 7.279596977329975e-06, "loss": 1.6773, "step": 109 }, { "epoch": 0.2770780856423174, "grad_norm": 0.4189915359020233, "learning_rate": 7.254408060453401e-06, "loss": 1.6194, "step": 110 }, { "epoch": 0.2795969773299748, "grad_norm": 0.5198895931243896, "learning_rate": 7.229219143576827e-06, "loss": 1.6848, "step": 111 }, { "epoch": 0.28211586901763225, "grad_norm": 0.5195222496986389, "learning_rate": 7.2040302267002524e-06, "loss": 1.6813, "step": 112 }, { "epoch": 0.28463476070528965, "grad_norm": 0.45624077320098877, "learning_rate": 7.178841309823679e-06, "loss": 1.6448, "step": 113 }, { "epoch": 0.2871536523929471, "grad_norm": 0.49435746669769287, "learning_rate": 7.153652392947104e-06, "loss": 1.6584, "step": 114 }, { "epoch": 0.28967254408060455, "grad_norm": 0.4301837086677551, "learning_rate": 7.1284634760705296e-06, "loss": 1.6652, "step": 115 }, { "epoch": 0.29219143576826195, "grad_norm": 0.4709468185901642, "learning_rate": 7.103274559193955e-06, "loss": 1.6371, "step": 116 }, { "epoch": 0.2947103274559194, "grad_norm": 0.45211878418922424, "learning_rate": 7.07808564231738e-06, "loss": 1.6672, "step": 117 }, { "epoch": 0.2972292191435768, "grad_norm": 0.4376428723335266, "learning_rate": 7.052896725440807e-06, "loss": 1.695, "step": 118 }, { "epoch": 0.29974811083123426, "grad_norm": 0.4894670844078064, "learning_rate": 7.027707808564232e-06, "loss": 1.6617, "step": 119 }, { "epoch": 0.3022670025188917, "grad_norm": 0.454942911863327, "learning_rate": 7.002518891687659e-06, "loss": 1.5997, "step": 120 }, { "epoch": 0.3047858942065491, "grad_norm": 0.547237753868103, "learning_rate": 6.977329974811084e-06, "loss": 1.6714, "step": 121 }, { "epoch": 0.30730478589420657, "grad_norm": 0.43378302454948425, "learning_rate": 6.9521410579345095e-06, "loss": 1.6393, "step": 122 }, { "epoch": 0.30982367758186397, "grad_norm": 0.43780213594436646, "learning_rate": 6.926952141057935e-06, "loss": 1.6717, "step": 123 }, { "epoch": 0.3123425692695214, "grad_norm": 0.4194709062576294, "learning_rate": 6.90176322418136e-06, "loss": 1.6464, "step": 124 }, { "epoch": 0.3148614609571788, "grad_norm": 0.42024093866348267, "learning_rate": 6.876574307304787e-06, "loss": 1.6275, "step": 125 }, { "epoch": 0.31738035264483627, "grad_norm": 0.4303475022315979, "learning_rate": 6.851385390428212e-06, "loss": 1.6375, "step": 126 }, { "epoch": 0.3198992443324937, "grad_norm": 0.42517420649528503, "learning_rate": 6.826196473551638e-06, "loss": 1.6363, "step": 127 }, { "epoch": 0.3224181360201511, "grad_norm": 0.42485958337783813, "learning_rate": 6.801007556675063e-06, "loss": 1.6302, "step": 128 }, { "epoch": 0.3249370277078086, "grad_norm": 0.4118500053882599, "learning_rate": 6.7758186397984894e-06, "loss": 1.6174, "step": 129 }, { "epoch": 0.327455919395466, "grad_norm": 0.4531554579734802, "learning_rate": 6.750629722921915e-06, "loss": 1.6283, "step": 130 }, { "epoch": 0.32997481108312343, "grad_norm": 0.41642168164253235, "learning_rate": 6.72544080604534e-06, "loss": 1.6328, "step": 131 }, { "epoch": 0.33249370277078083, "grad_norm": 0.4234478771686554, "learning_rate": 6.7002518891687666e-06, "loss": 1.6393, "step": 132 }, { "epoch": 0.3350125944584383, "grad_norm": 0.4314388930797577, "learning_rate": 6.675062972292192e-06, "loss": 1.6371, "step": 133 }, { "epoch": 0.33753148614609574, "grad_norm": 0.49057596921920776, "learning_rate": 6.649874055415617e-06, "loss": 1.6777, "step": 134 }, { "epoch": 0.34005037783375314, "grad_norm": 0.6537044644355774, "learning_rate": 6.624685138539043e-06, "loss": 1.6021, "step": 135 }, { "epoch": 0.3425692695214106, "grad_norm": 0.44913142919540405, "learning_rate": 6.599496221662469e-06, "loss": 1.6369, "step": 136 }, { "epoch": 0.345088161209068, "grad_norm": 0.41060981154441833, "learning_rate": 6.574307304785895e-06, "loss": 1.6001, "step": 137 }, { "epoch": 0.34760705289672544, "grad_norm": 0.44041162729263306, "learning_rate": 6.54911838790932e-06, "loss": 1.6352, "step": 138 }, { "epoch": 0.3501259445843829, "grad_norm": 0.42245715856552124, "learning_rate": 6.5239294710327465e-06, "loss": 1.6195, "step": 139 }, { "epoch": 0.3526448362720403, "grad_norm": 0.4138522446155548, "learning_rate": 6.498740554156172e-06, "loss": 1.6254, "step": 140 }, { "epoch": 0.35516372795969775, "grad_norm": 0.42503440380096436, "learning_rate": 6.473551637279597e-06, "loss": 1.6187, "step": 141 }, { "epoch": 0.35768261964735515, "grad_norm": 0.5783386826515198, "learning_rate": 6.448362720403023e-06, "loss": 1.6704, "step": 142 }, { "epoch": 0.3602015113350126, "grad_norm": 0.4822537302970886, "learning_rate": 6.423173803526449e-06, "loss": 1.5762, "step": 143 }, { "epoch": 0.36272040302267, "grad_norm": 0.43411409854888916, "learning_rate": 6.397984886649875e-06, "loss": 1.6067, "step": 144 }, { "epoch": 0.36523929471032746, "grad_norm": 0.43474212288856506, "learning_rate": 6.3727959697733e-06, "loss": 1.6198, "step": 145 }, { "epoch": 0.3677581863979849, "grad_norm": 0.4297161400318146, "learning_rate": 6.347607052896726e-06, "loss": 1.6004, "step": 146 }, { "epoch": 0.3702770780856423, "grad_norm": 0.7553440928459167, "learning_rate": 6.322418136020152e-06, "loss": 1.7129, "step": 147 }, { "epoch": 0.37279596977329976, "grad_norm": 0.4365249574184418, "learning_rate": 6.297229219143577e-06, "loss": 1.6186, "step": 148 }, { "epoch": 0.37531486146095716, "grad_norm": 0.4731481373310089, "learning_rate": 6.272040302267003e-06, "loss": 1.5799, "step": 149 }, { "epoch": 0.3778337531486146, "grad_norm": 0.44013121724128723, "learning_rate": 6.246851385390429e-06, "loss": 1.6117, "step": 150 }, { "epoch": 0.380352644836272, "grad_norm": 0.4383363425731659, "learning_rate": 6.221662468513855e-06, "loss": 1.5916, "step": 151 }, { "epoch": 0.38287153652392947, "grad_norm": 0.4586566686630249, "learning_rate": 6.19647355163728e-06, "loss": 1.5987, "step": 152 }, { "epoch": 0.3853904282115869, "grad_norm": 0.5225487351417542, "learning_rate": 6.1712846347607055e-06, "loss": 1.5497, "step": 153 }, { "epoch": 0.3879093198992443, "grad_norm": 0.4564357399940491, "learning_rate": 6.146095717884132e-06, "loss": 1.6016, "step": 154 }, { "epoch": 0.3904282115869018, "grad_norm": 0.49243706464767456, "learning_rate": 6.120906801007557e-06, "loss": 1.6004, "step": 155 }, { "epoch": 0.3929471032745592, "grad_norm": 0.6145833134651184, "learning_rate": 6.095717884130983e-06, "loss": 1.5891, "step": 156 }, { "epoch": 0.3954659949622166, "grad_norm": 0.4326134920120239, "learning_rate": 6.070528967254408e-06, "loss": 1.6108, "step": 157 }, { "epoch": 0.3979848866498741, "grad_norm": 0.45841845870018005, "learning_rate": 6.045340050377835e-06, "loss": 1.6044, "step": 158 }, { "epoch": 0.4005037783375315, "grad_norm": 0.5934171676635742, "learning_rate": 6.02015113350126e-06, "loss": 1.6273, "step": 159 }, { "epoch": 0.40302267002518893, "grad_norm": 0.5909122824668884, "learning_rate": 5.9949622166246855e-06, "loss": 1.5819, "step": 160 }, { "epoch": 0.40554156171284633, "grad_norm": 0.47986772656440735, "learning_rate": 5.969773299748112e-06, "loss": 1.6292, "step": 161 }, { "epoch": 0.4080604534005038, "grad_norm": 0.43019899725914, "learning_rate": 5.944584382871537e-06, "loss": 1.6219, "step": 162 }, { "epoch": 0.4105793450881612, "grad_norm": 0.44603484869003296, "learning_rate": 5.919395465994963e-06, "loss": 1.6177, "step": 163 }, { "epoch": 0.41309823677581864, "grad_norm": 0.6486812233924866, "learning_rate": 5.894206549118388e-06, "loss": 1.6169, "step": 164 }, { "epoch": 0.4156171284634761, "grad_norm": 0.4344078600406647, "learning_rate": 5.869017632241813e-06, "loss": 1.6131, "step": 165 }, { "epoch": 0.4181360201511335, "grad_norm": 0.4963393211364746, "learning_rate": 5.84382871536524e-06, "loss": 1.5649, "step": 166 }, { "epoch": 0.42065491183879095, "grad_norm": 0.4491269588470459, "learning_rate": 5.818639798488665e-06, "loss": 1.5886, "step": 167 }, { "epoch": 0.42317380352644834, "grad_norm": 0.44954273104667664, "learning_rate": 5.793450881612092e-06, "loss": 1.5514, "step": 168 }, { "epoch": 0.4256926952141058, "grad_norm": 0.5957120060920715, "learning_rate": 5.768261964735517e-06, "loss": 1.5656, "step": 169 }, { "epoch": 0.4282115869017632, "grad_norm": 0.4787919223308563, "learning_rate": 5.7430730478589425e-06, "loss": 1.5906, "step": 170 }, { "epoch": 0.43073047858942065, "grad_norm": 0.4297046959400177, "learning_rate": 5.717884130982368e-06, "loss": 1.5676, "step": 171 }, { "epoch": 0.4332493702770781, "grad_norm": 0.4834885597229004, "learning_rate": 5.692695214105793e-06, "loss": 1.5672, "step": 172 }, { "epoch": 0.4357682619647355, "grad_norm": 0.5278275012969971, "learning_rate": 5.66750629722922e-06, "loss": 1.5994, "step": 173 }, { "epoch": 0.43828715365239296, "grad_norm": 0.4892403185367584, "learning_rate": 5.642317380352645e-06, "loss": 1.5845, "step": 174 }, { "epoch": 0.44080604534005036, "grad_norm": 0.5153166055679321, "learning_rate": 5.617128463476071e-06, "loss": 1.5573, "step": 175 }, { "epoch": 0.4433249370277078, "grad_norm": 0.5289381146430969, "learning_rate": 5.591939546599497e-06, "loss": 1.5658, "step": 176 }, { "epoch": 0.44584382871536526, "grad_norm": 0.45170825719833374, "learning_rate": 5.5667506297229225e-06, "loss": 1.5322, "step": 177 }, { "epoch": 0.44836272040302266, "grad_norm": 0.45414310693740845, "learning_rate": 5.541561712846348e-06, "loss": 1.5872, "step": 178 }, { "epoch": 0.4508816120906801, "grad_norm": 0.47673285007476807, "learning_rate": 5.516372795969773e-06, "loss": 1.603, "step": 179 }, { "epoch": 0.4534005037783375, "grad_norm": 0.4653848707675934, "learning_rate": 5.4911838790931996e-06, "loss": 1.5235, "step": 180 }, { "epoch": 0.45591939546599497, "grad_norm": 0.4475414752960205, "learning_rate": 5.465994962216625e-06, "loss": 1.5671, "step": 181 }, { "epoch": 0.45843828715365237, "grad_norm": 0.48499029874801636, "learning_rate": 5.440806045340051e-06, "loss": 1.5912, "step": 182 }, { "epoch": 0.4609571788413098, "grad_norm": 0.4531858563423157, "learning_rate": 5.415617128463476e-06, "loss": 1.541, "step": 183 }, { "epoch": 0.4634760705289673, "grad_norm": 0.44078829884529114, "learning_rate": 5.390428211586902e-06, "loss": 1.583, "step": 184 }, { "epoch": 0.4659949622166247, "grad_norm": 0.47280648350715637, "learning_rate": 5.365239294710328e-06, "loss": 1.6233, "step": 185 }, { "epoch": 0.46851385390428213, "grad_norm": 0.5612819194793701, "learning_rate": 5.340050377833753e-06, "loss": 1.6078, "step": 186 }, { "epoch": 0.47103274559193953, "grad_norm": 0.4777447283267975, "learning_rate": 5.3148614609571795e-06, "loss": 1.5722, "step": 187 }, { "epoch": 0.473551637279597, "grad_norm": 0.49805429577827454, "learning_rate": 5.289672544080605e-06, "loss": 1.6243, "step": 188 }, { "epoch": 0.4760705289672544, "grad_norm": 0.4395243525505066, "learning_rate": 5.264483627204031e-06, "loss": 1.5497, "step": 189 }, { "epoch": 0.47858942065491183, "grad_norm": 0.7493352890014648, "learning_rate": 5.239294710327456e-06, "loss": 1.6466, "step": 190 }, { "epoch": 0.4811083123425693, "grad_norm": 0.5018370747566223, "learning_rate": 5.214105793450882e-06, "loss": 1.5492, "step": 191 }, { "epoch": 0.4836272040302267, "grad_norm": 0.4791150391101837, "learning_rate": 5.188916876574308e-06, "loss": 1.5679, "step": 192 }, { "epoch": 0.48614609571788414, "grad_norm": 0.4814487099647522, "learning_rate": 5.163727959697733e-06, "loss": 1.5595, "step": 193 }, { "epoch": 0.48866498740554154, "grad_norm": 0.44743016362190247, "learning_rate": 5.138539042821159e-06, "loss": 1.5971, "step": 194 }, { "epoch": 0.491183879093199, "grad_norm": 0.47840508818626404, "learning_rate": 5.113350125944585e-06, "loss": 1.5414, "step": 195 }, { "epoch": 0.49370277078085645, "grad_norm": 0.4497021436691284, "learning_rate": 5.088161209068011e-06, "loss": 1.5595, "step": 196 }, { "epoch": 0.49622166246851385, "grad_norm": 0.49746012687683105, "learning_rate": 5.062972292191436e-06, "loss": 1.5403, "step": 197 }, { "epoch": 0.4987405541561713, "grad_norm": 0.4701424837112427, "learning_rate": 5.037783375314862e-06, "loss": 1.5597, "step": 198 }, { "epoch": 0.5012594458438288, "grad_norm": 0.4464475214481354, "learning_rate": 5.012594458438288e-06, "loss": 1.5436, "step": 199 }, { "epoch": 0.5037783375314862, "grad_norm": 0.5158559083938599, "learning_rate": 4.987405541561714e-06, "loss": 1.5638, "step": 200 }, { "epoch": 0.5062972292191436, "grad_norm": 0.5568498969078064, "learning_rate": 4.9622166246851385e-06, "loss": 1.5968, "step": 201 }, { "epoch": 0.5088161209068011, "grad_norm": 0.4441608488559723, "learning_rate": 4.937027707808565e-06, "loss": 1.54, "step": 202 }, { "epoch": 0.5113350125944585, "grad_norm": 0.4909915328025818, "learning_rate": 4.91183879093199e-06, "loss": 1.5439, "step": 203 }, { "epoch": 0.5138539042821159, "grad_norm": 0.4911031424999237, "learning_rate": 4.886649874055416e-06, "loss": 1.5438, "step": 204 }, { "epoch": 0.5163727959697733, "grad_norm": 0.7304896116256714, "learning_rate": 4.861460957178842e-06, "loss": 1.5061, "step": 205 }, { "epoch": 0.5188916876574308, "grad_norm": 0.4542643129825592, "learning_rate": 4.836272040302267e-06, "loss": 1.5738, "step": 206 }, { "epoch": 0.5214105793450882, "grad_norm": 0.8241648077964783, "learning_rate": 4.811083123425694e-06, "loss": 1.5982, "step": 207 }, { "epoch": 0.5239294710327456, "grad_norm": 0.45886871218681335, "learning_rate": 4.7858942065491185e-06, "loss": 1.5594, "step": 208 }, { "epoch": 0.5264483627204031, "grad_norm": 0.5265582799911499, "learning_rate": 4.760705289672544e-06, "loss": 1.57, "step": 209 }, { "epoch": 0.5289672544080605, "grad_norm": 0.46276602149009705, "learning_rate": 4.73551637279597e-06, "loss": 1.5475, "step": 210 }, { "epoch": 0.5314861460957179, "grad_norm": 0.5516127943992615, "learning_rate": 4.710327455919396e-06, "loss": 1.5497, "step": 211 }, { "epoch": 0.5340050377833753, "grad_norm": 0.485507071018219, "learning_rate": 4.685138539042821e-06, "loss": 1.5954, "step": 212 }, { "epoch": 0.5365239294710328, "grad_norm": 0.4667035937309265, "learning_rate": 4.659949622166247e-06, "loss": 1.5524, "step": 213 }, { "epoch": 0.5390428211586902, "grad_norm": 0.4725947082042694, "learning_rate": 4.6347607052896736e-06, "loss": 1.5701, "step": 214 }, { "epoch": 0.5415617128463476, "grad_norm": 0.48055243492126465, "learning_rate": 4.609571788413098e-06, "loss": 1.512, "step": 215 }, { "epoch": 0.5440806045340051, "grad_norm": 0.47020798921585083, "learning_rate": 4.584382871536524e-06, "loss": 1.517, "step": 216 }, { "epoch": 0.5465994962216625, "grad_norm": 0.458790123462677, "learning_rate": 4.55919395465995e-06, "loss": 1.5963, "step": 217 }, { "epoch": 0.5491183879093199, "grad_norm": 0.46757379174232483, "learning_rate": 4.5340050377833755e-06, "loss": 1.5307, "step": 218 }, { "epoch": 0.5516372795969773, "grad_norm": 0.48817694187164307, "learning_rate": 4.508816120906801e-06, "loss": 1.5096, "step": 219 }, { "epoch": 0.5541561712846348, "grad_norm": 0.46775302290916443, "learning_rate": 4.483627204030227e-06, "loss": 1.5081, "step": 220 }, { "epoch": 0.5566750629722922, "grad_norm": 0.4632299244403839, "learning_rate": 4.458438287153653e-06, "loss": 1.5274, "step": 221 }, { "epoch": 0.5591939546599496, "grad_norm": 0.6220762729644775, "learning_rate": 4.433249370277078e-06, "loss": 1.4909, "step": 222 }, { "epoch": 0.5617128463476071, "grad_norm": 0.4639570713043213, "learning_rate": 4.408060453400504e-06, "loss": 1.531, "step": 223 }, { "epoch": 0.5642317380352645, "grad_norm": 0.48596182465553284, "learning_rate": 4.38287153652393e-06, "loss": 1.522, "step": 224 }, { "epoch": 0.5667506297229219, "grad_norm": 0.4745020866394043, "learning_rate": 4.3576826196473555e-06, "loss": 1.5323, "step": 225 }, { "epoch": 0.5692695214105793, "grad_norm": 0.5056527853012085, "learning_rate": 4.332493702770781e-06, "loss": 1.5374, "step": 226 }, { "epoch": 0.5717884130982368, "grad_norm": 0.44245389103889465, "learning_rate": 4.307304785894207e-06, "loss": 1.5169, "step": 227 }, { "epoch": 0.5743073047858942, "grad_norm": 0.4938381016254425, "learning_rate": 4.282115869017633e-06, "loss": 1.5192, "step": 228 }, { "epoch": 0.5768261964735516, "grad_norm": 0.4689100384712219, "learning_rate": 4.256926952141058e-06, "loss": 1.5666, "step": 229 }, { "epoch": 0.5793450881612091, "grad_norm": 0.5333397388458252, "learning_rate": 4.231738035264484e-06, "loss": 1.5562, "step": 230 }, { "epoch": 0.5818639798488665, "grad_norm": 0.5024259090423584, "learning_rate": 4.20654911838791e-06, "loss": 1.5135, "step": 231 }, { "epoch": 0.5843828715365239, "grad_norm": 0.46757936477661133, "learning_rate": 4.181360201511335e-06, "loss": 1.522, "step": 232 }, { "epoch": 0.5869017632241813, "grad_norm": 0.5455654263496399, "learning_rate": 4.156171284634761e-06, "loss": 1.5281, "step": 233 }, { "epoch": 0.5894206549118388, "grad_norm": 0.48288044333457947, "learning_rate": 4.130982367758187e-06, "loss": 1.5252, "step": 234 }, { "epoch": 0.5919395465994962, "grad_norm": 0.44919902086257935, "learning_rate": 4.1057934508816125e-06, "loss": 1.5371, "step": 235 }, { "epoch": 0.5944584382871536, "grad_norm": 0.4358011782169342, "learning_rate": 4.080604534005038e-06, "loss": 1.5419, "step": 236 }, { "epoch": 0.5969773299748111, "grad_norm": 0.518595278263092, "learning_rate": 4.055415617128464e-06, "loss": 1.538, "step": 237 }, { "epoch": 0.5994962216624685, "grad_norm": 0.6567726135253906, "learning_rate": 4.03022670025189e-06, "loss": 1.4867, "step": 238 }, { "epoch": 0.6020151133501259, "grad_norm": 0.48650607466697693, "learning_rate": 4.005037783375315e-06, "loss": 1.494, "step": 239 }, { "epoch": 0.6045340050377834, "grad_norm": 0.6559653878211975, "learning_rate": 3.979848866498741e-06, "loss": 1.54, "step": 240 }, { "epoch": 0.6070528967254408, "grad_norm": 0.45548874139785767, "learning_rate": 3.954659949622167e-06, "loss": 1.5148, "step": 241 }, { "epoch": 0.6095717884130982, "grad_norm": 0.6561994552612305, "learning_rate": 3.9294710327455925e-06, "loss": 1.5244, "step": 242 }, { "epoch": 0.6120906801007556, "grad_norm": 0.46143561601638794, "learning_rate": 3.904282115869018e-06, "loss": 1.5315, "step": 243 }, { "epoch": 0.6146095717884131, "grad_norm": 0.537300705909729, "learning_rate": 3.879093198992444e-06, "loss": 1.5424, "step": 244 }, { "epoch": 0.6171284634760705, "grad_norm": 0.46460816264152527, "learning_rate": 3.85390428211587e-06, "loss": 1.4941, "step": 245 }, { "epoch": 0.6196473551637279, "grad_norm": 0.48894399404525757, "learning_rate": 3.828715365239295e-06, "loss": 1.5294, "step": 246 }, { "epoch": 0.6221662468513854, "grad_norm": 0.4623178541660309, "learning_rate": 3.8035264483627206e-06, "loss": 1.5068, "step": 247 }, { "epoch": 0.6246851385390428, "grad_norm": 0.49979573488235474, "learning_rate": 3.7783375314861463e-06, "loss": 1.4801, "step": 248 }, { "epoch": 0.6272040302267002, "grad_norm": 0.5378308296203613, "learning_rate": 3.753148614609572e-06, "loss": 1.5444, "step": 249 }, { "epoch": 0.6297229219143576, "grad_norm": 0.5385175347328186, "learning_rate": 3.727959697732998e-06, "loss": 1.5249, "step": 250 }, { "epoch": 0.6322418136020151, "grad_norm": 0.46512940526008606, "learning_rate": 3.7027707808564234e-06, "loss": 1.5082, "step": 251 }, { "epoch": 0.6347607052896725, "grad_norm": 0.6099820733070374, "learning_rate": 3.6775818639798495e-06, "loss": 1.5297, "step": 252 }, { "epoch": 0.6372795969773299, "grad_norm": 0.4563128650188446, "learning_rate": 3.652392947103275e-06, "loss": 1.5108, "step": 253 }, { "epoch": 0.6397984886649875, "grad_norm": 0.4638257324695587, "learning_rate": 3.6272040302267005e-06, "loss": 1.492, "step": 254 }, { "epoch": 0.6423173803526449, "grad_norm": 0.4734160602092743, "learning_rate": 3.6020151133501262e-06, "loss": 1.5113, "step": 255 }, { "epoch": 0.6448362720403022, "grad_norm": 0.4613577127456665, "learning_rate": 3.576826196473552e-06, "loss": 1.5352, "step": 256 }, { "epoch": 0.6473551637279596, "grad_norm": 0.6752243638038635, "learning_rate": 3.5516372795969776e-06, "loss": 1.492, "step": 257 }, { "epoch": 0.6498740554156172, "grad_norm": 0.4645501673221588, "learning_rate": 3.5264483627204033e-06, "loss": 1.4993, "step": 258 }, { "epoch": 0.6523929471032746, "grad_norm": 0.5898957252502441, "learning_rate": 3.5012594458438295e-06, "loss": 1.4917, "step": 259 }, { "epoch": 0.654911838790932, "grad_norm": 0.4554866552352905, "learning_rate": 3.4760705289672547e-06, "loss": 1.5192, "step": 260 }, { "epoch": 0.6574307304785895, "grad_norm": 0.4567941427230835, "learning_rate": 3.45088161209068e-06, "loss": 1.5442, "step": 261 }, { "epoch": 0.6599496221662469, "grad_norm": 0.4824671447277069, "learning_rate": 3.425692695214106e-06, "loss": 1.5348, "step": 262 }, { "epoch": 0.6624685138539043, "grad_norm": 0.4494476616382599, "learning_rate": 3.4005037783375314e-06, "loss": 1.5278, "step": 263 }, { "epoch": 0.6649874055415617, "grad_norm": 0.5391709208488464, "learning_rate": 3.3753148614609576e-06, "loss": 1.5277, "step": 264 }, { "epoch": 0.6675062972292192, "grad_norm": 0.4483042061328888, "learning_rate": 3.3501259445843833e-06, "loss": 1.4955, "step": 265 }, { "epoch": 0.6700251889168766, "grad_norm": 0.46210387349128723, "learning_rate": 3.3249370277078086e-06, "loss": 1.5077, "step": 266 }, { "epoch": 0.672544080604534, "grad_norm": 0.5058848261833191, "learning_rate": 3.2997481108312347e-06, "loss": 1.4645, "step": 267 }, { "epoch": 0.6750629722921915, "grad_norm": 0.4964057207107544, "learning_rate": 3.27455919395466e-06, "loss": 1.4897, "step": 268 }, { "epoch": 0.6775818639798489, "grad_norm": 0.46125808358192444, "learning_rate": 3.249370277078086e-06, "loss": 1.5414, "step": 269 }, { "epoch": 0.6801007556675063, "grad_norm": 0.488656222820282, "learning_rate": 3.2241813602015114e-06, "loss": 1.4999, "step": 270 }, { "epoch": 0.6826196473551638, "grad_norm": 0.4692099988460541, "learning_rate": 3.1989924433249375e-06, "loss": 1.5402, "step": 271 }, { "epoch": 0.6851385390428212, "grad_norm": 0.49234357476234436, "learning_rate": 3.173803526448363e-06, "loss": 1.5373, "step": 272 }, { "epoch": 0.6876574307304786, "grad_norm": 0.596118152141571, "learning_rate": 3.1486146095717885e-06, "loss": 1.5145, "step": 273 }, { "epoch": 0.690176322418136, "grad_norm": 0.4749690890312195, "learning_rate": 3.1234256926952146e-06, "loss": 1.4973, "step": 274 }, { "epoch": 0.6926952141057935, "grad_norm": 0.4940085709095001, "learning_rate": 3.09823677581864e-06, "loss": 1.464, "step": 275 }, { "epoch": 0.6952141057934509, "grad_norm": 0.47270411252975464, "learning_rate": 3.073047858942066e-06, "loss": 1.5094, "step": 276 }, { "epoch": 0.6977329974811083, "grad_norm": 0.4631718695163727, "learning_rate": 3.0478589420654913e-06, "loss": 1.4893, "step": 277 }, { "epoch": 0.7002518891687658, "grad_norm": 0.5515400171279907, "learning_rate": 3.0226700251889174e-06, "loss": 1.5342, "step": 278 }, { "epoch": 0.7027707808564232, "grad_norm": 0.5326355695724487, "learning_rate": 2.9974811083123427e-06, "loss": 1.5263, "step": 279 }, { "epoch": 0.7052896725440806, "grad_norm": 0.45032408833503723, "learning_rate": 2.9722921914357684e-06, "loss": 1.4977, "step": 280 }, { "epoch": 0.707808564231738, "grad_norm": 0.49274197220802307, "learning_rate": 2.947103274559194e-06, "loss": 1.4729, "step": 281 }, { "epoch": 0.7103274559193955, "grad_norm": 0.45705220103263855, "learning_rate": 2.92191435768262e-06, "loss": 1.4908, "step": 282 }, { "epoch": 0.7128463476070529, "grad_norm": 0.46655991673469543, "learning_rate": 2.896725440806046e-06, "loss": 1.503, "step": 283 }, { "epoch": 0.7153652392947103, "grad_norm": 0.5047741532325745, "learning_rate": 2.8715365239294713e-06, "loss": 1.4656, "step": 284 }, { "epoch": 0.7178841309823678, "grad_norm": 0.4772416949272156, "learning_rate": 2.8463476070528965e-06, "loss": 1.4664, "step": 285 }, { "epoch": 0.7204030226700252, "grad_norm": 0.4567766487598419, "learning_rate": 2.8211586901763227e-06, "loss": 1.5123, "step": 286 }, { "epoch": 0.7229219143576826, "grad_norm": 0.4822060763835907, "learning_rate": 2.7959697732997484e-06, "loss": 1.5079, "step": 287 }, { "epoch": 0.72544080604534, "grad_norm": 0.637371301651001, "learning_rate": 2.770780856423174e-06, "loss": 1.472, "step": 288 }, { "epoch": 0.7279596977329975, "grad_norm": 0.4881971478462219, "learning_rate": 2.7455919395465998e-06, "loss": 1.4737, "step": 289 }, { "epoch": 0.7304785894206549, "grad_norm": 0.4653415381908417, "learning_rate": 2.7204030226700255e-06, "loss": 1.5104, "step": 290 }, { "epoch": 0.7329974811083123, "grad_norm": 0.476697713136673, "learning_rate": 2.695214105793451e-06, "loss": 1.5072, "step": 291 }, { "epoch": 0.7355163727959698, "grad_norm": 0.6168654561042786, "learning_rate": 2.6700251889168765e-06, "loss": 1.5142, "step": 292 }, { "epoch": 0.7380352644836272, "grad_norm": 0.6653453707695007, "learning_rate": 2.6448362720403026e-06, "loss": 1.4897, "step": 293 }, { "epoch": 0.7405541561712846, "grad_norm": 0.4866642951965332, "learning_rate": 2.619647355163728e-06, "loss": 1.5409, "step": 294 }, { "epoch": 0.743073047858942, "grad_norm": 0.4763050377368927, "learning_rate": 2.594458438287154e-06, "loss": 1.5306, "step": 295 }, { "epoch": 0.7455919395465995, "grad_norm": 0.5434437990188599, "learning_rate": 2.5692695214105793e-06, "loss": 1.5334, "step": 296 }, { "epoch": 0.7481108312342569, "grad_norm": 0.5760312080383301, "learning_rate": 2.5440806045340054e-06, "loss": 1.5138, "step": 297 }, { "epoch": 0.7506297229219143, "grad_norm": 0.44751110672950745, "learning_rate": 2.518891687657431e-06, "loss": 1.4845, "step": 298 }, { "epoch": 0.7531486146095718, "grad_norm": 0.4421987235546112, "learning_rate": 2.493702770780857e-06, "loss": 1.4837, "step": 299 }, { "epoch": 0.7556675062972292, "grad_norm": 0.7657718658447266, "learning_rate": 2.4685138539042825e-06, "loss": 1.5151, "step": 300 }, { "epoch": 0.7581863979848866, "grad_norm": 0.5052861571311951, "learning_rate": 2.443324937027708e-06, "loss": 1.5404, "step": 301 }, { "epoch": 0.760705289672544, "grad_norm": 0.5251312851905823, "learning_rate": 2.4181360201511335e-06, "loss": 1.4329, "step": 302 }, { "epoch": 0.7632241813602015, "grad_norm": 0.46061962842941284, "learning_rate": 2.3929471032745592e-06, "loss": 1.4976, "step": 303 }, { "epoch": 0.7657430730478589, "grad_norm": 0.4743208587169647, "learning_rate": 2.367758186397985e-06, "loss": 1.4939, "step": 304 }, { "epoch": 0.7682619647355163, "grad_norm": 0.4864160418510437, "learning_rate": 2.3425692695214107e-06, "loss": 1.4997, "step": 305 }, { "epoch": 0.7707808564231738, "grad_norm": 0.47275349497795105, "learning_rate": 2.3173803526448368e-06, "loss": 1.4793, "step": 306 }, { "epoch": 0.7732997481108312, "grad_norm": 0.49562177062034607, "learning_rate": 2.292191435768262e-06, "loss": 1.4755, "step": 307 }, { "epoch": 0.7758186397984886, "grad_norm": 0.564599335193634, "learning_rate": 2.2670025188916878e-06, "loss": 1.4932, "step": 308 }, { "epoch": 0.7783375314861462, "grad_norm": 0.4657755494117737, "learning_rate": 2.2418136020151135e-06, "loss": 1.5076, "step": 309 }, { "epoch": 0.7808564231738035, "grad_norm": 0.486026793718338, "learning_rate": 2.216624685138539e-06, "loss": 1.5014, "step": 310 }, { "epoch": 0.783375314861461, "grad_norm": 0.4599766135215759, "learning_rate": 2.191435768261965e-06, "loss": 1.5274, "step": 311 }, { "epoch": 0.7858942065491183, "grad_norm": 0.47607848048210144, "learning_rate": 2.1662468513853906e-06, "loss": 1.4701, "step": 312 }, { "epoch": 0.7884130982367759, "grad_norm": 0.47365328669548035, "learning_rate": 2.1410579345088163e-06, "loss": 1.4932, "step": 313 }, { "epoch": 0.7909319899244333, "grad_norm": 0.45562124252319336, "learning_rate": 2.115869017632242e-06, "loss": 1.4912, "step": 314 }, { "epoch": 0.7934508816120907, "grad_norm": 0.5331164002418518, "learning_rate": 2.0906801007556677e-06, "loss": 1.5174, "step": 315 }, { "epoch": 0.7959697732997482, "grad_norm": 0.509325385093689, "learning_rate": 2.0654911838790934e-06, "loss": 1.4788, "step": 316 }, { "epoch": 0.7984886649874056, "grad_norm": 0.4969271123409271, "learning_rate": 2.040302267002519e-06, "loss": 1.5377, "step": 317 }, { "epoch": 0.801007556675063, "grad_norm": 0.44712427258491516, "learning_rate": 2.015113350125945e-06, "loss": 1.5279, "step": 318 }, { "epoch": 0.8035264483627204, "grad_norm": 0.47016969323158264, "learning_rate": 1.9899244332493705e-06, "loss": 1.5309, "step": 319 }, { "epoch": 0.8060453400503779, "grad_norm": 0.5187602043151855, "learning_rate": 1.9647355163727962e-06, "loss": 1.507, "step": 320 }, { "epoch": 0.8085642317380353, "grad_norm": 0.4568648636341095, "learning_rate": 1.939546599496222e-06, "loss": 1.4517, "step": 321 }, { "epoch": 0.8110831234256927, "grad_norm": 0.4813389480113983, "learning_rate": 1.9143576826196476e-06, "loss": 1.5215, "step": 322 }, { "epoch": 0.8136020151133502, "grad_norm": 0.5260921716690063, "learning_rate": 1.8891687657430731e-06, "loss": 1.5154, "step": 323 }, { "epoch": 0.8161209068010076, "grad_norm": 0.5113592743873596, "learning_rate": 1.863979848866499e-06, "loss": 1.4496, "step": 324 }, { "epoch": 0.818639798488665, "grad_norm": 0.48540815711021423, "learning_rate": 1.8387909319899248e-06, "loss": 1.4874, "step": 325 }, { "epoch": 0.8211586901763224, "grad_norm": 0.4522131681442261, "learning_rate": 1.8136020151133503e-06, "loss": 1.4781, "step": 326 }, { "epoch": 0.8236775818639799, "grad_norm": 0.45719313621520996, "learning_rate": 1.788413098236776e-06, "loss": 1.4859, "step": 327 }, { "epoch": 0.8261964735516373, "grad_norm": 0.43814224004745483, "learning_rate": 1.7632241813602017e-06, "loss": 1.4775, "step": 328 }, { "epoch": 0.8287153652392947, "grad_norm": 0.44290891289711, "learning_rate": 1.7380352644836274e-06, "loss": 1.5037, "step": 329 }, { "epoch": 0.8312342569269522, "grad_norm": 0.4844774603843689, "learning_rate": 1.712846347607053e-06, "loss": 1.5179, "step": 330 }, { "epoch": 0.8337531486146096, "grad_norm": 0.4434620440006256, "learning_rate": 1.6876574307304788e-06, "loss": 1.494, "step": 331 }, { "epoch": 0.836272040302267, "grad_norm": 0.46283698081970215, "learning_rate": 1.6624685138539043e-06, "loss": 1.4889, "step": 332 }, { "epoch": 0.8387909319899244, "grad_norm": 0.471802681684494, "learning_rate": 1.63727959697733e-06, "loss": 1.4558, "step": 333 }, { "epoch": 0.8413098236775819, "grad_norm": 0.4605620205402374, "learning_rate": 1.6120906801007557e-06, "loss": 1.5238, "step": 334 }, { "epoch": 0.8438287153652393, "grad_norm": 0.6928207874298096, "learning_rate": 1.5869017632241814e-06, "loss": 1.51, "step": 335 }, { "epoch": 0.8463476070528967, "grad_norm": 0.48179909586906433, "learning_rate": 1.5617128463476073e-06, "loss": 1.5368, "step": 336 }, { "epoch": 0.8488664987405542, "grad_norm": 0.5029130578041077, "learning_rate": 1.536523929471033e-06, "loss": 1.4563, "step": 337 }, { "epoch": 0.8513853904282116, "grad_norm": 0.4608486294746399, "learning_rate": 1.5113350125944587e-06, "loss": 1.463, "step": 338 }, { "epoch": 0.853904282115869, "grad_norm": 0.5182480216026306, "learning_rate": 1.4861460957178842e-06, "loss": 1.465, "step": 339 }, { "epoch": 0.8564231738035264, "grad_norm": 0.4644806385040283, "learning_rate": 1.46095717884131e-06, "loss": 1.4987, "step": 340 }, { "epoch": 0.8589420654911839, "grad_norm": 0.4732770323753357, "learning_rate": 1.4357682619647356e-06, "loss": 1.5133, "step": 341 }, { "epoch": 0.8614609571788413, "grad_norm": 0.5835548043251038, "learning_rate": 1.4105793450881613e-06, "loss": 1.5233, "step": 342 }, { "epoch": 0.8639798488664987, "grad_norm": 0.45620298385620117, "learning_rate": 1.385390428211587e-06, "loss": 1.4727, "step": 343 }, { "epoch": 0.8664987405541562, "grad_norm": 0.4693787395954132, "learning_rate": 1.3602015113350127e-06, "loss": 1.4706, "step": 344 }, { "epoch": 0.8690176322418136, "grad_norm": 0.6238934993743896, "learning_rate": 1.3350125944584382e-06, "loss": 1.5022, "step": 345 }, { "epoch": 0.871536523929471, "grad_norm": 0.5140495896339417, "learning_rate": 1.309823677581864e-06, "loss": 1.4581, "step": 346 }, { "epoch": 0.8740554156171285, "grad_norm": 0.6451770663261414, "learning_rate": 1.2846347607052897e-06, "loss": 1.523, "step": 347 }, { "epoch": 0.8765743073047859, "grad_norm": 0.5394758582115173, "learning_rate": 1.2594458438287156e-06, "loss": 1.4815, "step": 348 }, { "epoch": 0.8790931989924433, "grad_norm": 0.4751567840576172, "learning_rate": 1.2342569269521413e-06, "loss": 1.4666, "step": 349 }, { "epoch": 0.8816120906801007, "grad_norm": 0.5158999562263489, "learning_rate": 1.2090680100755668e-06, "loss": 1.477, "step": 350 }, { "epoch": 0.8841309823677582, "grad_norm": 0.47987380623817444, "learning_rate": 1.1838790931989925e-06, "loss": 1.4751, "step": 351 }, { "epoch": 0.8866498740554156, "grad_norm": 0.45010906457901, "learning_rate": 1.1586901763224184e-06, "loss": 1.4935, "step": 352 }, { "epoch": 0.889168765743073, "grad_norm": 0.4675264060497284, "learning_rate": 1.1335012594458439e-06, "loss": 1.4767, "step": 353 }, { "epoch": 0.8916876574307305, "grad_norm": 0.4817536175251007, "learning_rate": 1.1083123425692696e-06, "loss": 1.5079, "step": 354 }, { "epoch": 0.8942065491183879, "grad_norm": 0.5326683521270752, "learning_rate": 1.0831234256926953e-06, "loss": 1.4643, "step": 355 }, { "epoch": 0.8967254408060453, "grad_norm": 0.45862582325935364, "learning_rate": 1.057934508816121e-06, "loss": 1.4784, "step": 356 }, { "epoch": 0.8992443324937027, "grad_norm": 0.4639340937137604, "learning_rate": 1.0327455919395467e-06, "loss": 1.4669, "step": 357 }, { "epoch": 0.9017632241813602, "grad_norm": 0.5519356727600098, "learning_rate": 1.0075566750629724e-06, "loss": 1.4962, "step": 358 }, { "epoch": 0.9042821158690176, "grad_norm": 0.5423635244369507, "learning_rate": 9.823677581863981e-07, "loss": 1.5149, "step": 359 }, { "epoch": 0.906801007556675, "grad_norm": 0.4961482286453247, "learning_rate": 9.571788413098238e-07, "loss": 1.4841, "step": 360 }, { "epoch": 0.9093198992443325, "grad_norm": 0.5558215379714966, "learning_rate": 9.319899244332495e-07, "loss": 1.4672, "step": 361 }, { "epoch": 0.9118387909319899, "grad_norm": 0.47575876116752625, "learning_rate": 9.068010075566751e-07, "loss": 1.5035, "step": 362 }, { "epoch": 0.9143576826196473, "grad_norm": 0.44151756167411804, "learning_rate": 8.816120906801008e-07, "loss": 1.4923, "step": 363 }, { "epoch": 0.9168765743073047, "grad_norm": 0.49502983689308167, "learning_rate": 8.564231738035265e-07, "loss": 1.4872, "step": 364 }, { "epoch": 0.9193954659949622, "grad_norm": 0.4563881456851959, "learning_rate": 8.312342569269521e-07, "loss": 1.5022, "step": 365 }, { "epoch": 0.9219143576826196, "grad_norm": 0.4814889132976532, "learning_rate": 8.060453400503778e-07, "loss": 1.4922, "step": 366 }, { "epoch": 0.924433249370277, "grad_norm": 0.44825509190559387, "learning_rate": 7.808564231738037e-07, "loss": 1.4695, "step": 367 }, { "epoch": 0.9269521410579346, "grad_norm": 0.46482357382774353, "learning_rate": 7.556675062972294e-07, "loss": 1.4943, "step": 368 }, { "epoch": 0.929471032745592, "grad_norm": 0.5883563160896301, "learning_rate": 7.30478589420655e-07, "loss": 1.4658, "step": 369 }, { "epoch": 0.9319899244332494, "grad_norm": 0.6148042678833008, "learning_rate": 7.052896725440807e-07, "loss": 1.4528, "step": 370 }, { "epoch": 0.9345088161209067, "grad_norm": 0.4770396649837494, "learning_rate": 6.801007556675064e-07, "loss": 1.4914, "step": 371 }, { "epoch": 0.9370277078085643, "grad_norm": 0.46335241198539734, "learning_rate": 6.54911838790932e-07, "loss": 1.5172, "step": 372 }, { "epoch": 0.9395465994962217, "grad_norm": 0.46679455041885376, "learning_rate": 6.297229219143578e-07, "loss": 1.4426, "step": 373 }, { "epoch": 0.9420654911838791, "grad_norm": 0.5507463216781616, "learning_rate": 6.045340050377834e-07, "loss": 1.5067, "step": 374 }, { "epoch": 0.9445843828715366, "grad_norm": 0.468250572681427, "learning_rate": 5.793450881612092e-07, "loss": 1.5105, "step": 375 }, { "epoch": 0.947103274559194, "grad_norm": 0.6048943996429443, "learning_rate": 5.541561712846348e-07, "loss": 1.4814, "step": 376 }, { "epoch": 0.9496221662468514, "grad_norm": 0.4735409617424011, "learning_rate": 5.289672544080605e-07, "loss": 1.4739, "step": 377 }, { "epoch": 0.9521410579345088, "grad_norm": 0.5519718527793884, "learning_rate": 5.037783375314862e-07, "loss": 1.5022, "step": 378 }, { "epoch": 0.9546599496221663, "grad_norm": 0.4825071692466736, "learning_rate": 4.785894206549119e-07, "loss": 1.4977, "step": 379 }, { "epoch": 0.9571788413098237, "grad_norm": 0.44791093468666077, "learning_rate": 4.5340050377833756e-07, "loss": 1.4912, "step": 380 }, { "epoch": 0.9596977329974811, "grad_norm": 0.6440786719322205, "learning_rate": 4.2821158690176327e-07, "loss": 1.4602, "step": 381 }, { "epoch": 0.9622166246851386, "grad_norm": 0.4575777053833008, "learning_rate": 4.030226700251889e-07, "loss": 1.4833, "step": 382 }, { "epoch": 0.964735516372796, "grad_norm": 0.47071707248687744, "learning_rate": 3.778337531486147e-07, "loss": 1.4963, "step": 383 }, { "epoch": 0.9672544080604534, "grad_norm": 0.6902024745941162, "learning_rate": 3.5264483627204033e-07, "loss": 1.4699, "step": 384 }, { "epoch": 0.9697732997481109, "grad_norm": 0.48268118500709534, "learning_rate": 3.27455919395466e-07, "loss": 1.472, "step": 385 }, { "epoch": 0.9722921914357683, "grad_norm": 0.4497368335723877, "learning_rate": 3.022670025188917e-07, "loss": 1.4654, "step": 386 }, { "epoch": 0.9748110831234257, "grad_norm": 0.5587329864501953, "learning_rate": 2.770780856423174e-07, "loss": 1.5351, "step": 387 }, { "epoch": 0.9773299748110831, "grad_norm": 0.5236759185791016, "learning_rate": 2.518891687657431e-07, "loss": 1.4955, "step": 388 }, { "epoch": 0.9798488664987406, "grad_norm": 0.4622642397880554, "learning_rate": 2.2670025188916878e-07, "loss": 1.4956, "step": 389 }, { "epoch": 0.982367758186398, "grad_norm": 0.4652063548564911, "learning_rate": 2.0151133501259446e-07, "loss": 1.4875, "step": 390 }, { "epoch": 0.9848866498740554, "grad_norm": 0.44629859924316406, "learning_rate": 1.7632241813602017e-07, "loss": 1.4543, "step": 391 }, { "epoch": 0.9874055415617129, "grad_norm": 0.45472198724746704, "learning_rate": 1.5113350125944585e-07, "loss": 1.5048, "step": 392 }, { "epoch": 0.9899244332493703, "grad_norm": 0.4791916608810425, "learning_rate": 1.2594458438287155e-07, "loss": 1.4998, "step": 393 }, { "epoch": 0.9924433249370277, "grad_norm": 0.45487239956855774, "learning_rate": 1.0075566750629723e-07, "loss": 1.5058, "step": 394 }, { "epoch": 0.9949622166246851, "grad_norm": 0.5730354189872742, "learning_rate": 7.556675062972292e-08, "loss": 1.5314, "step": 395 }, { "epoch": 0.9974811083123426, "grad_norm": 0.47194746136665344, "learning_rate": 5.0377833753148615e-08, "loss": 1.5077, "step": 396 }, { "epoch": 1.0, "grad_norm": 0.5024914741516113, "learning_rate": 2.5188916876574308e-08, "loss": 1.4974, "step": 397 } ], "logging_steps": 1.0, "max_steps": 397, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.840368526032896e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }