|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 0, |
|
"global_step": 397, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0025188916876574307, |
|
"grad_norm": 1.420465350151062, |
|
"learning_rate": 1e-05, |
|
"loss": 2.5777, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005037783375314861, |
|
"grad_norm": 1.4124211072921753, |
|
"learning_rate": 9.974811083123427e-06, |
|
"loss": 2.5574, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007556675062972292, |
|
"grad_norm": 1.444077730178833, |
|
"learning_rate": 9.949622166246852e-06, |
|
"loss": 2.7149, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010075566750629723, |
|
"grad_norm": 1.2692691087722778, |
|
"learning_rate": 9.924433249370277e-06, |
|
"loss": 2.4942, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.012594458438287154, |
|
"grad_norm": 1.2546937465667725, |
|
"learning_rate": 9.899244332493704e-06, |
|
"loss": 2.5284, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015113350125944584, |
|
"grad_norm": 1.2006076574325562, |
|
"learning_rate": 9.87405541561713e-06, |
|
"loss": 2.5203, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.017632241813602016, |
|
"grad_norm": 1.1375973224639893, |
|
"learning_rate": 9.848866498740555e-06, |
|
"loss": 2.4494, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.020151133501259445, |
|
"grad_norm": 1.0649913549423218, |
|
"learning_rate": 9.82367758186398e-06, |
|
"loss": 2.4138, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.022670025188916875, |
|
"grad_norm": 1.0274866819381714, |
|
"learning_rate": 9.798488664987406e-06, |
|
"loss": 2.3557, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.02518891687657431, |
|
"grad_norm": 1.0478529930114746, |
|
"learning_rate": 9.773299748110831e-06, |
|
"loss": 2.4614, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.027707808564231738, |
|
"grad_norm": 0.9700673818588257, |
|
"learning_rate": 9.748110831234258e-06, |
|
"loss": 2.4212, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.030226700251889168, |
|
"grad_norm": 0.8414812684059143, |
|
"learning_rate": 9.722921914357684e-06, |
|
"loss": 2.2299, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0327455919395466, |
|
"grad_norm": 0.8956544399261475, |
|
"learning_rate": 9.69773299748111e-06, |
|
"loss": 2.3443, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03526448362720403, |
|
"grad_norm": 0.9195625185966492, |
|
"learning_rate": 9.672544080604534e-06, |
|
"loss": 2.2813, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.037783375314861464, |
|
"grad_norm": 0.8406645655632019, |
|
"learning_rate": 9.64735516372796e-06, |
|
"loss": 2.2909, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04030226700251889, |
|
"grad_norm": 0.8406001925468445, |
|
"learning_rate": 9.622166246851387e-06, |
|
"loss": 2.3022, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.042821158690176324, |
|
"grad_norm": 0.8053434491157532, |
|
"learning_rate": 9.596977329974812e-06, |
|
"loss": 2.2592, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04534005037783375, |
|
"grad_norm": 0.8638896346092224, |
|
"learning_rate": 9.571788413098237e-06, |
|
"loss": 2.3171, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.04785894206549118, |
|
"grad_norm": 0.8893205523490906, |
|
"learning_rate": 9.546599496221664e-06, |
|
"loss": 2.2565, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05037783375314862, |
|
"grad_norm": 0.7514384984970093, |
|
"learning_rate": 9.521410579345088e-06, |
|
"loss": 2.1959, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05289672544080604, |
|
"grad_norm": 0.7832961678504944, |
|
"learning_rate": 9.496221662468515e-06, |
|
"loss": 2.2186, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.055415617128463476, |
|
"grad_norm": 0.7781046628952026, |
|
"learning_rate": 9.47103274559194e-06, |
|
"loss": 2.207, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.05793450881612091, |
|
"grad_norm": 0.7359276413917542, |
|
"learning_rate": 9.445843828715366e-06, |
|
"loss": 2.1479, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.060453400503778336, |
|
"grad_norm": 0.7263805866241455, |
|
"learning_rate": 9.420654911838791e-06, |
|
"loss": 2.1799, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06297229219143577, |
|
"grad_norm": 0.6834078431129456, |
|
"learning_rate": 9.395465994962218e-06, |
|
"loss": 2.1216, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0654911838790932, |
|
"grad_norm": 0.6694800853729248, |
|
"learning_rate": 9.370277078085643e-06, |
|
"loss": 2.0769, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.06801007556675064, |
|
"grad_norm": 0.6812991499900818, |
|
"learning_rate": 9.345088161209067e-06, |
|
"loss": 2.146, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07052896725440806, |
|
"grad_norm": 0.6379550695419312, |
|
"learning_rate": 9.319899244332494e-06, |
|
"loss": 2.0901, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07304785894206549, |
|
"grad_norm": 0.6825947761535645, |
|
"learning_rate": 9.29471032745592e-06, |
|
"loss": 2.1533, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07556675062972293, |
|
"grad_norm": 0.7910833954811096, |
|
"learning_rate": 9.269521410579347e-06, |
|
"loss": 2.1828, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07808564231738035, |
|
"grad_norm": 0.6861229538917542, |
|
"learning_rate": 9.244332493702772e-06, |
|
"loss": 2.1502, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08060453400503778, |
|
"grad_norm": 0.6285768747329712, |
|
"learning_rate": 9.219143576826197e-06, |
|
"loss": 2.1031, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08312342569269521, |
|
"grad_norm": 0.6474770903587341, |
|
"learning_rate": 9.193954659949623e-06, |
|
"loss": 2.087, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08564231738035265, |
|
"grad_norm": 0.5884003043174744, |
|
"learning_rate": 9.168765743073048e-06, |
|
"loss": 2.0418, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.08816120906801007, |
|
"grad_norm": 0.5800574421882629, |
|
"learning_rate": 9.143576826196475e-06, |
|
"loss": 2.0484, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0906801007556675, |
|
"grad_norm": 0.5606217980384827, |
|
"learning_rate": 9.1183879093199e-06, |
|
"loss": 2.0026, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09319899244332494, |
|
"grad_norm": 0.6527896523475647, |
|
"learning_rate": 9.093198992443326e-06, |
|
"loss": 1.9611, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.09571788413098237, |
|
"grad_norm": 0.5732287764549255, |
|
"learning_rate": 9.068010075566751e-06, |
|
"loss": 2.0348, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0982367758186398, |
|
"grad_norm": 0.5753059387207031, |
|
"learning_rate": 9.042821158690178e-06, |
|
"loss": 2.0062, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10075566750629723, |
|
"grad_norm": 0.5425299406051636, |
|
"learning_rate": 9.017632241813602e-06, |
|
"loss": 1.9781, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10327455919395466, |
|
"grad_norm": 0.5520154237747192, |
|
"learning_rate": 8.992443324937027e-06, |
|
"loss": 1.9927, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.10579345088161209, |
|
"grad_norm": 0.5321075320243835, |
|
"learning_rate": 8.967254408060454e-06, |
|
"loss": 1.9715, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.10831234256926953, |
|
"grad_norm": 0.5192540287971497, |
|
"learning_rate": 8.94206549118388e-06, |
|
"loss": 1.9771, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11083123425692695, |
|
"grad_norm": 0.5216296315193176, |
|
"learning_rate": 8.916876574307305e-06, |
|
"loss": 1.9554, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11335012594458438, |
|
"grad_norm": 0.5138005614280701, |
|
"learning_rate": 8.89168765743073e-06, |
|
"loss": 1.9501, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.11586901763224182, |
|
"grad_norm": 0.5473687052726746, |
|
"learning_rate": 8.866498740554157e-06, |
|
"loss": 1.9943, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.11838790931989925, |
|
"grad_norm": 0.5291565656661987, |
|
"learning_rate": 8.841309823677583e-06, |
|
"loss": 1.9401, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.12090680100755667, |
|
"grad_norm": 0.5129333734512329, |
|
"learning_rate": 8.816120906801008e-06, |
|
"loss": 1.9557, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12342569269521411, |
|
"grad_norm": 0.5359098315238953, |
|
"learning_rate": 8.790931989924435e-06, |
|
"loss": 1.9787, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.12594458438287154, |
|
"grad_norm": 0.4913354814052582, |
|
"learning_rate": 8.76574307304786e-06, |
|
"loss": 1.9198, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12846347607052896, |
|
"grad_norm": 0.4875161647796631, |
|
"learning_rate": 8.740554156171286e-06, |
|
"loss": 1.9497, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.1309823677581864, |
|
"grad_norm": 0.47248420119285583, |
|
"learning_rate": 8.715365239294711e-06, |
|
"loss": 1.8747, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13350125944584382, |
|
"grad_norm": 0.48350995779037476, |
|
"learning_rate": 8.690176322418138e-06, |
|
"loss": 1.8919, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.13602015113350127, |
|
"grad_norm": 0.48570191860198975, |
|
"learning_rate": 8.664987405541562e-06, |
|
"loss": 1.8958, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1385390428211587, |
|
"grad_norm": 0.47888582944869995, |
|
"learning_rate": 8.639798488664987e-06, |
|
"loss": 1.8924, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14105793450881612, |
|
"grad_norm": 0.4759175479412079, |
|
"learning_rate": 8.614609571788414e-06, |
|
"loss": 1.8559, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.14357682619647355, |
|
"grad_norm": 0.47866225242614746, |
|
"learning_rate": 8.58942065491184e-06, |
|
"loss": 1.8297, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.14609571788413098, |
|
"grad_norm": 0.47261252999305725, |
|
"learning_rate": 8.564231738035265e-06, |
|
"loss": 1.9205, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1486146095717884, |
|
"grad_norm": 0.4570164978504181, |
|
"learning_rate": 8.53904282115869e-06, |
|
"loss": 1.8286, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15113350125944586, |
|
"grad_norm": 0.45629221200942993, |
|
"learning_rate": 8.513853904282117e-06, |
|
"loss": 1.8307, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15365239294710328, |
|
"grad_norm": 0.4506438374519348, |
|
"learning_rate": 8.488664987405543e-06, |
|
"loss": 1.8264, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.1561712846347607, |
|
"grad_norm": 0.46889957785606384, |
|
"learning_rate": 8.463476070528968e-06, |
|
"loss": 1.8619, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.15869017632241814, |
|
"grad_norm": 0.4415088891983032, |
|
"learning_rate": 8.438287153652393e-06, |
|
"loss": 1.8088, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.16120906801007556, |
|
"grad_norm": 0.6827300786972046, |
|
"learning_rate": 8.41309823677582e-06, |
|
"loss": 1.8091, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.163727959697733, |
|
"grad_norm": 0.4396965503692627, |
|
"learning_rate": 8.387909319899244e-06, |
|
"loss": 1.8235, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.16624685138539042, |
|
"grad_norm": 0.4572596549987793, |
|
"learning_rate": 8.36272040302267e-06, |
|
"loss": 1.8473, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.16876574307304787, |
|
"grad_norm": 0.47288888692855835, |
|
"learning_rate": 8.337531486146096e-06, |
|
"loss": 1.7847, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.1712846347607053, |
|
"grad_norm": 0.42984092235565186, |
|
"learning_rate": 8.312342569269522e-06, |
|
"loss": 1.8235, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.17380352644836272, |
|
"grad_norm": 0.4297022521495819, |
|
"learning_rate": 8.287153652392947e-06, |
|
"loss": 1.7944, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.17632241813602015, |
|
"grad_norm": 0.44730344414711, |
|
"learning_rate": 8.261964735516374e-06, |
|
"loss": 1.8026, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.17884130982367757, |
|
"grad_norm": 0.45562756061553955, |
|
"learning_rate": 8.2367758186398e-06, |
|
"loss": 1.8084, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.181360201511335, |
|
"grad_norm": 0.43180692195892334, |
|
"learning_rate": 8.211586901763225e-06, |
|
"loss": 1.804, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.18387909319899245, |
|
"grad_norm": 0.4151434302330017, |
|
"learning_rate": 8.18639798488665e-06, |
|
"loss": 1.745, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.18639798488664988, |
|
"grad_norm": 0.42020657658576965, |
|
"learning_rate": 8.161209068010076e-06, |
|
"loss": 1.729, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1889168765743073, |
|
"grad_norm": 0.4290010631084442, |
|
"learning_rate": 8.136020151133503e-06, |
|
"loss": 1.7916, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.19143576826196473, |
|
"grad_norm": 0.4147432744503021, |
|
"learning_rate": 8.110831234256928e-06, |
|
"loss": 1.7515, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.19395465994962216, |
|
"grad_norm": 0.4140765964984894, |
|
"learning_rate": 8.085642317380353e-06, |
|
"loss": 1.7494, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.1964735516372796, |
|
"grad_norm": 0.42620202898979187, |
|
"learning_rate": 8.06045340050378e-06, |
|
"loss": 1.7654, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.19899244332493704, |
|
"grad_norm": 0.46347954869270325, |
|
"learning_rate": 8.035264483627204e-06, |
|
"loss": 1.7229, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.20151133501259447, |
|
"grad_norm": 0.44873306155204773, |
|
"learning_rate": 8.01007556675063e-06, |
|
"loss": 1.7153, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2040302267002519, |
|
"grad_norm": 0.46032124757766724, |
|
"learning_rate": 7.984886649874056e-06, |
|
"loss": 1.7187, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.20654911838790932, |
|
"grad_norm": 0.40681278705596924, |
|
"learning_rate": 7.959697732997482e-06, |
|
"loss": 1.7562, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.20906801007556675, |
|
"grad_norm": 0.4236059784889221, |
|
"learning_rate": 7.934508816120907e-06, |
|
"loss": 1.7158, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.21158690176322417, |
|
"grad_norm": 0.42215806245803833, |
|
"learning_rate": 7.909319899244334e-06, |
|
"loss": 1.7113, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2141057934508816, |
|
"grad_norm": 0.4768604040145874, |
|
"learning_rate": 7.884130982367758e-06, |
|
"loss": 1.7382, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.21662468513853905, |
|
"grad_norm": 0.5776296854019165, |
|
"learning_rate": 7.858942065491185e-06, |
|
"loss": 1.7144, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.21914357682619648, |
|
"grad_norm": 0.42450252175331116, |
|
"learning_rate": 7.83375314861461e-06, |
|
"loss": 1.7112, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2216624685138539, |
|
"grad_norm": 0.5352158546447754, |
|
"learning_rate": 7.808564231738036e-06, |
|
"loss": 1.7534, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.22418136020151133, |
|
"grad_norm": 0.42181944847106934, |
|
"learning_rate": 7.783375314861463e-06, |
|
"loss": 1.6889, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.22670025188916876, |
|
"grad_norm": 0.41291770339012146, |
|
"learning_rate": 7.758186397984888e-06, |
|
"loss": 1.7026, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.22921914357682618, |
|
"grad_norm": 0.4190363883972168, |
|
"learning_rate": 7.732997481108313e-06, |
|
"loss": 1.736, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.23173803526448364, |
|
"grad_norm": 0.48135876655578613, |
|
"learning_rate": 7.70780856423174e-06, |
|
"loss": 1.732, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.23425692695214106, |
|
"grad_norm": 0.4111890494823456, |
|
"learning_rate": 7.682619647355164e-06, |
|
"loss": 1.7054, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2367758186397985, |
|
"grad_norm": 0.5191890597343445, |
|
"learning_rate": 7.65743073047859e-06, |
|
"loss": 1.7147, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.23929471032745592, |
|
"grad_norm": 0.4190960228443146, |
|
"learning_rate": 7.632241813602015e-06, |
|
"loss": 1.7128, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.24181360201511334, |
|
"grad_norm": 0.4258028566837311, |
|
"learning_rate": 7.607052896725441e-06, |
|
"loss": 1.7264, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.24433249370277077, |
|
"grad_norm": 0.4177513122558594, |
|
"learning_rate": 7.581863979848867e-06, |
|
"loss": 1.6779, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.24685138539042822, |
|
"grad_norm": 0.4666061997413635, |
|
"learning_rate": 7.5566750629722926e-06, |
|
"loss": 1.6453, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.24937027707808565, |
|
"grad_norm": 0.4100574553012848, |
|
"learning_rate": 7.531486146095719e-06, |
|
"loss": 1.6963, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.2518891687657431, |
|
"grad_norm": 0.4570634067058563, |
|
"learning_rate": 7.506297229219144e-06, |
|
"loss": 1.7032, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25440806045340053, |
|
"grad_norm": 0.42653965950012207, |
|
"learning_rate": 7.48110831234257e-06, |
|
"loss": 1.6624, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.25692695214105793, |
|
"grad_norm": 0.4480111300945282, |
|
"learning_rate": 7.455919395465996e-06, |
|
"loss": 1.7011, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2594458438287154, |
|
"grad_norm": 0.4271489977836609, |
|
"learning_rate": 7.430730478589421e-06, |
|
"loss": 1.6716, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.2619647355163728, |
|
"grad_norm": 0.41798117756843567, |
|
"learning_rate": 7.405541561712847e-06, |
|
"loss": 1.6759, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.26448362720403024, |
|
"grad_norm": 0.40775346755981445, |
|
"learning_rate": 7.3803526448362725e-06, |
|
"loss": 1.6778, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.26700251889168763, |
|
"grad_norm": 0.41410258412361145, |
|
"learning_rate": 7.355163727959699e-06, |
|
"loss": 1.6924, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2695214105793451, |
|
"grad_norm": 0.42046648263931274, |
|
"learning_rate": 7.329974811083124e-06, |
|
"loss": 1.6468, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.27204030226700254, |
|
"grad_norm": 0.4312984347343445, |
|
"learning_rate": 7.30478589420655e-06, |
|
"loss": 1.7088, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.27455919395465994, |
|
"grad_norm": 0.4318784475326538, |
|
"learning_rate": 7.279596977329975e-06, |
|
"loss": 1.6773, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.2770780856423174, |
|
"grad_norm": 0.4189915359020233, |
|
"learning_rate": 7.254408060453401e-06, |
|
"loss": 1.6194, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2795969773299748, |
|
"grad_norm": 0.5198895931243896, |
|
"learning_rate": 7.229219143576827e-06, |
|
"loss": 1.6848, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.28211586901763225, |
|
"grad_norm": 0.5195222496986389, |
|
"learning_rate": 7.2040302267002524e-06, |
|
"loss": 1.6813, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.28463476070528965, |
|
"grad_norm": 0.45624077320098877, |
|
"learning_rate": 7.178841309823679e-06, |
|
"loss": 1.6448, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.2871536523929471, |
|
"grad_norm": 0.49435746669769287, |
|
"learning_rate": 7.153652392947104e-06, |
|
"loss": 1.6584, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.28967254408060455, |
|
"grad_norm": 0.4301837086677551, |
|
"learning_rate": 7.1284634760705296e-06, |
|
"loss": 1.6652, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.29219143576826195, |
|
"grad_norm": 0.4709468185901642, |
|
"learning_rate": 7.103274559193955e-06, |
|
"loss": 1.6371, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.2947103274559194, |
|
"grad_norm": 0.45211878418922424, |
|
"learning_rate": 7.07808564231738e-06, |
|
"loss": 1.6672, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.2972292191435768, |
|
"grad_norm": 0.4376428723335266, |
|
"learning_rate": 7.052896725440807e-06, |
|
"loss": 1.695, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.29974811083123426, |
|
"grad_norm": 0.4894670844078064, |
|
"learning_rate": 7.027707808564232e-06, |
|
"loss": 1.6617, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.3022670025188917, |
|
"grad_norm": 0.454942911863327, |
|
"learning_rate": 7.002518891687659e-06, |
|
"loss": 1.5997, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3047858942065491, |
|
"grad_norm": 0.547237753868103, |
|
"learning_rate": 6.977329974811084e-06, |
|
"loss": 1.6714, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.30730478589420657, |
|
"grad_norm": 0.43378302454948425, |
|
"learning_rate": 6.9521410579345095e-06, |
|
"loss": 1.6393, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.30982367758186397, |
|
"grad_norm": 0.43780213594436646, |
|
"learning_rate": 6.926952141057935e-06, |
|
"loss": 1.6717, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3123425692695214, |
|
"grad_norm": 0.4194709062576294, |
|
"learning_rate": 6.90176322418136e-06, |
|
"loss": 1.6464, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3148614609571788, |
|
"grad_norm": 0.42024093866348267, |
|
"learning_rate": 6.876574307304787e-06, |
|
"loss": 1.6275, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.31738035264483627, |
|
"grad_norm": 0.4303475022315979, |
|
"learning_rate": 6.851385390428212e-06, |
|
"loss": 1.6375, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3198992443324937, |
|
"grad_norm": 0.42517420649528503, |
|
"learning_rate": 6.826196473551638e-06, |
|
"loss": 1.6363, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3224181360201511, |
|
"grad_norm": 0.42485958337783813, |
|
"learning_rate": 6.801007556675063e-06, |
|
"loss": 1.6302, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3249370277078086, |
|
"grad_norm": 0.4118500053882599, |
|
"learning_rate": 6.7758186397984894e-06, |
|
"loss": 1.6174, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.327455919395466, |
|
"grad_norm": 0.4531554579734802, |
|
"learning_rate": 6.750629722921915e-06, |
|
"loss": 1.6283, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.32997481108312343, |
|
"grad_norm": 0.41642168164253235, |
|
"learning_rate": 6.72544080604534e-06, |
|
"loss": 1.6328, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.33249370277078083, |
|
"grad_norm": 0.4234478771686554, |
|
"learning_rate": 6.7002518891687666e-06, |
|
"loss": 1.6393, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3350125944584383, |
|
"grad_norm": 0.4314388930797577, |
|
"learning_rate": 6.675062972292192e-06, |
|
"loss": 1.6371, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.33753148614609574, |
|
"grad_norm": 0.49057596921920776, |
|
"learning_rate": 6.649874055415617e-06, |
|
"loss": 1.6777, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.34005037783375314, |
|
"grad_norm": 0.6537044644355774, |
|
"learning_rate": 6.624685138539043e-06, |
|
"loss": 1.6021, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.3425692695214106, |
|
"grad_norm": 0.44913142919540405, |
|
"learning_rate": 6.599496221662469e-06, |
|
"loss": 1.6369, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.345088161209068, |
|
"grad_norm": 0.41060981154441833, |
|
"learning_rate": 6.574307304785895e-06, |
|
"loss": 1.6001, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.34760705289672544, |
|
"grad_norm": 0.44041162729263306, |
|
"learning_rate": 6.54911838790932e-06, |
|
"loss": 1.6352, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3501259445843829, |
|
"grad_norm": 0.42245715856552124, |
|
"learning_rate": 6.5239294710327465e-06, |
|
"loss": 1.6195, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3526448362720403, |
|
"grad_norm": 0.4138522446155548, |
|
"learning_rate": 6.498740554156172e-06, |
|
"loss": 1.6254, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.35516372795969775, |
|
"grad_norm": 0.42503440380096436, |
|
"learning_rate": 6.473551637279597e-06, |
|
"loss": 1.6187, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.35768261964735515, |
|
"grad_norm": 0.5783386826515198, |
|
"learning_rate": 6.448362720403023e-06, |
|
"loss": 1.6704, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3602015113350126, |
|
"grad_norm": 0.4822537302970886, |
|
"learning_rate": 6.423173803526449e-06, |
|
"loss": 1.5762, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.36272040302267, |
|
"grad_norm": 0.43411409854888916, |
|
"learning_rate": 6.397984886649875e-06, |
|
"loss": 1.6067, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.36523929471032746, |
|
"grad_norm": 0.43474212288856506, |
|
"learning_rate": 6.3727959697733e-06, |
|
"loss": 1.6198, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.3677581863979849, |
|
"grad_norm": 0.4297161400318146, |
|
"learning_rate": 6.347607052896726e-06, |
|
"loss": 1.6004, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.3702770780856423, |
|
"grad_norm": 0.7553440928459167, |
|
"learning_rate": 6.322418136020152e-06, |
|
"loss": 1.7129, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.37279596977329976, |
|
"grad_norm": 0.4365249574184418, |
|
"learning_rate": 6.297229219143577e-06, |
|
"loss": 1.6186, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.37531486146095716, |
|
"grad_norm": 0.4731481373310089, |
|
"learning_rate": 6.272040302267003e-06, |
|
"loss": 1.5799, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3778337531486146, |
|
"grad_norm": 0.44013121724128723, |
|
"learning_rate": 6.246851385390429e-06, |
|
"loss": 1.6117, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.380352644836272, |
|
"grad_norm": 0.4383363425731659, |
|
"learning_rate": 6.221662468513855e-06, |
|
"loss": 1.5916, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.38287153652392947, |
|
"grad_norm": 0.4586566686630249, |
|
"learning_rate": 6.19647355163728e-06, |
|
"loss": 1.5987, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3853904282115869, |
|
"grad_norm": 0.5225487351417542, |
|
"learning_rate": 6.1712846347607055e-06, |
|
"loss": 1.5497, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.3879093198992443, |
|
"grad_norm": 0.4564357399940491, |
|
"learning_rate": 6.146095717884132e-06, |
|
"loss": 1.6016, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.3904282115869018, |
|
"grad_norm": 0.49243706464767456, |
|
"learning_rate": 6.120906801007557e-06, |
|
"loss": 1.6004, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.3929471032745592, |
|
"grad_norm": 0.6145833134651184, |
|
"learning_rate": 6.095717884130983e-06, |
|
"loss": 1.5891, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.3954659949622166, |
|
"grad_norm": 0.4326134920120239, |
|
"learning_rate": 6.070528967254408e-06, |
|
"loss": 1.6108, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.3979848866498741, |
|
"grad_norm": 0.45841845870018005, |
|
"learning_rate": 6.045340050377835e-06, |
|
"loss": 1.6044, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4005037783375315, |
|
"grad_norm": 0.5934171676635742, |
|
"learning_rate": 6.02015113350126e-06, |
|
"loss": 1.6273, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.40302267002518893, |
|
"grad_norm": 0.5909122824668884, |
|
"learning_rate": 5.9949622166246855e-06, |
|
"loss": 1.5819, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.40554156171284633, |
|
"grad_norm": 0.47986772656440735, |
|
"learning_rate": 5.969773299748112e-06, |
|
"loss": 1.6292, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.4080604534005038, |
|
"grad_norm": 0.43019899725914, |
|
"learning_rate": 5.944584382871537e-06, |
|
"loss": 1.6219, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4105793450881612, |
|
"grad_norm": 0.44603484869003296, |
|
"learning_rate": 5.919395465994963e-06, |
|
"loss": 1.6177, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.41309823677581864, |
|
"grad_norm": 0.6486812233924866, |
|
"learning_rate": 5.894206549118388e-06, |
|
"loss": 1.6169, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4156171284634761, |
|
"grad_norm": 0.4344078600406647, |
|
"learning_rate": 5.869017632241813e-06, |
|
"loss": 1.6131, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4181360201511335, |
|
"grad_norm": 0.4963393211364746, |
|
"learning_rate": 5.84382871536524e-06, |
|
"loss": 1.5649, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.42065491183879095, |
|
"grad_norm": 0.4491269588470459, |
|
"learning_rate": 5.818639798488665e-06, |
|
"loss": 1.5886, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.42317380352644834, |
|
"grad_norm": 0.44954273104667664, |
|
"learning_rate": 5.793450881612092e-06, |
|
"loss": 1.5514, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4256926952141058, |
|
"grad_norm": 0.5957120060920715, |
|
"learning_rate": 5.768261964735517e-06, |
|
"loss": 1.5656, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4282115869017632, |
|
"grad_norm": 0.4787919223308563, |
|
"learning_rate": 5.7430730478589425e-06, |
|
"loss": 1.5906, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.43073047858942065, |
|
"grad_norm": 0.4297046959400177, |
|
"learning_rate": 5.717884130982368e-06, |
|
"loss": 1.5676, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.4332493702770781, |
|
"grad_norm": 0.4834885597229004, |
|
"learning_rate": 5.692695214105793e-06, |
|
"loss": 1.5672, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4357682619647355, |
|
"grad_norm": 0.5278275012969971, |
|
"learning_rate": 5.66750629722922e-06, |
|
"loss": 1.5994, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.43828715365239296, |
|
"grad_norm": 0.4892403185367584, |
|
"learning_rate": 5.642317380352645e-06, |
|
"loss": 1.5845, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.44080604534005036, |
|
"grad_norm": 0.5153166055679321, |
|
"learning_rate": 5.617128463476071e-06, |
|
"loss": 1.5573, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4433249370277078, |
|
"grad_norm": 0.5289381146430969, |
|
"learning_rate": 5.591939546599497e-06, |
|
"loss": 1.5658, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.44584382871536526, |
|
"grad_norm": 0.45170825719833374, |
|
"learning_rate": 5.5667506297229225e-06, |
|
"loss": 1.5322, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.44836272040302266, |
|
"grad_norm": 0.45414310693740845, |
|
"learning_rate": 5.541561712846348e-06, |
|
"loss": 1.5872, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4508816120906801, |
|
"grad_norm": 0.47673285007476807, |
|
"learning_rate": 5.516372795969773e-06, |
|
"loss": 1.603, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.4534005037783375, |
|
"grad_norm": 0.4653848707675934, |
|
"learning_rate": 5.4911838790931996e-06, |
|
"loss": 1.5235, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.45591939546599497, |
|
"grad_norm": 0.4475414752960205, |
|
"learning_rate": 5.465994962216625e-06, |
|
"loss": 1.5671, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.45843828715365237, |
|
"grad_norm": 0.48499029874801636, |
|
"learning_rate": 5.440806045340051e-06, |
|
"loss": 1.5912, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.4609571788413098, |
|
"grad_norm": 0.4531858563423157, |
|
"learning_rate": 5.415617128463476e-06, |
|
"loss": 1.541, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4634760705289673, |
|
"grad_norm": 0.44078829884529114, |
|
"learning_rate": 5.390428211586902e-06, |
|
"loss": 1.583, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4659949622166247, |
|
"grad_norm": 0.47280648350715637, |
|
"learning_rate": 5.365239294710328e-06, |
|
"loss": 1.6233, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.46851385390428213, |
|
"grad_norm": 0.5612819194793701, |
|
"learning_rate": 5.340050377833753e-06, |
|
"loss": 1.6078, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.47103274559193953, |
|
"grad_norm": 0.4777447283267975, |
|
"learning_rate": 5.3148614609571795e-06, |
|
"loss": 1.5722, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.473551637279597, |
|
"grad_norm": 0.49805429577827454, |
|
"learning_rate": 5.289672544080605e-06, |
|
"loss": 1.6243, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4760705289672544, |
|
"grad_norm": 0.4395243525505066, |
|
"learning_rate": 5.264483627204031e-06, |
|
"loss": 1.5497, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.47858942065491183, |
|
"grad_norm": 0.7493352890014648, |
|
"learning_rate": 5.239294710327456e-06, |
|
"loss": 1.6466, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4811083123425693, |
|
"grad_norm": 0.5018370747566223, |
|
"learning_rate": 5.214105793450882e-06, |
|
"loss": 1.5492, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.4836272040302267, |
|
"grad_norm": 0.4791150391101837, |
|
"learning_rate": 5.188916876574308e-06, |
|
"loss": 1.5679, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.48614609571788414, |
|
"grad_norm": 0.4814487099647522, |
|
"learning_rate": 5.163727959697733e-06, |
|
"loss": 1.5595, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.48866498740554154, |
|
"grad_norm": 0.44743016362190247, |
|
"learning_rate": 5.138539042821159e-06, |
|
"loss": 1.5971, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.491183879093199, |
|
"grad_norm": 0.47840508818626404, |
|
"learning_rate": 5.113350125944585e-06, |
|
"loss": 1.5414, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.49370277078085645, |
|
"grad_norm": 0.4497021436691284, |
|
"learning_rate": 5.088161209068011e-06, |
|
"loss": 1.5595, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.49622166246851385, |
|
"grad_norm": 0.49746012687683105, |
|
"learning_rate": 5.062972292191436e-06, |
|
"loss": 1.5403, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.4987405541561713, |
|
"grad_norm": 0.4701424837112427, |
|
"learning_rate": 5.037783375314862e-06, |
|
"loss": 1.5597, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5012594458438288, |
|
"grad_norm": 0.4464475214481354, |
|
"learning_rate": 5.012594458438288e-06, |
|
"loss": 1.5436, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5037783375314862, |
|
"grad_norm": 0.5158559083938599, |
|
"learning_rate": 4.987405541561714e-06, |
|
"loss": 1.5638, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5062972292191436, |
|
"grad_norm": 0.5568498969078064, |
|
"learning_rate": 4.9622166246851385e-06, |
|
"loss": 1.5968, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5088161209068011, |
|
"grad_norm": 0.4441608488559723, |
|
"learning_rate": 4.937027707808565e-06, |
|
"loss": 1.54, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5113350125944585, |
|
"grad_norm": 0.4909915328025818, |
|
"learning_rate": 4.91183879093199e-06, |
|
"loss": 1.5439, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5138539042821159, |
|
"grad_norm": 0.4911031424999237, |
|
"learning_rate": 4.886649874055416e-06, |
|
"loss": 1.5438, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5163727959697733, |
|
"grad_norm": 0.7304896116256714, |
|
"learning_rate": 4.861460957178842e-06, |
|
"loss": 1.5061, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5188916876574308, |
|
"grad_norm": 0.4542643129825592, |
|
"learning_rate": 4.836272040302267e-06, |
|
"loss": 1.5738, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5214105793450882, |
|
"grad_norm": 0.8241648077964783, |
|
"learning_rate": 4.811083123425694e-06, |
|
"loss": 1.5982, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5239294710327456, |
|
"grad_norm": 0.45886871218681335, |
|
"learning_rate": 4.7858942065491185e-06, |
|
"loss": 1.5594, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5264483627204031, |
|
"grad_norm": 0.5265582799911499, |
|
"learning_rate": 4.760705289672544e-06, |
|
"loss": 1.57, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5289672544080605, |
|
"grad_norm": 0.46276602149009705, |
|
"learning_rate": 4.73551637279597e-06, |
|
"loss": 1.5475, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5314861460957179, |
|
"grad_norm": 0.5516127943992615, |
|
"learning_rate": 4.710327455919396e-06, |
|
"loss": 1.5497, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5340050377833753, |
|
"grad_norm": 0.485507071018219, |
|
"learning_rate": 4.685138539042821e-06, |
|
"loss": 1.5954, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5365239294710328, |
|
"grad_norm": 0.4667035937309265, |
|
"learning_rate": 4.659949622166247e-06, |
|
"loss": 1.5524, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5390428211586902, |
|
"grad_norm": 0.4725947082042694, |
|
"learning_rate": 4.6347607052896736e-06, |
|
"loss": 1.5701, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5415617128463476, |
|
"grad_norm": 0.48055243492126465, |
|
"learning_rate": 4.609571788413098e-06, |
|
"loss": 1.512, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5440806045340051, |
|
"grad_norm": 0.47020798921585083, |
|
"learning_rate": 4.584382871536524e-06, |
|
"loss": 1.517, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5465994962216625, |
|
"grad_norm": 0.458790123462677, |
|
"learning_rate": 4.55919395465995e-06, |
|
"loss": 1.5963, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5491183879093199, |
|
"grad_norm": 0.46757379174232483, |
|
"learning_rate": 4.5340050377833755e-06, |
|
"loss": 1.5307, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5516372795969773, |
|
"grad_norm": 0.48817694187164307, |
|
"learning_rate": 4.508816120906801e-06, |
|
"loss": 1.5096, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.5541561712846348, |
|
"grad_norm": 0.46775302290916443, |
|
"learning_rate": 4.483627204030227e-06, |
|
"loss": 1.5081, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5566750629722922, |
|
"grad_norm": 0.4632299244403839, |
|
"learning_rate": 4.458438287153653e-06, |
|
"loss": 1.5274, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.5591939546599496, |
|
"grad_norm": 0.6220762729644775, |
|
"learning_rate": 4.433249370277078e-06, |
|
"loss": 1.4909, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5617128463476071, |
|
"grad_norm": 0.4639570713043213, |
|
"learning_rate": 4.408060453400504e-06, |
|
"loss": 1.531, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5642317380352645, |
|
"grad_norm": 0.48596182465553284, |
|
"learning_rate": 4.38287153652393e-06, |
|
"loss": 1.522, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5667506297229219, |
|
"grad_norm": 0.4745020866394043, |
|
"learning_rate": 4.3576826196473555e-06, |
|
"loss": 1.5323, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5692695214105793, |
|
"grad_norm": 0.5056527853012085, |
|
"learning_rate": 4.332493702770781e-06, |
|
"loss": 1.5374, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5717884130982368, |
|
"grad_norm": 0.44245389103889465, |
|
"learning_rate": 4.307304785894207e-06, |
|
"loss": 1.5169, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.5743073047858942, |
|
"grad_norm": 0.4938381016254425, |
|
"learning_rate": 4.282115869017633e-06, |
|
"loss": 1.5192, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5768261964735516, |
|
"grad_norm": 0.4689100384712219, |
|
"learning_rate": 4.256926952141058e-06, |
|
"loss": 1.5666, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.5793450881612091, |
|
"grad_norm": 0.5333397388458252, |
|
"learning_rate": 4.231738035264484e-06, |
|
"loss": 1.5562, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5818639798488665, |
|
"grad_norm": 0.5024259090423584, |
|
"learning_rate": 4.20654911838791e-06, |
|
"loss": 1.5135, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.5843828715365239, |
|
"grad_norm": 0.46757936477661133, |
|
"learning_rate": 4.181360201511335e-06, |
|
"loss": 1.522, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5869017632241813, |
|
"grad_norm": 0.5455654263496399, |
|
"learning_rate": 4.156171284634761e-06, |
|
"loss": 1.5281, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.5894206549118388, |
|
"grad_norm": 0.48288044333457947, |
|
"learning_rate": 4.130982367758187e-06, |
|
"loss": 1.5252, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5919395465994962, |
|
"grad_norm": 0.44919902086257935, |
|
"learning_rate": 4.1057934508816125e-06, |
|
"loss": 1.5371, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.5944584382871536, |
|
"grad_norm": 0.4358011782169342, |
|
"learning_rate": 4.080604534005038e-06, |
|
"loss": 1.5419, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5969773299748111, |
|
"grad_norm": 0.518595278263092, |
|
"learning_rate": 4.055415617128464e-06, |
|
"loss": 1.538, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.5994962216624685, |
|
"grad_norm": 0.6567726135253906, |
|
"learning_rate": 4.03022670025189e-06, |
|
"loss": 1.4867, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6020151133501259, |
|
"grad_norm": 0.48650607466697693, |
|
"learning_rate": 4.005037783375315e-06, |
|
"loss": 1.494, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6045340050377834, |
|
"grad_norm": 0.6559653878211975, |
|
"learning_rate": 3.979848866498741e-06, |
|
"loss": 1.54, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6070528967254408, |
|
"grad_norm": 0.45548874139785767, |
|
"learning_rate": 3.954659949622167e-06, |
|
"loss": 1.5148, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6095717884130982, |
|
"grad_norm": 0.6561994552612305, |
|
"learning_rate": 3.9294710327455925e-06, |
|
"loss": 1.5244, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6120906801007556, |
|
"grad_norm": 0.46143561601638794, |
|
"learning_rate": 3.904282115869018e-06, |
|
"loss": 1.5315, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6146095717884131, |
|
"grad_norm": 0.537300705909729, |
|
"learning_rate": 3.879093198992444e-06, |
|
"loss": 1.5424, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6171284634760705, |
|
"grad_norm": 0.46460816264152527, |
|
"learning_rate": 3.85390428211587e-06, |
|
"loss": 1.4941, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6196473551637279, |
|
"grad_norm": 0.48894399404525757, |
|
"learning_rate": 3.828715365239295e-06, |
|
"loss": 1.5294, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6221662468513854, |
|
"grad_norm": 0.4623178541660309, |
|
"learning_rate": 3.8035264483627206e-06, |
|
"loss": 1.5068, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6246851385390428, |
|
"grad_norm": 0.49979573488235474, |
|
"learning_rate": 3.7783375314861463e-06, |
|
"loss": 1.4801, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6272040302267002, |
|
"grad_norm": 0.5378308296203613, |
|
"learning_rate": 3.753148614609572e-06, |
|
"loss": 1.5444, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6297229219143576, |
|
"grad_norm": 0.5385175347328186, |
|
"learning_rate": 3.727959697732998e-06, |
|
"loss": 1.5249, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6322418136020151, |
|
"grad_norm": 0.46512940526008606, |
|
"learning_rate": 3.7027707808564234e-06, |
|
"loss": 1.5082, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6347607052896725, |
|
"grad_norm": 0.6099820733070374, |
|
"learning_rate": 3.6775818639798495e-06, |
|
"loss": 1.5297, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6372795969773299, |
|
"grad_norm": 0.4563128650188446, |
|
"learning_rate": 3.652392947103275e-06, |
|
"loss": 1.5108, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6397984886649875, |
|
"grad_norm": 0.4638257324695587, |
|
"learning_rate": 3.6272040302267005e-06, |
|
"loss": 1.492, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6423173803526449, |
|
"grad_norm": 0.4734160602092743, |
|
"learning_rate": 3.6020151133501262e-06, |
|
"loss": 1.5113, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6448362720403022, |
|
"grad_norm": 0.4613577127456665, |
|
"learning_rate": 3.576826196473552e-06, |
|
"loss": 1.5352, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6473551637279596, |
|
"grad_norm": 0.6752243638038635, |
|
"learning_rate": 3.5516372795969776e-06, |
|
"loss": 1.492, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6498740554156172, |
|
"grad_norm": 0.4645501673221588, |
|
"learning_rate": 3.5264483627204033e-06, |
|
"loss": 1.4993, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6523929471032746, |
|
"grad_norm": 0.5898957252502441, |
|
"learning_rate": 3.5012594458438295e-06, |
|
"loss": 1.4917, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.654911838790932, |
|
"grad_norm": 0.4554866552352905, |
|
"learning_rate": 3.4760705289672547e-06, |
|
"loss": 1.5192, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6574307304785895, |
|
"grad_norm": 0.4567941427230835, |
|
"learning_rate": 3.45088161209068e-06, |
|
"loss": 1.5442, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6599496221662469, |
|
"grad_norm": 0.4824671447277069, |
|
"learning_rate": 3.425692695214106e-06, |
|
"loss": 1.5348, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6624685138539043, |
|
"grad_norm": 0.4494476616382599, |
|
"learning_rate": 3.4005037783375314e-06, |
|
"loss": 1.5278, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6649874055415617, |
|
"grad_norm": 0.5391709208488464, |
|
"learning_rate": 3.3753148614609576e-06, |
|
"loss": 1.5277, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6675062972292192, |
|
"grad_norm": 0.4483042061328888, |
|
"learning_rate": 3.3501259445843833e-06, |
|
"loss": 1.4955, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6700251889168766, |
|
"grad_norm": 0.46210387349128723, |
|
"learning_rate": 3.3249370277078086e-06, |
|
"loss": 1.5077, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.672544080604534, |
|
"grad_norm": 0.5058848261833191, |
|
"learning_rate": 3.2997481108312347e-06, |
|
"loss": 1.4645, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.6750629722921915, |
|
"grad_norm": 0.4964057207107544, |
|
"learning_rate": 3.27455919395466e-06, |
|
"loss": 1.4897, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6775818639798489, |
|
"grad_norm": 0.46125808358192444, |
|
"learning_rate": 3.249370277078086e-06, |
|
"loss": 1.5414, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.6801007556675063, |
|
"grad_norm": 0.488656222820282, |
|
"learning_rate": 3.2241813602015114e-06, |
|
"loss": 1.4999, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6826196473551638, |
|
"grad_norm": 0.4692099988460541, |
|
"learning_rate": 3.1989924433249375e-06, |
|
"loss": 1.5402, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.6851385390428212, |
|
"grad_norm": 0.49234357476234436, |
|
"learning_rate": 3.173803526448363e-06, |
|
"loss": 1.5373, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6876574307304786, |
|
"grad_norm": 0.596118152141571, |
|
"learning_rate": 3.1486146095717885e-06, |
|
"loss": 1.5145, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.690176322418136, |
|
"grad_norm": 0.4749690890312195, |
|
"learning_rate": 3.1234256926952146e-06, |
|
"loss": 1.4973, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.6926952141057935, |
|
"grad_norm": 0.4940085709095001, |
|
"learning_rate": 3.09823677581864e-06, |
|
"loss": 1.464, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6952141057934509, |
|
"grad_norm": 0.47270411252975464, |
|
"learning_rate": 3.073047858942066e-06, |
|
"loss": 1.5094, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6977329974811083, |
|
"grad_norm": 0.4631718695163727, |
|
"learning_rate": 3.0478589420654913e-06, |
|
"loss": 1.4893, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.7002518891687658, |
|
"grad_norm": 0.5515400171279907, |
|
"learning_rate": 3.0226700251889174e-06, |
|
"loss": 1.5342, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7027707808564232, |
|
"grad_norm": 0.5326355695724487, |
|
"learning_rate": 2.9974811083123427e-06, |
|
"loss": 1.5263, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7052896725440806, |
|
"grad_norm": 0.45032408833503723, |
|
"learning_rate": 2.9722921914357684e-06, |
|
"loss": 1.4977, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.707808564231738, |
|
"grad_norm": 0.49274197220802307, |
|
"learning_rate": 2.947103274559194e-06, |
|
"loss": 1.4729, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.7103274559193955, |
|
"grad_norm": 0.45705220103263855, |
|
"learning_rate": 2.92191435768262e-06, |
|
"loss": 1.4908, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7128463476070529, |
|
"grad_norm": 0.46655991673469543, |
|
"learning_rate": 2.896725440806046e-06, |
|
"loss": 1.503, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.7153652392947103, |
|
"grad_norm": 0.5047741532325745, |
|
"learning_rate": 2.8715365239294713e-06, |
|
"loss": 1.4656, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7178841309823678, |
|
"grad_norm": 0.4772416949272156, |
|
"learning_rate": 2.8463476070528965e-06, |
|
"loss": 1.4664, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7204030226700252, |
|
"grad_norm": 0.4567766487598419, |
|
"learning_rate": 2.8211586901763227e-06, |
|
"loss": 1.5123, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7229219143576826, |
|
"grad_norm": 0.4822060763835907, |
|
"learning_rate": 2.7959697732997484e-06, |
|
"loss": 1.5079, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.72544080604534, |
|
"grad_norm": 0.637371301651001, |
|
"learning_rate": 2.770780856423174e-06, |
|
"loss": 1.472, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7279596977329975, |
|
"grad_norm": 0.4881971478462219, |
|
"learning_rate": 2.7455919395465998e-06, |
|
"loss": 1.4737, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7304785894206549, |
|
"grad_norm": 0.4653415381908417, |
|
"learning_rate": 2.7204030226700255e-06, |
|
"loss": 1.5104, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7329974811083123, |
|
"grad_norm": 0.476697713136673, |
|
"learning_rate": 2.695214105793451e-06, |
|
"loss": 1.5072, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7355163727959698, |
|
"grad_norm": 0.6168654561042786, |
|
"learning_rate": 2.6700251889168765e-06, |
|
"loss": 1.5142, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7380352644836272, |
|
"grad_norm": 0.6653453707695007, |
|
"learning_rate": 2.6448362720403026e-06, |
|
"loss": 1.4897, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7405541561712846, |
|
"grad_norm": 0.4866642951965332, |
|
"learning_rate": 2.619647355163728e-06, |
|
"loss": 1.5409, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.743073047858942, |
|
"grad_norm": 0.4763050377368927, |
|
"learning_rate": 2.594458438287154e-06, |
|
"loss": 1.5306, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7455919395465995, |
|
"grad_norm": 0.5434437990188599, |
|
"learning_rate": 2.5692695214105793e-06, |
|
"loss": 1.5334, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7481108312342569, |
|
"grad_norm": 0.5760312080383301, |
|
"learning_rate": 2.5440806045340054e-06, |
|
"loss": 1.5138, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7506297229219143, |
|
"grad_norm": 0.44751110672950745, |
|
"learning_rate": 2.518891687657431e-06, |
|
"loss": 1.4845, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7531486146095718, |
|
"grad_norm": 0.4421987235546112, |
|
"learning_rate": 2.493702770780857e-06, |
|
"loss": 1.4837, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7556675062972292, |
|
"grad_norm": 0.7657718658447266, |
|
"learning_rate": 2.4685138539042825e-06, |
|
"loss": 1.5151, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7581863979848866, |
|
"grad_norm": 0.5052861571311951, |
|
"learning_rate": 2.443324937027708e-06, |
|
"loss": 1.5404, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.760705289672544, |
|
"grad_norm": 0.5251312851905823, |
|
"learning_rate": 2.4181360201511335e-06, |
|
"loss": 1.4329, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7632241813602015, |
|
"grad_norm": 0.46061962842941284, |
|
"learning_rate": 2.3929471032745592e-06, |
|
"loss": 1.4976, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.7657430730478589, |
|
"grad_norm": 0.4743208587169647, |
|
"learning_rate": 2.367758186397985e-06, |
|
"loss": 1.4939, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7682619647355163, |
|
"grad_norm": 0.4864160418510437, |
|
"learning_rate": 2.3425692695214107e-06, |
|
"loss": 1.4997, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.7707808564231738, |
|
"grad_norm": 0.47275349497795105, |
|
"learning_rate": 2.3173803526448368e-06, |
|
"loss": 1.4793, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7732997481108312, |
|
"grad_norm": 0.49562177062034607, |
|
"learning_rate": 2.292191435768262e-06, |
|
"loss": 1.4755, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.7758186397984886, |
|
"grad_norm": 0.564599335193634, |
|
"learning_rate": 2.2670025188916878e-06, |
|
"loss": 1.4932, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7783375314861462, |
|
"grad_norm": 0.4657755494117737, |
|
"learning_rate": 2.2418136020151135e-06, |
|
"loss": 1.5076, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.7808564231738035, |
|
"grad_norm": 0.486026793718338, |
|
"learning_rate": 2.216624685138539e-06, |
|
"loss": 1.5014, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.783375314861461, |
|
"grad_norm": 0.4599766135215759, |
|
"learning_rate": 2.191435768261965e-06, |
|
"loss": 1.5274, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.7858942065491183, |
|
"grad_norm": 0.47607848048210144, |
|
"learning_rate": 2.1662468513853906e-06, |
|
"loss": 1.4701, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.7884130982367759, |
|
"grad_norm": 0.47365328669548035, |
|
"learning_rate": 2.1410579345088163e-06, |
|
"loss": 1.4932, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.7909319899244333, |
|
"grad_norm": 0.45562124252319336, |
|
"learning_rate": 2.115869017632242e-06, |
|
"loss": 1.4912, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7934508816120907, |
|
"grad_norm": 0.5331164002418518, |
|
"learning_rate": 2.0906801007556677e-06, |
|
"loss": 1.5174, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7959697732997482, |
|
"grad_norm": 0.509325385093689, |
|
"learning_rate": 2.0654911838790934e-06, |
|
"loss": 1.4788, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7984886649874056, |
|
"grad_norm": 0.4969271123409271, |
|
"learning_rate": 2.040302267002519e-06, |
|
"loss": 1.5377, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.801007556675063, |
|
"grad_norm": 0.44712427258491516, |
|
"learning_rate": 2.015113350125945e-06, |
|
"loss": 1.5279, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8035264483627204, |
|
"grad_norm": 0.47016969323158264, |
|
"learning_rate": 1.9899244332493705e-06, |
|
"loss": 1.5309, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8060453400503779, |
|
"grad_norm": 0.5187602043151855, |
|
"learning_rate": 1.9647355163727962e-06, |
|
"loss": 1.507, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8085642317380353, |
|
"grad_norm": 0.4568648636341095, |
|
"learning_rate": 1.939546599496222e-06, |
|
"loss": 1.4517, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8110831234256927, |
|
"grad_norm": 0.4813389480113983, |
|
"learning_rate": 1.9143576826196476e-06, |
|
"loss": 1.5215, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8136020151133502, |
|
"grad_norm": 0.5260921716690063, |
|
"learning_rate": 1.8891687657430731e-06, |
|
"loss": 1.5154, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.8161209068010076, |
|
"grad_norm": 0.5113592743873596, |
|
"learning_rate": 1.863979848866499e-06, |
|
"loss": 1.4496, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.818639798488665, |
|
"grad_norm": 0.48540815711021423, |
|
"learning_rate": 1.8387909319899248e-06, |
|
"loss": 1.4874, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8211586901763224, |
|
"grad_norm": 0.4522131681442261, |
|
"learning_rate": 1.8136020151133503e-06, |
|
"loss": 1.4781, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8236775818639799, |
|
"grad_norm": 0.45719313621520996, |
|
"learning_rate": 1.788413098236776e-06, |
|
"loss": 1.4859, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.8261964735516373, |
|
"grad_norm": 0.43814224004745483, |
|
"learning_rate": 1.7632241813602017e-06, |
|
"loss": 1.4775, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8287153652392947, |
|
"grad_norm": 0.44290891289711, |
|
"learning_rate": 1.7380352644836274e-06, |
|
"loss": 1.5037, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8312342569269522, |
|
"grad_norm": 0.4844774603843689, |
|
"learning_rate": 1.712846347607053e-06, |
|
"loss": 1.5179, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8337531486146096, |
|
"grad_norm": 0.4434620440006256, |
|
"learning_rate": 1.6876574307304788e-06, |
|
"loss": 1.494, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.836272040302267, |
|
"grad_norm": 0.46283698081970215, |
|
"learning_rate": 1.6624685138539043e-06, |
|
"loss": 1.4889, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8387909319899244, |
|
"grad_norm": 0.471802681684494, |
|
"learning_rate": 1.63727959697733e-06, |
|
"loss": 1.4558, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8413098236775819, |
|
"grad_norm": 0.4605620205402374, |
|
"learning_rate": 1.6120906801007557e-06, |
|
"loss": 1.5238, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8438287153652393, |
|
"grad_norm": 0.6928207874298096, |
|
"learning_rate": 1.5869017632241814e-06, |
|
"loss": 1.51, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8463476070528967, |
|
"grad_norm": 0.48179909586906433, |
|
"learning_rate": 1.5617128463476073e-06, |
|
"loss": 1.5368, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8488664987405542, |
|
"grad_norm": 0.5029130578041077, |
|
"learning_rate": 1.536523929471033e-06, |
|
"loss": 1.4563, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.8513853904282116, |
|
"grad_norm": 0.4608486294746399, |
|
"learning_rate": 1.5113350125944587e-06, |
|
"loss": 1.463, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.853904282115869, |
|
"grad_norm": 0.5182480216026306, |
|
"learning_rate": 1.4861460957178842e-06, |
|
"loss": 1.465, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8564231738035264, |
|
"grad_norm": 0.4644806385040283, |
|
"learning_rate": 1.46095717884131e-06, |
|
"loss": 1.4987, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8589420654911839, |
|
"grad_norm": 0.4732770323753357, |
|
"learning_rate": 1.4357682619647356e-06, |
|
"loss": 1.5133, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.8614609571788413, |
|
"grad_norm": 0.5835548043251038, |
|
"learning_rate": 1.4105793450881613e-06, |
|
"loss": 1.5233, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8639798488664987, |
|
"grad_norm": 0.45620298385620117, |
|
"learning_rate": 1.385390428211587e-06, |
|
"loss": 1.4727, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.8664987405541562, |
|
"grad_norm": 0.4693787395954132, |
|
"learning_rate": 1.3602015113350127e-06, |
|
"loss": 1.4706, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8690176322418136, |
|
"grad_norm": 0.6238934993743896, |
|
"learning_rate": 1.3350125944584382e-06, |
|
"loss": 1.5022, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.871536523929471, |
|
"grad_norm": 0.5140495896339417, |
|
"learning_rate": 1.309823677581864e-06, |
|
"loss": 1.4581, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.8740554156171285, |
|
"grad_norm": 0.6451770663261414, |
|
"learning_rate": 1.2846347607052897e-06, |
|
"loss": 1.523, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.8765743073047859, |
|
"grad_norm": 0.5394758582115173, |
|
"learning_rate": 1.2594458438287156e-06, |
|
"loss": 1.4815, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8790931989924433, |
|
"grad_norm": 0.4751567840576172, |
|
"learning_rate": 1.2342569269521413e-06, |
|
"loss": 1.4666, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.8816120906801007, |
|
"grad_norm": 0.5158999562263489, |
|
"learning_rate": 1.2090680100755668e-06, |
|
"loss": 1.477, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8841309823677582, |
|
"grad_norm": 0.47987380623817444, |
|
"learning_rate": 1.1838790931989925e-06, |
|
"loss": 1.4751, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.8866498740554156, |
|
"grad_norm": 0.45010906457901, |
|
"learning_rate": 1.1586901763224184e-06, |
|
"loss": 1.4935, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.889168765743073, |
|
"grad_norm": 0.4675264060497284, |
|
"learning_rate": 1.1335012594458439e-06, |
|
"loss": 1.4767, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.8916876574307305, |
|
"grad_norm": 0.4817536175251007, |
|
"learning_rate": 1.1083123425692696e-06, |
|
"loss": 1.5079, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.8942065491183879, |
|
"grad_norm": 0.5326683521270752, |
|
"learning_rate": 1.0831234256926953e-06, |
|
"loss": 1.4643, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.8967254408060453, |
|
"grad_norm": 0.45862582325935364, |
|
"learning_rate": 1.057934508816121e-06, |
|
"loss": 1.4784, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8992443324937027, |
|
"grad_norm": 0.4639340937137604, |
|
"learning_rate": 1.0327455919395467e-06, |
|
"loss": 1.4669, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.9017632241813602, |
|
"grad_norm": 0.5519356727600098, |
|
"learning_rate": 1.0075566750629724e-06, |
|
"loss": 1.4962, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.9042821158690176, |
|
"grad_norm": 0.5423635244369507, |
|
"learning_rate": 9.823677581863981e-07, |
|
"loss": 1.5149, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.906801007556675, |
|
"grad_norm": 0.4961482286453247, |
|
"learning_rate": 9.571788413098238e-07, |
|
"loss": 1.4841, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9093198992443325, |
|
"grad_norm": 0.5558215379714966, |
|
"learning_rate": 9.319899244332495e-07, |
|
"loss": 1.4672, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9118387909319899, |
|
"grad_norm": 0.47575876116752625, |
|
"learning_rate": 9.068010075566751e-07, |
|
"loss": 1.5035, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9143576826196473, |
|
"grad_norm": 0.44151756167411804, |
|
"learning_rate": 8.816120906801008e-07, |
|
"loss": 1.4923, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9168765743073047, |
|
"grad_norm": 0.49502983689308167, |
|
"learning_rate": 8.564231738035265e-07, |
|
"loss": 1.4872, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9193954659949622, |
|
"grad_norm": 0.4563881456851959, |
|
"learning_rate": 8.312342569269521e-07, |
|
"loss": 1.5022, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.9219143576826196, |
|
"grad_norm": 0.4814889132976532, |
|
"learning_rate": 8.060453400503778e-07, |
|
"loss": 1.4922, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.924433249370277, |
|
"grad_norm": 0.44825509190559387, |
|
"learning_rate": 7.808564231738037e-07, |
|
"loss": 1.4695, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9269521410579346, |
|
"grad_norm": 0.46482357382774353, |
|
"learning_rate": 7.556675062972294e-07, |
|
"loss": 1.4943, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.929471032745592, |
|
"grad_norm": 0.5883563160896301, |
|
"learning_rate": 7.30478589420655e-07, |
|
"loss": 1.4658, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9319899244332494, |
|
"grad_norm": 0.6148042678833008, |
|
"learning_rate": 7.052896725440807e-07, |
|
"loss": 1.4528, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9345088161209067, |
|
"grad_norm": 0.4770396649837494, |
|
"learning_rate": 6.801007556675064e-07, |
|
"loss": 1.4914, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9370277078085643, |
|
"grad_norm": 0.46335241198539734, |
|
"learning_rate": 6.54911838790932e-07, |
|
"loss": 1.5172, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9395465994962217, |
|
"grad_norm": 0.46679455041885376, |
|
"learning_rate": 6.297229219143578e-07, |
|
"loss": 1.4426, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9420654911838791, |
|
"grad_norm": 0.5507463216781616, |
|
"learning_rate": 6.045340050377834e-07, |
|
"loss": 1.5067, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9445843828715366, |
|
"grad_norm": 0.468250572681427, |
|
"learning_rate": 5.793450881612092e-07, |
|
"loss": 1.5105, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.947103274559194, |
|
"grad_norm": 0.6048943996429443, |
|
"learning_rate": 5.541561712846348e-07, |
|
"loss": 1.4814, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9496221662468514, |
|
"grad_norm": 0.4735409617424011, |
|
"learning_rate": 5.289672544080605e-07, |
|
"loss": 1.4739, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9521410579345088, |
|
"grad_norm": 0.5519718527793884, |
|
"learning_rate": 5.037783375314862e-07, |
|
"loss": 1.5022, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9546599496221663, |
|
"grad_norm": 0.4825071692466736, |
|
"learning_rate": 4.785894206549119e-07, |
|
"loss": 1.4977, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9571788413098237, |
|
"grad_norm": 0.44791093468666077, |
|
"learning_rate": 4.5340050377833756e-07, |
|
"loss": 1.4912, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9596977329974811, |
|
"grad_norm": 0.6440786719322205, |
|
"learning_rate": 4.2821158690176327e-07, |
|
"loss": 1.4602, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9622166246851386, |
|
"grad_norm": 0.4575777053833008, |
|
"learning_rate": 4.030226700251889e-07, |
|
"loss": 1.4833, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.964735516372796, |
|
"grad_norm": 0.47071707248687744, |
|
"learning_rate": 3.778337531486147e-07, |
|
"loss": 1.4963, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9672544080604534, |
|
"grad_norm": 0.6902024745941162, |
|
"learning_rate": 3.5264483627204033e-07, |
|
"loss": 1.4699, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9697732997481109, |
|
"grad_norm": 0.48268118500709534, |
|
"learning_rate": 3.27455919395466e-07, |
|
"loss": 1.472, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9722921914357683, |
|
"grad_norm": 0.4497368335723877, |
|
"learning_rate": 3.022670025188917e-07, |
|
"loss": 1.4654, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9748110831234257, |
|
"grad_norm": 0.5587329864501953, |
|
"learning_rate": 2.770780856423174e-07, |
|
"loss": 1.5351, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.9773299748110831, |
|
"grad_norm": 0.5236759185791016, |
|
"learning_rate": 2.518891687657431e-07, |
|
"loss": 1.4955, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9798488664987406, |
|
"grad_norm": 0.4622642397880554, |
|
"learning_rate": 2.2670025188916878e-07, |
|
"loss": 1.4956, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.982367758186398, |
|
"grad_norm": 0.4652063548564911, |
|
"learning_rate": 2.0151133501259446e-07, |
|
"loss": 1.4875, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9848866498740554, |
|
"grad_norm": 0.44629859924316406, |
|
"learning_rate": 1.7632241813602017e-07, |
|
"loss": 1.4543, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.9874055415617129, |
|
"grad_norm": 0.45472198724746704, |
|
"learning_rate": 1.5113350125944585e-07, |
|
"loss": 1.5048, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9899244332493703, |
|
"grad_norm": 0.4791916608810425, |
|
"learning_rate": 1.2594458438287155e-07, |
|
"loss": 1.4998, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.9924433249370277, |
|
"grad_norm": 0.45487239956855774, |
|
"learning_rate": 1.0075566750629723e-07, |
|
"loss": 1.5058, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9949622166246851, |
|
"grad_norm": 0.5730354189872742, |
|
"learning_rate": 7.556675062972292e-08, |
|
"loss": 1.5314, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.9974811083123426, |
|
"grad_norm": 0.47194746136665344, |
|
"learning_rate": 5.0377833753148615e-08, |
|
"loss": 1.5077, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5024914741516113, |
|
"learning_rate": 2.5188916876574308e-08, |
|
"loss": 1.4974, |
|
"step": 397 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 397, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.840368526032896e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|