{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 397,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025188916876574307,
"grad_norm": 1.420465350151062,
"learning_rate": 1e-05,
"loss": 2.5777,
"step": 1
},
{
"epoch": 0.005037783375314861,
"grad_norm": 1.4124211072921753,
"learning_rate": 9.974811083123427e-06,
"loss": 2.5574,
"step": 2
},
{
"epoch": 0.007556675062972292,
"grad_norm": 1.444077730178833,
"learning_rate": 9.949622166246852e-06,
"loss": 2.7149,
"step": 3
},
{
"epoch": 0.010075566750629723,
"grad_norm": 1.2692691087722778,
"learning_rate": 9.924433249370277e-06,
"loss": 2.4942,
"step": 4
},
{
"epoch": 0.012594458438287154,
"grad_norm": 1.2546937465667725,
"learning_rate": 9.899244332493704e-06,
"loss": 2.5284,
"step": 5
},
{
"epoch": 0.015113350125944584,
"grad_norm": 1.2006076574325562,
"learning_rate": 9.87405541561713e-06,
"loss": 2.5203,
"step": 6
},
{
"epoch": 0.017632241813602016,
"grad_norm": 1.1375973224639893,
"learning_rate": 9.848866498740555e-06,
"loss": 2.4494,
"step": 7
},
{
"epoch": 0.020151133501259445,
"grad_norm": 1.0649913549423218,
"learning_rate": 9.82367758186398e-06,
"loss": 2.4138,
"step": 8
},
{
"epoch": 0.022670025188916875,
"grad_norm": 1.0274866819381714,
"learning_rate": 9.798488664987406e-06,
"loss": 2.3557,
"step": 9
},
{
"epoch": 0.02518891687657431,
"grad_norm": 1.0478529930114746,
"learning_rate": 9.773299748110831e-06,
"loss": 2.4614,
"step": 10
},
{
"epoch": 0.027707808564231738,
"grad_norm": 0.9700673818588257,
"learning_rate": 9.748110831234258e-06,
"loss": 2.4212,
"step": 11
},
{
"epoch": 0.030226700251889168,
"grad_norm": 0.8414812684059143,
"learning_rate": 9.722921914357684e-06,
"loss": 2.2299,
"step": 12
},
{
"epoch": 0.0327455919395466,
"grad_norm": 0.8956544399261475,
"learning_rate": 9.69773299748111e-06,
"loss": 2.3443,
"step": 13
},
{
"epoch": 0.03526448362720403,
"grad_norm": 0.9195625185966492,
"learning_rate": 9.672544080604534e-06,
"loss": 2.2813,
"step": 14
},
{
"epoch": 0.037783375314861464,
"grad_norm": 0.8406645655632019,
"learning_rate": 9.64735516372796e-06,
"loss": 2.2909,
"step": 15
},
{
"epoch": 0.04030226700251889,
"grad_norm": 0.8406001925468445,
"learning_rate": 9.622166246851387e-06,
"loss": 2.3022,
"step": 16
},
{
"epoch": 0.042821158690176324,
"grad_norm": 0.8053434491157532,
"learning_rate": 9.596977329974812e-06,
"loss": 2.2592,
"step": 17
},
{
"epoch": 0.04534005037783375,
"grad_norm": 0.8638896346092224,
"learning_rate": 9.571788413098237e-06,
"loss": 2.3171,
"step": 18
},
{
"epoch": 0.04785894206549118,
"grad_norm": 0.8893205523490906,
"learning_rate": 9.546599496221664e-06,
"loss": 2.2565,
"step": 19
},
{
"epoch": 0.05037783375314862,
"grad_norm": 0.7514384984970093,
"learning_rate": 9.521410579345088e-06,
"loss": 2.1959,
"step": 20
},
{
"epoch": 0.05289672544080604,
"grad_norm": 0.7832961678504944,
"learning_rate": 9.496221662468515e-06,
"loss": 2.2186,
"step": 21
},
{
"epoch": 0.055415617128463476,
"grad_norm": 0.7781046628952026,
"learning_rate": 9.47103274559194e-06,
"loss": 2.207,
"step": 22
},
{
"epoch": 0.05793450881612091,
"grad_norm": 0.7359276413917542,
"learning_rate": 9.445843828715366e-06,
"loss": 2.1479,
"step": 23
},
{
"epoch": 0.060453400503778336,
"grad_norm": 0.7263805866241455,
"learning_rate": 9.420654911838791e-06,
"loss": 2.1799,
"step": 24
},
{
"epoch": 0.06297229219143577,
"grad_norm": 0.6834078431129456,
"learning_rate": 9.395465994962218e-06,
"loss": 2.1216,
"step": 25
},
{
"epoch": 0.0654911838790932,
"grad_norm": 0.6694800853729248,
"learning_rate": 9.370277078085643e-06,
"loss": 2.0769,
"step": 26
},
{
"epoch": 0.06801007556675064,
"grad_norm": 0.6812991499900818,
"learning_rate": 9.345088161209067e-06,
"loss": 2.146,
"step": 27
},
{
"epoch": 0.07052896725440806,
"grad_norm": 0.6379550695419312,
"learning_rate": 9.319899244332494e-06,
"loss": 2.0901,
"step": 28
},
{
"epoch": 0.07304785894206549,
"grad_norm": 0.6825947761535645,
"learning_rate": 9.29471032745592e-06,
"loss": 2.1533,
"step": 29
},
{
"epoch": 0.07556675062972293,
"grad_norm": 0.7910833954811096,
"learning_rate": 9.269521410579347e-06,
"loss": 2.1828,
"step": 30
},
{
"epoch": 0.07808564231738035,
"grad_norm": 0.6861229538917542,
"learning_rate": 9.244332493702772e-06,
"loss": 2.1502,
"step": 31
},
{
"epoch": 0.08060453400503778,
"grad_norm": 0.6285768747329712,
"learning_rate": 9.219143576826197e-06,
"loss": 2.1031,
"step": 32
},
{
"epoch": 0.08312342569269521,
"grad_norm": 0.6474770903587341,
"learning_rate": 9.193954659949623e-06,
"loss": 2.087,
"step": 33
},
{
"epoch": 0.08564231738035265,
"grad_norm": 0.5884003043174744,
"learning_rate": 9.168765743073048e-06,
"loss": 2.0418,
"step": 34
},
{
"epoch": 0.08816120906801007,
"grad_norm": 0.5800574421882629,
"learning_rate": 9.143576826196475e-06,
"loss": 2.0484,
"step": 35
},
{
"epoch": 0.0906801007556675,
"grad_norm": 0.5606217980384827,
"learning_rate": 9.1183879093199e-06,
"loss": 2.0026,
"step": 36
},
{
"epoch": 0.09319899244332494,
"grad_norm": 0.6527896523475647,
"learning_rate": 9.093198992443326e-06,
"loss": 1.9611,
"step": 37
},
{
"epoch": 0.09571788413098237,
"grad_norm": 0.5732287764549255,
"learning_rate": 9.068010075566751e-06,
"loss": 2.0348,
"step": 38
},
{
"epoch": 0.0982367758186398,
"grad_norm": 0.5753059387207031,
"learning_rate": 9.042821158690178e-06,
"loss": 2.0062,
"step": 39
},
{
"epoch": 0.10075566750629723,
"grad_norm": 0.5425299406051636,
"learning_rate": 9.017632241813602e-06,
"loss": 1.9781,
"step": 40
},
{
"epoch": 0.10327455919395466,
"grad_norm": 0.5520154237747192,
"learning_rate": 8.992443324937027e-06,
"loss": 1.9927,
"step": 41
},
{
"epoch": 0.10579345088161209,
"grad_norm": 0.5321075320243835,
"learning_rate": 8.967254408060454e-06,
"loss": 1.9715,
"step": 42
},
{
"epoch": 0.10831234256926953,
"grad_norm": 0.5192540287971497,
"learning_rate": 8.94206549118388e-06,
"loss": 1.9771,
"step": 43
},
{
"epoch": 0.11083123425692695,
"grad_norm": 0.5216296315193176,
"learning_rate": 8.916876574307305e-06,
"loss": 1.9554,
"step": 44
},
{
"epoch": 0.11335012594458438,
"grad_norm": 0.5138005614280701,
"learning_rate": 8.89168765743073e-06,
"loss": 1.9501,
"step": 45
},
{
"epoch": 0.11586901763224182,
"grad_norm": 0.5473687052726746,
"learning_rate": 8.866498740554157e-06,
"loss": 1.9943,
"step": 46
},
{
"epoch": 0.11838790931989925,
"grad_norm": 0.5291565656661987,
"learning_rate": 8.841309823677583e-06,
"loss": 1.9401,
"step": 47
},
{
"epoch": 0.12090680100755667,
"grad_norm": 0.5129333734512329,
"learning_rate": 8.816120906801008e-06,
"loss": 1.9557,
"step": 48
},
{
"epoch": 0.12342569269521411,
"grad_norm": 0.5359098315238953,
"learning_rate": 8.790931989924435e-06,
"loss": 1.9787,
"step": 49
},
{
"epoch": 0.12594458438287154,
"grad_norm": 0.4913354814052582,
"learning_rate": 8.76574307304786e-06,
"loss": 1.9198,
"step": 50
},
{
"epoch": 0.12846347607052896,
"grad_norm": 0.4875161647796631,
"learning_rate": 8.740554156171286e-06,
"loss": 1.9497,
"step": 51
},
{
"epoch": 0.1309823677581864,
"grad_norm": 0.47248420119285583,
"learning_rate": 8.715365239294711e-06,
"loss": 1.8747,
"step": 52
},
{
"epoch": 0.13350125944584382,
"grad_norm": 0.48350995779037476,
"learning_rate": 8.690176322418138e-06,
"loss": 1.8919,
"step": 53
},
{
"epoch": 0.13602015113350127,
"grad_norm": 0.48570191860198975,
"learning_rate": 8.664987405541562e-06,
"loss": 1.8958,
"step": 54
},
{
"epoch": 0.1385390428211587,
"grad_norm": 0.47888582944869995,
"learning_rate": 8.639798488664987e-06,
"loss": 1.8924,
"step": 55
},
{
"epoch": 0.14105793450881612,
"grad_norm": 0.4759175479412079,
"learning_rate": 8.614609571788414e-06,
"loss": 1.8559,
"step": 56
},
{
"epoch": 0.14357682619647355,
"grad_norm": 0.47866225242614746,
"learning_rate": 8.58942065491184e-06,
"loss": 1.8297,
"step": 57
},
{
"epoch": 0.14609571788413098,
"grad_norm": 0.47261252999305725,
"learning_rate": 8.564231738035265e-06,
"loss": 1.9205,
"step": 58
},
{
"epoch": 0.1486146095717884,
"grad_norm": 0.4570164978504181,
"learning_rate": 8.53904282115869e-06,
"loss": 1.8286,
"step": 59
},
{
"epoch": 0.15113350125944586,
"grad_norm": 0.45629221200942993,
"learning_rate": 8.513853904282117e-06,
"loss": 1.8307,
"step": 60
},
{
"epoch": 0.15365239294710328,
"grad_norm": 0.4506438374519348,
"learning_rate": 8.488664987405543e-06,
"loss": 1.8264,
"step": 61
},
{
"epoch": 0.1561712846347607,
"grad_norm": 0.46889957785606384,
"learning_rate": 8.463476070528968e-06,
"loss": 1.8619,
"step": 62
},
{
"epoch": 0.15869017632241814,
"grad_norm": 0.4415088891983032,
"learning_rate": 8.438287153652393e-06,
"loss": 1.8088,
"step": 63
},
{
"epoch": 0.16120906801007556,
"grad_norm": 0.6827300786972046,
"learning_rate": 8.41309823677582e-06,
"loss": 1.8091,
"step": 64
},
{
"epoch": 0.163727959697733,
"grad_norm": 0.4396965503692627,
"learning_rate": 8.387909319899244e-06,
"loss": 1.8235,
"step": 65
},
{
"epoch": 0.16624685138539042,
"grad_norm": 0.4572596549987793,
"learning_rate": 8.36272040302267e-06,
"loss": 1.8473,
"step": 66
},
{
"epoch": 0.16876574307304787,
"grad_norm": 0.47288888692855835,
"learning_rate": 8.337531486146096e-06,
"loss": 1.7847,
"step": 67
},
{
"epoch": 0.1712846347607053,
"grad_norm": 0.42984092235565186,
"learning_rate": 8.312342569269522e-06,
"loss": 1.8235,
"step": 68
},
{
"epoch": 0.17380352644836272,
"grad_norm": 0.4297022521495819,
"learning_rate": 8.287153652392947e-06,
"loss": 1.7944,
"step": 69
},
{
"epoch": 0.17632241813602015,
"grad_norm": 0.44730344414711,
"learning_rate": 8.261964735516374e-06,
"loss": 1.8026,
"step": 70
},
{
"epoch": 0.17884130982367757,
"grad_norm": 0.45562756061553955,
"learning_rate": 8.2367758186398e-06,
"loss": 1.8084,
"step": 71
},
{
"epoch": 0.181360201511335,
"grad_norm": 0.43180692195892334,
"learning_rate": 8.211586901763225e-06,
"loss": 1.804,
"step": 72
},
{
"epoch": 0.18387909319899245,
"grad_norm": 0.4151434302330017,
"learning_rate": 8.18639798488665e-06,
"loss": 1.745,
"step": 73
},
{
"epoch": 0.18639798488664988,
"grad_norm": 0.42020657658576965,
"learning_rate": 8.161209068010076e-06,
"loss": 1.729,
"step": 74
},
{
"epoch": 0.1889168765743073,
"grad_norm": 0.4290010631084442,
"learning_rate": 8.136020151133503e-06,
"loss": 1.7916,
"step": 75
},
{
"epoch": 0.19143576826196473,
"grad_norm": 0.4147432744503021,
"learning_rate": 8.110831234256928e-06,
"loss": 1.7515,
"step": 76
},
{
"epoch": 0.19395465994962216,
"grad_norm": 0.4140765964984894,
"learning_rate": 8.085642317380353e-06,
"loss": 1.7494,
"step": 77
},
{
"epoch": 0.1964735516372796,
"grad_norm": 0.42620202898979187,
"learning_rate": 8.06045340050378e-06,
"loss": 1.7654,
"step": 78
},
{
"epoch": 0.19899244332493704,
"grad_norm": 0.46347954869270325,
"learning_rate": 8.035264483627204e-06,
"loss": 1.7229,
"step": 79
},
{
"epoch": 0.20151133501259447,
"grad_norm": 0.44873306155204773,
"learning_rate": 8.01007556675063e-06,
"loss": 1.7153,
"step": 80
},
{
"epoch": 0.2040302267002519,
"grad_norm": 0.46032124757766724,
"learning_rate": 7.984886649874056e-06,
"loss": 1.7187,
"step": 81
},
{
"epoch": 0.20654911838790932,
"grad_norm": 0.40681278705596924,
"learning_rate": 7.959697732997482e-06,
"loss": 1.7562,
"step": 82
},
{
"epoch": 0.20906801007556675,
"grad_norm": 0.4236059784889221,
"learning_rate": 7.934508816120907e-06,
"loss": 1.7158,
"step": 83
},
{
"epoch": 0.21158690176322417,
"grad_norm": 0.42215806245803833,
"learning_rate": 7.909319899244334e-06,
"loss": 1.7113,
"step": 84
},
{
"epoch": 0.2141057934508816,
"grad_norm": 0.4768604040145874,
"learning_rate": 7.884130982367758e-06,
"loss": 1.7382,
"step": 85
},
{
"epoch": 0.21662468513853905,
"grad_norm": 0.5776296854019165,
"learning_rate": 7.858942065491185e-06,
"loss": 1.7144,
"step": 86
},
{
"epoch": 0.21914357682619648,
"grad_norm": 0.42450252175331116,
"learning_rate": 7.83375314861461e-06,
"loss": 1.7112,
"step": 87
},
{
"epoch": 0.2216624685138539,
"grad_norm": 0.5352158546447754,
"learning_rate": 7.808564231738036e-06,
"loss": 1.7534,
"step": 88
},
{
"epoch": 0.22418136020151133,
"grad_norm": 0.42181944847106934,
"learning_rate": 7.783375314861463e-06,
"loss": 1.6889,
"step": 89
},
{
"epoch": 0.22670025188916876,
"grad_norm": 0.41291770339012146,
"learning_rate": 7.758186397984888e-06,
"loss": 1.7026,
"step": 90
},
{
"epoch": 0.22921914357682618,
"grad_norm": 0.4190363883972168,
"learning_rate": 7.732997481108313e-06,
"loss": 1.736,
"step": 91
},
{
"epoch": 0.23173803526448364,
"grad_norm": 0.48135876655578613,
"learning_rate": 7.70780856423174e-06,
"loss": 1.732,
"step": 92
},
{
"epoch": 0.23425692695214106,
"grad_norm": 0.4111890494823456,
"learning_rate": 7.682619647355164e-06,
"loss": 1.7054,
"step": 93
},
{
"epoch": 0.2367758186397985,
"grad_norm": 0.5191890597343445,
"learning_rate": 7.65743073047859e-06,
"loss": 1.7147,
"step": 94
},
{
"epoch": 0.23929471032745592,
"grad_norm": 0.4190960228443146,
"learning_rate": 7.632241813602015e-06,
"loss": 1.7128,
"step": 95
},
{
"epoch": 0.24181360201511334,
"grad_norm": 0.4258028566837311,
"learning_rate": 7.607052896725441e-06,
"loss": 1.7264,
"step": 96
},
{
"epoch": 0.24433249370277077,
"grad_norm": 0.4177513122558594,
"learning_rate": 7.581863979848867e-06,
"loss": 1.6779,
"step": 97
},
{
"epoch": 0.24685138539042822,
"grad_norm": 0.4666061997413635,
"learning_rate": 7.5566750629722926e-06,
"loss": 1.6453,
"step": 98
},
{
"epoch": 0.24937027707808565,
"grad_norm": 0.4100574553012848,
"learning_rate": 7.531486146095719e-06,
"loss": 1.6963,
"step": 99
},
{
"epoch": 0.2518891687657431,
"grad_norm": 0.4570634067058563,
"learning_rate": 7.506297229219144e-06,
"loss": 1.7032,
"step": 100
},
{
"epoch": 0.25440806045340053,
"grad_norm": 0.42653965950012207,
"learning_rate": 7.48110831234257e-06,
"loss": 1.6624,
"step": 101
},
{
"epoch": 0.25692695214105793,
"grad_norm": 0.4480111300945282,
"learning_rate": 7.455919395465996e-06,
"loss": 1.7011,
"step": 102
},
{
"epoch": 0.2594458438287154,
"grad_norm": 0.4271489977836609,
"learning_rate": 7.430730478589421e-06,
"loss": 1.6716,
"step": 103
},
{
"epoch": 0.2619647355163728,
"grad_norm": 0.41798117756843567,
"learning_rate": 7.405541561712847e-06,
"loss": 1.6759,
"step": 104
},
{
"epoch": 0.26448362720403024,
"grad_norm": 0.40775346755981445,
"learning_rate": 7.3803526448362725e-06,
"loss": 1.6778,
"step": 105
},
{
"epoch": 0.26700251889168763,
"grad_norm": 0.41410258412361145,
"learning_rate": 7.355163727959699e-06,
"loss": 1.6924,
"step": 106
},
{
"epoch": 0.2695214105793451,
"grad_norm": 0.42046648263931274,
"learning_rate": 7.329974811083124e-06,
"loss": 1.6468,
"step": 107
},
{
"epoch": 0.27204030226700254,
"grad_norm": 0.4312984347343445,
"learning_rate": 7.30478589420655e-06,
"loss": 1.7088,
"step": 108
},
{
"epoch": 0.27455919395465994,
"grad_norm": 0.4318784475326538,
"learning_rate": 7.279596977329975e-06,
"loss": 1.6773,
"step": 109
},
{
"epoch": 0.2770780856423174,
"grad_norm": 0.4189915359020233,
"learning_rate": 7.254408060453401e-06,
"loss": 1.6194,
"step": 110
},
{
"epoch": 0.2795969773299748,
"grad_norm": 0.5198895931243896,
"learning_rate": 7.229219143576827e-06,
"loss": 1.6848,
"step": 111
},
{
"epoch": 0.28211586901763225,
"grad_norm": 0.5195222496986389,
"learning_rate": 7.2040302267002524e-06,
"loss": 1.6813,
"step": 112
},
{
"epoch": 0.28463476070528965,
"grad_norm": 0.45624077320098877,
"learning_rate": 7.178841309823679e-06,
"loss": 1.6448,
"step": 113
},
{
"epoch": 0.2871536523929471,
"grad_norm": 0.49435746669769287,
"learning_rate": 7.153652392947104e-06,
"loss": 1.6584,
"step": 114
},
{
"epoch": 0.28967254408060455,
"grad_norm": 0.4301837086677551,
"learning_rate": 7.1284634760705296e-06,
"loss": 1.6652,
"step": 115
},
{
"epoch": 0.29219143576826195,
"grad_norm": 0.4709468185901642,
"learning_rate": 7.103274559193955e-06,
"loss": 1.6371,
"step": 116
},
{
"epoch": 0.2947103274559194,
"grad_norm": 0.45211878418922424,
"learning_rate": 7.07808564231738e-06,
"loss": 1.6672,
"step": 117
},
{
"epoch": 0.2972292191435768,
"grad_norm": 0.4376428723335266,
"learning_rate": 7.052896725440807e-06,
"loss": 1.695,
"step": 118
},
{
"epoch": 0.29974811083123426,
"grad_norm": 0.4894670844078064,
"learning_rate": 7.027707808564232e-06,
"loss": 1.6617,
"step": 119
},
{
"epoch": 0.3022670025188917,
"grad_norm": 0.454942911863327,
"learning_rate": 7.002518891687659e-06,
"loss": 1.5997,
"step": 120
},
{
"epoch": 0.3047858942065491,
"grad_norm": 0.547237753868103,
"learning_rate": 6.977329974811084e-06,
"loss": 1.6714,
"step": 121
},
{
"epoch": 0.30730478589420657,
"grad_norm": 0.43378302454948425,
"learning_rate": 6.9521410579345095e-06,
"loss": 1.6393,
"step": 122
},
{
"epoch": 0.30982367758186397,
"grad_norm": 0.43780213594436646,
"learning_rate": 6.926952141057935e-06,
"loss": 1.6717,
"step": 123
},
{
"epoch": 0.3123425692695214,
"grad_norm": 0.4194709062576294,
"learning_rate": 6.90176322418136e-06,
"loss": 1.6464,
"step": 124
},
{
"epoch": 0.3148614609571788,
"grad_norm": 0.42024093866348267,
"learning_rate": 6.876574307304787e-06,
"loss": 1.6275,
"step": 125
},
{
"epoch": 0.31738035264483627,
"grad_norm": 0.4303475022315979,
"learning_rate": 6.851385390428212e-06,
"loss": 1.6375,
"step": 126
},
{
"epoch": 0.3198992443324937,
"grad_norm": 0.42517420649528503,
"learning_rate": 6.826196473551638e-06,
"loss": 1.6363,
"step": 127
},
{
"epoch": 0.3224181360201511,
"grad_norm": 0.42485958337783813,
"learning_rate": 6.801007556675063e-06,
"loss": 1.6302,
"step": 128
},
{
"epoch": 0.3249370277078086,
"grad_norm": 0.4118500053882599,
"learning_rate": 6.7758186397984894e-06,
"loss": 1.6174,
"step": 129
},
{
"epoch": 0.327455919395466,
"grad_norm": 0.4531554579734802,
"learning_rate": 6.750629722921915e-06,
"loss": 1.6283,
"step": 130
},
{
"epoch": 0.32997481108312343,
"grad_norm": 0.41642168164253235,
"learning_rate": 6.72544080604534e-06,
"loss": 1.6328,
"step": 131
},
{
"epoch": 0.33249370277078083,
"grad_norm": 0.4234478771686554,
"learning_rate": 6.7002518891687666e-06,
"loss": 1.6393,
"step": 132
},
{
"epoch": 0.3350125944584383,
"grad_norm": 0.4314388930797577,
"learning_rate": 6.675062972292192e-06,
"loss": 1.6371,
"step": 133
},
{
"epoch": 0.33753148614609574,
"grad_norm": 0.49057596921920776,
"learning_rate": 6.649874055415617e-06,
"loss": 1.6777,
"step": 134
},
{
"epoch": 0.34005037783375314,
"grad_norm": 0.6537044644355774,
"learning_rate": 6.624685138539043e-06,
"loss": 1.6021,
"step": 135
},
{
"epoch": 0.3425692695214106,
"grad_norm": 0.44913142919540405,
"learning_rate": 6.599496221662469e-06,
"loss": 1.6369,
"step": 136
},
{
"epoch": 0.345088161209068,
"grad_norm": 0.41060981154441833,
"learning_rate": 6.574307304785895e-06,
"loss": 1.6001,
"step": 137
},
{
"epoch": 0.34760705289672544,
"grad_norm": 0.44041162729263306,
"learning_rate": 6.54911838790932e-06,
"loss": 1.6352,
"step": 138
},
{
"epoch": 0.3501259445843829,
"grad_norm": 0.42245715856552124,
"learning_rate": 6.5239294710327465e-06,
"loss": 1.6195,
"step": 139
},
{
"epoch": 0.3526448362720403,
"grad_norm": 0.4138522446155548,
"learning_rate": 6.498740554156172e-06,
"loss": 1.6254,
"step": 140
},
{
"epoch": 0.35516372795969775,
"grad_norm": 0.42503440380096436,
"learning_rate": 6.473551637279597e-06,
"loss": 1.6187,
"step": 141
},
{
"epoch": 0.35768261964735515,
"grad_norm": 0.5783386826515198,
"learning_rate": 6.448362720403023e-06,
"loss": 1.6704,
"step": 142
},
{
"epoch": 0.3602015113350126,
"grad_norm": 0.4822537302970886,
"learning_rate": 6.423173803526449e-06,
"loss": 1.5762,
"step": 143
},
{
"epoch": 0.36272040302267,
"grad_norm": 0.43411409854888916,
"learning_rate": 6.397984886649875e-06,
"loss": 1.6067,
"step": 144
},
{
"epoch": 0.36523929471032746,
"grad_norm": 0.43474212288856506,
"learning_rate": 6.3727959697733e-06,
"loss": 1.6198,
"step": 145
},
{
"epoch": 0.3677581863979849,
"grad_norm": 0.4297161400318146,
"learning_rate": 6.347607052896726e-06,
"loss": 1.6004,
"step": 146
},
{
"epoch": 0.3702770780856423,
"grad_norm": 0.7553440928459167,
"learning_rate": 6.322418136020152e-06,
"loss": 1.7129,
"step": 147
},
{
"epoch": 0.37279596977329976,
"grad_norm": 0.4365249574184418,
"learning_rate": 6.297229219143577e-06,
"loss": 1.6186,
"step": 148
},
{
"epoch": 0.37531486146095716,
"grad_norm": 0.4731481373310089,
"learning_rate": 6.272040302267003e-06,
"loss": 1.5799,
"step": 149
},
{
"epoch": 0.3778337531486146,
"grad_norm": 0.44013121724128723,
"learning_rate": 6.246851385390429e-06,
"loss": 1.6117,
"step": 150
},
{
"epoch": 0.380352644836272,
"grad_norm": 0.4383363425731659,
"learning_rate": 6.221662468513855e-06,
"loss": 1.5916,
"step": 151
},
{
"epoch": 0.38287153652392947,
"grad_norm": 0.4586566686630249,
"learning_rate": 6.19647355163728e-06,
"loss": 1.5987,
"step": 152
},
{
"epoch": 0.3853904282115869,
"grad_norm": 0.5225487351417542,
"learning_rate": 6.1712846347607055e-06,
"loss": 1.5497,
"step": 153
},
{
"epoch": 0.3879093198992443,
"grad_norm": 0.4564357399940491,
"learning_rate": 6.146095717884132e-06,
"loss": 1.6016,
"step": 154
},
{
"epoch": 0.3904282115869018,
"grad_norm": 0.49243706464767456,
"learning_rate": 6.120906801007557e-06,
"loss": 1.6004,
"step": 155
},
{
"epoch": 0.3929471032745592,
"grad_norm": 0.6145833134651184,
"learning_rate": 6.095717884130983e-06,
"loss": 1.5891,
"step": 156
},
{
"epoch": 0.3954659949622166,
"grad_norm": 0.4326134920120239,
"learning_rate": 6.070528967254408e-06,
"loss": 1.6108,
"step": 157
},
{
"epoch": 0.3979848866498741,
"grad_norm": 0.45841845870018005,
"learning_rate": 6.045340050377835e-06,
"loss": 1.6044,
"step": 158
},
{
"epoch": 0.4005037783375315,
"grad_norm": 0.5934171676635742,
"learning_rate": 6.02015113350126e-06,
"loss": 1.6273,
"step": 159
},
{
"epoch": 0.40302267002518893,
"grad_norm": 0.5909122824668884,
"learning_rate": 5.9949622166246855e-06,
"loss": 1.5819,
"step": 160
},
{
"epoch": 0.40554156171284633,
"grad_norm": 0.47986772656440735,
"learning_rate": 5.969773299748112e-06,
"loss": 1.6292,
"step": 161
},
{
"epoch": 0.4080604534005038,
"grad_norm": 0.43019899725914,
"learning_rate": 5.944584382871537e-06,
"loss": 1.6219,
"step": 162
},
{
"epoch": 0.4105793450881612,
"grad_norm": 0.44603484869003296,
"learning_rate": 5.919395465994963e-06,
"loss": 1.6177,
"step": 163
},
{
"epoch": 0.41309823677581864,
"grad_norm": 0.6486812233924866,
"learning_rate": 5.894206549118388e-06,
"loss": 1.6169,
"step": 164
},
{
"epoch": 0.4156171284634761,
"grad_norm": 0.4344078600406647,
"learning_rate": 5.869017632241813e-06,
"loss": 1.6131,
"step": 165
},
{
"epoch": 0.4181360201511335,
"grad_norm": 0.4963393211364746,
"learning_rate": 5.84382871536524e-06,
"loss": 1.5649,
"step": 166
},
{
"epoch": 0.42065491183879095,
"grad_norm": 0.4491269588470459,
"learning_rate": 5.818639798488665e-06,
"loss": 1.5886,
"step": 167
},
{
"epoch": 0.42317380352644834,
"grad_norm": 0.44954273104667664,
"learning_rate": 5.793450881612092e-06,
"loss": 1.5514,
"step": 168
},
{
"epoch": 0.4256926952141058,
"grad_norm": 0.5957120060920715,
"learning_rate": 5.768261964735517e-06,
"loss": 1.5656,
"step": 169
},
{
"epoch": 0.4282115869017632,
"grad_norm": 0.4787919223308563,
"learning_rate": 5.7430730478589425e-06,
"loss": 1.5906,
"step": 170
},
{
"epoch": 0.43073047858942065,
"grad_norm": 0.4297046959400177,
"learning_rate": 5.717884130982368e-06,
"loss": 1.5676,
"step": 171
},
{
"epoch": 0.4332493702770781,
"grad_norm": 0.4834885597229004,
"learning_rate": 5.692695214105793e-06,
"loss": 1.5672,
"step": 172
},
{
"epoch": 0.4357682619647355,
"grad_norm": 0.5278275012969971,
"learning_rate": 5.66750629722922e-06,
"loss": 1.5994,
"step": 173
},
{
"epoch": 0.43828715365239296,
"grad_norm": 0.4892403185367584,
"learning_rate": 5.642317380352645e-06,
"loss": 1.5845,
"step": 174
},
{
"epoch": 0.44080604534005036,
"grad_norm": 0.5153166055679321,
"learning_rate": 5.617128463476071e-06,
"loss": 1.5573,
"step": 175
},
{
"epoch": 0.4433249370277078,
"grad_norm": 0.5289381146430969,
"learning_rate": 5.591939546599497e-06,
"loss": 1.5658,
"step": 176
},
{
"epoch": 0.44584382871536526,
"grad_norm": 0.45170825719833374,
"learning_rate": 5.5667506297229225e-06,
"loss": 1.5322,
"step": 177
},
{
"epoch": 0.44836272040302266,
"grad_norm": 0.45414310693740845,
"learning_rate": 5.541561712846348e-06,
"loss": 1.5872,
"step": 178
},
{
"epoch": 0.4508816120906801,
"grad_norm": 0.47673285007476807,
"learning_rate": 5.516372795969773e-06,
"loss": 1.603,
"step": 179
},
{
"epoch": 0.4534005037783375,
"grad_norm": 0.4653848707675934,
"learning_rate": 5.4911838790931996e-06,
"loss": 1.5235,
"step": 180
},
{
"epoch": 0.45591939546599497,
"grad_norm": 0.4475414752960205,
"learning_rate": 5.465994962216625e-06,
"loss": 1.5671,
"step": 181
},
{
"epoch": 0.45843828715365237,
"grad_norm": 0.48499029874801636,
"learning_rate": 5.440806045340051e-06,
"loss": 1.5912,
"step": 182
},
{
"epoch": 0.4609571788413098,
"grad_norm": 0.4531858563423157,
"learning_rate": 5.415617128463476e-06,
"loss": 1.541,
"step": 183
},
{
"epoch": 0.4634760705289673,
"grad_norm": 0.44078829884529114,
"learning_rate": 5.390428211586902e-06,
"loss": 1.583,
"step": 184
},
{
"epoch": 0.4659949622166247,
"grad_norm": 0.47280648350715637,
"learning_rate": 5.365239294710328e-06,
"loss": 1.6233,
"step": 185
},
{
"epoch": 0.46851385390428213,
"grad_norm": 0.5612819194793701,
"learning_rate": 5.340050377833753e-06,
"loss": 1.6078,
"step": 186
},
{
"epoch": 0.47103274559193953,
"grad_norm": 0.4777447283267975,
"learning_rate": 5.3148614609571795e-06,
"loss": 1.5722,
"step": 187
},
{
"epoch": 0.473551637279597,
"grad_norm": 0.49805429577827454,
"learning_rate": 5.289672544080605e-06,
"loss": 1.6243,
"step": 188
},
{
"epoch": 0.4760705289672544,
"grad_norm": 0.4395243525505066,
"learning_rate": 5.264483627204031e-06,
"loss": 1.5497,
"step": 189
},
{
"epoch": 0.47858942065491183,
"grad_norm": 0.7493352890014648,
"learning_rate": 5.239294710327456e-06,
"loss": 1.6466,
"step": 190
},
{
"epoch": 0.4811083123425693,
"grad_norm": 0.5018370747566223,
"learning_rate": 5.214105793450882e-06,
"loss": 1.5492,
"step": 191
},
{
"epoch": 0.4836272040302267,
"grad_norm": 0.4791150391101837,
"learning_rate": 5.188916876574308e-06,
"loss": 1.5679,
"step": 192
},
{
"epoch": 0.48614609571788414,
"grad_norm": 0.4814487099647522,
"learning_rate": 5.163727959697733e-06,
"loss": 1.5595,
"step": 193
},
{
"epoch": 0.48866498740554154,
"grad_norm": 0.44743016362190247,
"learning_rate": 5.138539042821159e-06,
"loss": 1.5971,
"step": 194
},
{
"epoch": 0.491183879093199,
"grad_norm": 0.47840508818626404,
"learning_rate": 5.113350125944585e-06,
"loss": 1.5414,
"step": 195
},
{
"epoch": 0.49370277078085645,
"grad_norm": 0.4497021436691284,
"learning_rate": 5.088161209068011e-06,
"loss": 1.5595,
"step": 196
},
{
"epoch": 0.49622166246851385,
"grad_norm": 0.49746012687683105,
"learning_rate": 5.062972292191436e-06,
"loss": 1.5403,
"step": 197
},
{
"epoch": 0.4987405541561713,
"grad_norm": 0.4701424837112427,
"learning_rate": 5.037783375314862e-06,
"loss": 1.5597,
"step": 198
},
{
"epoch": 0.5012594458438288,
"grad_norm": 0.4464475214481354,
"learning_rate": 5.012594458438288e-06,
"loss": 1.5436,
"step": 199
},
{
"epoch": 0.5037783375314862,
"grad_norm": 0.5158559083938599,
"learning_rate": 4.987405541561714e-06,
"loss": 1.5638,
"step": 200
},
{
"epoch": 0.5062972292191436,
"grad_norm": 0.5568498969078064,
"learning_rate": 4.9622166246851385e-06,
"loss": 1.5968,
"step": 201
},
{
"epoch": 0.5088161209068011,
"grad_norm": 0.4441608488559723,
"learning_rate": 4.937027707808565e-06,
"loss": 1.54,
"step": 202
},
{
"epoch": 0.5113350125944585,
"grad_norm": 0.4909915328025818,
"learning_rate": 4.91183879093199e-06,
"loss": 1.5439,
"step": 203
},
{
"epoch": 0.5138539042821159,
"grad_norm": 0.4911031424999237,
"learning_rate": 4.886649874055416e-06,
"loss": 1.5438,
"step": 204
},
{
"epoch": 0.5163727959697733,
"grad_norm": 0.7304896116256714,
"learning_rate": 4.861460957178842e-06,
"loss": 1.5061,
"step": 205
},
{
"epoch": 0.5188916876574308,
"grad_norm": 0.4542643129825592,
"learning_rate": 4.836272040302267e-06,
"loss": 1.5738,
"step": 206
},
{
"epoch": 0.5214105793450882,
"grad_norm": 0.8241648077964783,
"learning_rate": 4.811083123425694e-06,
"loss": 1.5982,
"step": 207
},
{
"epoch": 0.5239294710327456,
"grad_norm": 0.45886871218681335,
"learning_rate": 4.7858942065491185e-06,
"loss": 1.5594,
"step": 208
},
{
"epoch": 0.5264483627204031,
"grad_norm": 0.5265582799911499,
"learning_rate": 4.760705289672544e-06,
"loss": 1.57,
"step": 209
},
{
"epoch": 0.5289672544080605,
"grad_norm": 0.46276602149009705,
"learning_rate": 4.73551637279597e-06,
"loss": 1.5475,
"step": 210
},
{
"epoch": 0.5314861460957179,
"grad_norm": 0.5516127943992615,
"learning_rate": 4.710327455919396e-06,
"loss": 1.5497,
"step": 211
},
{
"epoch": 0.5340050377833753,
"grad_norm": 0.485507071018219,
"learning_rate": 4.685138539042821e-06,
"loss": 1.5954,
"step": 212
},
{
"epoch": 0.5365239294710328,
"grad_norm": 0.4667035937309265,
"learning_rate": 4.659949622166247e-06,
"loss": 1.5524,
"step": 213
},
{
"epoch": 0.5390428211586902,
"grad_norm": 0.4725947082042694,
"learning_rate": 4.6347607052896736e-06,
"loss": 1.5701,
"step": 214
},
{
"epoch": 0.5415617128463476,
"grad_norm": 0.48055243492126465,
"learning_rate": 4.609571788413098e-06,
"loss": 1.512,
"step": 215
},
{
"epoch": 0.5440806045340051,
"grad_norm": 0.47020798921585083,
"learning_rate": 4.584382871536524e-06,
"loss": 1.517,
"step": 216
},
{
"epoch": 0.5465994962216625,
"grad_norm": 0.458790123462677,
"learning_rate": 4.55919395465995e-06,
"loss": 1.5963,
"step": 217
},
{
"epoch": 0.5491183879093199,
"grad_norm": 0.46757379174232483,
"learning_rate": 4.5340050377833755e-06,
"loss": 1.5307,
"step": 218
},
{
"epoch": 0.5516372795969773,
"grad_norm": 0.48817694187164307,
"learning_rate": 4.508816120906801e-06,
"loss": 1.5096,
"step": 219
},
{
"epoch": 0.5541561712846348,
"grad_norm": 0.46775302290916443,
"learning_rate": 4.483627204030227e-06,
"loss": 1.5081,
"step": 220
},
{
"epoch": 0.5566750629722922,
"grad_norm": 0.4632299244403839,
"learning_rate": 4.458438287153653e-06,
"loss": 1.5274,
"step": 221
},
{
"epoch": 0.5591939546599496,
"grad_norm": 0.6220762729644775,
"learning_rate": 4.433249370277078e-06,
"loss": 1.4909,
"step": 222
},
{
"epoch": 0.5617128463476071,
"grad_norm": 0.4639570713043213,
"learning_rate": 4.408060453400504e-06,
"loss": 1.531,
"step": 223
},
{
"epoch": 0.5642317380352645,
"grad_norm": 0.48596182465553284,
"learning_rate": 4.38287153652393e-06,
"loss": 1.522,
"step": 224
},
{
"epoch": 0.5667506297229219,
"grad_norm": 0.4745020866394043,
"learning_rate": 4.3576826196473555e-06,
"loss": 1.5323,
"step": 225
},
{
"epoch": 0.5692695214105793,
"grad_norm": 0.5056527853012085,
"learning_rate": 4.332493702770781e-06,
"loss": 1.5374,
"step": 226
},
{
"epoch": 0.5717884130982368,
"grad_norm": 0.44245389103889465,
"learning_rate": 4.307304785894207e-06,
"loss": 1.5169,
"step": 227
},
{
"epoch": 0.5743073047858942,
"grad_norm": 0.4938381016254425,
"learning_rate": 4.282115869017633e-06,
"loss": 1.5192,
"step": 228
},
{
"epoch": 0.5768261964735516,
"grad_norm": 0.4689100384712219,
"learning_rate": 4.256926952141058e-06,
"loss": 1.5666,
"step": 229
},
{
"epoch": 0.5793450881612091,
"grad_norm": 0.5333397388458252,
"learning_rate": 4.231738035264484e-06,
"loss": 1.5562,
"step": 230
},
{
"epoch": 0.5818639798488665,
"grad_norm": 0.5024259090423584,
"learning_rate": 4.20654911838791e-06,
"loss": 1.5135,
"step": 231
},
{
"epoch": 0.5843828715365239,
"grad_norm": 0.46757936477661133,
"learning_rate": 4.181360201511335e-06,
"loss": 1.522,
"step": 232
},
{
"epoch": 0.5869017632241813,
"grad_norm": 0.5455654263496399,
"learning_rate": 4.156171284634761e-06,
"loss": 1.5281,
"step": 233
},
{
"epoch": 0.5894206549118388,
"grad_norm": 0.48288044333457947,
"learning_rate": 4.130982367758187e-06,
"loss": 1.5252,
"step": 234
},
{
"epoch": 0.5919395465994962,
"grad_norm": 0.44919902086257935,
"learning_rate": 4.1057934508816125e-06,
"loss": 1.5371,
"step": 235
},
{
"epoch": 0.5944584382871536,
"grad_norm": 0.4358011782169342,
"learning_rate": 4.080604534005038e-06,
"loss": 1.5419,
"step": 236
},
{
"epoch": 0.5969773299748111,
"grad_norm": 0.518595278263092,
"learning_rate": 4.055415617128464e-06,
"loss": 1.538,
"step": 237
},
{
"epoch": 0.5994962216624685,
"grad_norm": 0.6567726135253906,
"learning_rate": 4.03022670025189e-06,
"loss": 1.4867,
"step": 238
},
{
"epoch": 0.6020151133501259,
"grad_norm": 0.48650607466697693,
"learning_rate": 4.005037783375315e-06,
"loss": 1.494,
"step": 239
},
{
"epoch": 0.6045340050377834,
"grad_norm": 0.6559653878211975,
"learning_rate": 3.979848866498741e-06,
"loss": 1.54,
"step": 240
},
{
"epoch": 0.6070528967254408,
"grad_norm": 0.45548874139785767,
"learning_rate": 3.954659949622167e-06,
"loss": 1.5148,
"step": 241
},
{
"epoch": 0.6095717884130982,
"grad_norm": 0.6561994552612305,
"learning_rate": 3.9294710327455925e-06,
"loss": 1.5244,
"step": 242
},
{
"epoch": 0.6120906801007556,
"grad_norm": 0.46143561601638794,
"learning_rate": 3.904282115869018e-06,
"loss": 1.5315,
"step": 243
},
{
"epoch": 0.6146095717884131,
"grad_norm": 0.537300705909729,
"learning_rate": 3.879093198992444e-06,
"loss": 1.5424,
"step": 244
},
{
"epoch": 0.6171284634760705,
"grad_norm": 0.46460816264152527,
"learning_rate": 3.85390428211587e-06,
"loss": 1.4941,
"step": 245
},
{
"epoch": 0.6196473551637279,
"grad_norm": 0.48894399404525757,
"learning_rate": 3.828715365239295e-06,
"loss": 1.5294,
"step": 246
},
{
"epoch": 0.6221662468513854,
"grad_norm": 0.4623178541660309,
"learning_rate": 3.8035264483627206e-06,
"loss": 1.5068,
"step": 247
},
{
"epoch": 0.6246851385390428,
"grad_norm": 0.49979573488235474,
"learning_rate": 3.7783375314861463e-06,
"loss": 1.4801,
"step": 248
},
{
"epoch": 0.6272040302267002,
"grad_norm": 0.5378308296203613,
"learning_rate": 3.753148614609572e-06,
"loss": 1.5444,
"step": 249
},
{
"epoch": 0.6297229219143576,
"grad_norm": 0.5385175347328186,
"learning_rate": 3.727959697732998e-06,
"loss": 1.5249,
"step": 250
},
{
"epoch": 0.6322418136020151,
"grad_norm": 0.46512940526008606,
"learning_rate": 3.7027707808564234e-06,
"loss": 1.5082,
"step": 251
},
{
"epoch": 0.6347607052896725,
"grad_norm": 0.6099820733070374,
"learning_rate": 3.6775818639798495e-06,
"loss": 1.5297,
"step": 252
},
{
"epoch": 0.6372795969773299,
"grad_norm": 0.4563128650188446,
"learning_rate": 3.652392947103275e-06,
"loss": 1.5108,
"step": 253
},
{
"epoch": 0.6397984886649875,
"grad_norm": 0.4638257324695587,
"learning_rate": 3.6272040302267005e-06,
"loss": 1.492,
"step": 254
},
{
"epoch": 0.6423173803526449,
"grad_norm": 0.4734160602092743,
"learning_rate": 3.6020151133501262e-06,
"loss": 1.5113,
"step": 255
},
{
"epoch": 0.6448362720403022,
"grad_norm": 0.4613577127456665,
"learning_rate": 3.576826196473552e-06,
"loss": 1.5352,
"step": 256
},
{
"epoch": 0.6473551637279596,
"grad_norm": 0.6752243638038635,
"learning_rate": 3.5516372795969776e-06,
"loss": 1.492,
"step": 257
},
{
"epoch": 0.6498740554156172,
"grad_norm": 0.4645501673221588,
"learning_rate": 3.5264483627204033e-06,
"loss": 1.4993,
"step": 258
},
{
"epoch": 0.6523929471032746,
"grad_norm": 0.5898957252502441,
"learning_rate": 3.5012594458438295e-06,
"loss": 1.4917,
"step": 259
},
{
"epoch": 0.654911838790932,
"grad_norm": 0.4554866552352905,
"learning_rate": 3.4760705289672547e-06,
"loss": 1.5192,
"step": 260
},
{
"epoch": 0.6574307304785895,
"grad_norm": 0.4567941427230835,
"learning_rate": 3.45088161209068e-06,
"loss": 1.5442,
"step": 261
},
{
"epoch": 0.6599496221662469,
"grad_norm": 0.4824671447277069,
"learning_rate": 3.425692695214106e-06,
"loss": 1.5348,
"step": 262
},
{
"epoch": 0.6624685138539043,
"grad_norm": 0.4494476616382599,
"learning_rate": 3.4005037783375314e-06,
"loss": 1.5278,
"step": 263
},
{
"epoch": 0.6649874055415617,
"grad_norm": 0.5391709208488464,
"learning_rate": 3.3753148614609576e-06,
"loss": 1.5277,
"step": 264
},
{
"epoch": 0.6675062972292192,
"grad_norm": 0.4483042061328888,
"learning_rate": 3.3501259445843833e-06,
"loss": 1.4955,
"step": 265
},
{
"epoch": 0.6700251889168766,
"grad_norm": 0.46210387349128723,
"learning_rate": 3.3249370277078086e-06,
"loss": 1.5077,
"step": 266
},
{
"epoch": 0.672544080604534,
"grad_norm": 0.5058848261833191,
"learning_rate": 3.2997481108312347e-06,
"loss": 1.4645,
"step": 267
},
{
"epoch": 0.6750629722921915,
"grad_norm": 0.4964057207107544,
"learning_rate": 3.27455919395466e-06,
"loss": 1.4897,
"step": 268
},
{
"epoch": 0.6775818639798489,
"grad_norm": 0.46125808358192444,
"learning_rate": 3.249370277078086e-06,
"loss": 1.5414,
"step": 269
},
{
"epoch": 0.6801007556675063,
"grad_norm": 0.488656222820282,
"learning_rate": 3.2241813602015114e-06,
"loss": 1.4999,
"step": 270
},
{
"epoch": 0.6826196473551638,
"grad_norm": 0.4692099988460541,
"learning_rate": 3.1989924433249375e-06,
"loss": 1.5402,
"step": 271
},
{
"epoch": 0.6851385390428212,
"grad_norm": 0.49234357476234436,
"learning_rate": 3.173803526448363e-06,
"loss": 1.5373,
"step": 272
},
{
"epoch": 0.6876574307304786,
"grad_norm": 0.596118152141571,
"learning_rate": 3.1486146095717885e-06,
"loss": 1.5145,
"step": 273
},
{
"epoch": 0.690176322418136,
"grad_norm": 0.4749690890312195,
"learning_rate": 3.1234256926952146e-06,
"loss": 1.4973,
"step": 274
},
{
"epoch": 0.6926952141057935,
"grad_norm": 0.4940085709095001,
"learning_rate": 3.09823677581864e-06,
"loss": 1.464,
"step": 275
},
{
"epoch": 0.6952141057934509,
"grad_norm": 0.47270411252975464,
"learning_rate": 3.073047858942066e-06,
"loss": 1.5094,
"step": 276
},
{
"epoch": 0.6977329974811083,
"grad_norm": 0.4631718695163727,
"learning_rate": 3.0478589420654913e-06,
"loss": 1.4893,
"step": 277
},
{
"epoch": 0.7002518891687658,
"grad_norm": 0.5515400171279907,
"learning_rate": 3.0226700251889174e-06,
"loss": 1.5342,
"step": 278
},
{
"epoch": 0.7027707808564232,
"grad_norm": 0.5326355695724487,
"learning_rate": 2.9974811083123427e-06,
"loss": 1.5263,
"step": 279
},
{
"epoch": 0.7052896725440806,
"grad_norm": 0.45032408833503723,
"learning_rate": 2.9722921914357684e-06,
"loss": 1.4977,
"step": 280
},
{
"epoch": 0.707808564231738,
"grad_norm": 0.49274197220802307,
"learning_rate": 2.947103274559194e-06,
"loss": 1.4729,
"step": 281
},
{
"epoch": 0.7103274559193955,
"grad_norm": 0.45705220103263855,
"learning_rate": 2.92191435768262e-06,
"loss": 1.4908,
"step": 282
},
{
"epoch": 0.7128463476070529,
"grad_norm": 0.46655991673469543,
"learning_rate": 2.896725440806046e-06,
"loss": 1.503,
"step": 283
},
{
"epoch": 0.7153652392947103,
"grad_norm": 0.5047741532325745,
"learning_rate": 2.8715365239294713e-06,
"loss": 1.4656,
"step": 284
},
{
"epoch": 0.7178841309823678,
"grad_norm": 0.4772416949272156,
"learning_rate": 2.8463476070528965e-06,
"loss": 1.4664,
"step": 285
},
{
"epoch": 0.7204030226700252,
"grad_norm": 0.4567766487598419,
"learning_rate": 2.8211586901763227e-06,
"loss": 1.5123,
"step": 286
},
{
"epoch": 0.7229219143576826,
"grad_norm": 0.4822060763835907,
"learning_rate": 2.7959697732997484e-06,
"loss": 1.5079,
"step": 287
},
{
"epoch": 0.72544080604534,
"grad_norm": 0.637371301651001,
"learning_rate": 2.770780856423174e-06,
"loss": 1.472,
"step": 288
},
{
"epoch": 0.7279596977329975,
"grad_norm": 0.4881971478462219,
"learning_rate": 2.7455919395465998e-06,
"loss": 1.4737,
"step": 289
},
{
"epoch": 0.7304785894206549,
"grad_norm": 0.4653415381908417,
"learning_rate": 2.7204030226700255e-06,
"loss": 1.5104,
"step": 290
},
{
"epoch": 0.7329974811083123,
"grad_norm": 0.476697713136673,
"learning_rate": 2.695214105793451e-06,
"loss": 1.5072,
"step": 291
},
{
"epoch": 0.7355163727959698,
"grad_norm": 0.6168654561042786,
"learning_rate": 2.6700251889168765e-06,
"loss": 1.5142,
"step": 292
},
{
"epoch": 0.7380352644836272,
"grad_norm": 0.6653453707695007,
"learning_rate": 2.6448362720403026e-06,
"loss": 1.4897,
"step": 293
},
{
"epoch": 0.7405541561712846,
"grad_norm": 0.4866642951965332,
"learning_rate": 2.619647355163728e-06,
"loss": 1.5409,
"step": 294
},
{
"epoch": 0.743073047858942,
"grad_norm": 0.4763050377368927,
"learning_rate": 2.594458438287154e-06,
"loss": 1.5306,
"step": 295
},
{
"epoch": 0.7455919395465995,
"grad_norm": 0.5434437990188599,
"learning_rate": 2.5692695214105793e-06,
"loss": 1.5334,
"step": 296
},
{
"epoch": 0.7481108312342569,
"grad_norm": 0.5760312080383301,
"learning_rate": 2.5440806045340054e-06,
"loss": 1.5138,
"step": 297
},
{
"epoch": 0.7506297229219143,
"grad_norm": 0.44751110672950745,
"learning_rate": 2.518891687657431e-06,
"loss": 1.4845,
"step": 298
},
{
"epoch": 0.7531486146095718,
"grad_norm": 0.4421987235546112,
"learning_rate": 2.493702770780857e-06,
"loss": 1.4837,
"step": 299
},
{
"epoch": 0.7556675062972292,
"grad_norm": 0.7657718658447266,
"learning_rate": 2.4685138539042825e-06,
"loss": 1.5151,
"step": 300
},
{
"epoch": 0.7581863979848866,
"grad_norm": 0.5052861571311951,
"learning_rate": 2.443324937027708e-06,
"loss": 1.5404,
"step": 301
},
{
"epoch": 0.760705289672544,
"grad_norm": 0.5251312851905823,
"learning_rate": 2.4181360201511335e-06,
"loss": 1.4329,
"step": 302
},
{
"epoch": 0.7632241813602015,
"grad_norm": 0.46061962842941284,
"learning_rate": 2.3929471032745592e-06,
"loss": 1.4976,
"step": 303
},
{
"epoch": 0.7657430730478589,
"grad_norm": 0.4743208587169647,
"learning_rate": 2.367758186397985e-06,
"loss": 1.4939,
"step": 304
},
{
"epoch": 0.7682619647355163,
"grad_norm": 0.4864160418510437,
"learning_rate": 2.3425692695214107e-06,
"loss": 1.4997,
"step": 305
},
{
"epoch": 0.7707808564231738,
"grad_norm": 0.47275349497795105,
"learning_rate": 2.3173803526448368e-06,
"loss": 1.4793,
"step": 306
},
{
"epoch": 0.7732997481108312,
"grad_norm": 0.49562177062034607,
"learning_rate": 2.292191435768262e-06,
"loss": 1.4755,
"step": 307
},
{
"epoch": 0.7758186397984886,
"grad_norm": 0.564599335193634,
"learning_rate": 2.2670025188916878e-06,
"loss": 1.4932,
"step": 308
},
{
"epoch": 0.7783375314861462,
"grad_norm": 0.4657755494117737,
"learning_rate": 2.2418136020151135e-06,
"loss": 1.5076,
"step": 309
},
{
"epoch": 0.7808564231738035,
"grad_norm": 0.486026793718338,
"learning_rate": 2.216624685138539e-06,
"loss": 1.5014,
"step": 310
},
{
"epoch": 0.783375314861461,
"grad_norm": 0.4599766135215759,
"learning_rate": 2.191435768261965e-06,
"loss": 1.5274,
"step": 311
},
{
"epoch": 0.7858942065491183,
"grad_norm": 0.47607848048210144,
"learning_rate": 2.1662468513853906e-06,
"loss": 1.4701,
"step": 312
},
{
"epoch": 0.7884130982367759,
"grad_norm": 0.47365328669548035,
"learning_rate": 2.1410579345088163e-06,
"loss": 1.4932,
"step": 313
},
{
"epoch": 0.7909319899244333,
"grad_norm": 0.45562124252319336,
"learning_rate": 2.115869017632242e-06,
"loss": 1.4912,
"step": 314
},
{
"epoch": 0.7934508816120907,
"grad_norm": 0.5331164002418518,
"learning_rate": 2.0906801007556677e-06,
"loss": 1.5174,
"step": 315
},
{
"epoch": 0.7959697732997482,
"grad_norm": 0.509325385093689,
"learning_rate": 2.0654911838790934e-06,
"loss": 1.4788,
"step": 316
},
{
"epoch": 0.7984886649874056,
"grad_norm": 0.4969271123409271,
"learning_rate": 2.040302267002519e-06,
"loss": 1.5377,
"step": 317
},
{
"epoch": 0.801007556675063,
"grad_norm": 0.44712427258491516,
"learning_rate": 2.015113350125945e-06,
"loss": 1.5279,
"step": 318
},
{
"epoch": 0.8035264483627204,
"grad_norm": 0.47016969323158264,
"learning_rate": 1.9899244332493705e-06,
"loss": 1.5309,
"step": 319
},
{
"epoch": 0.8060453400503779,
"grad_norm": 0.5187602043151855,
"learning_rate": 1.9647355163727962e-06,
"loss": 1.507,
"step": 320
},
{
"epoch": 0.8085642317380353,
"grad_norm": 0.4568648636341095,
"learning_rate": 1.939546599496222e-06,
"loss": 1.4517,
"step": 321
},
{
"epoch": 0.8110831234256927,
"grad_norm": 0.4813389480113983,
"learning_rate": 1.9143576826196476e-06,
"loss": 1.5215,
"step": 322
},
{
"epoch": 0.8136020151133502,
"grad_norm": 0.5260921716690063,
"learning_rate": 1.8891687657430731e-06,
"loss": 1.5154,
"step": 323
},
{
"epoch": 0.8161209068010076,
"grad_norm": 0.5113592743873596,
"learning_rate": 1.863979848866499e-06,
"loss": 1.4496,
"step": 324
},
{
"epoch": 0.818639798488665,
"grad_norm": 0.48540815711021423,
"learning_rate": 1.8387909319899248e-06,
"loss": 1.4874,
"step": 325
},
{
"epoch": 0.8211586901763224,
"grad_norm": 0.4522131681442261,
"learning_rate": 1.8136020151133503e-06,
"loss": 1.4781,
"step": 326
},
{
"epoch": 0.8236775818639799,
"grad_norm": 0.45719313621520996,
"learning_rate": 1.788413098236776e-06,
"loss": 1.4859,
"step": 327
},
{
"epoch": 0.8261964735516373,
"grad_norm": 0.43814224004745483,
"learning_rate": 1.7632241813602017e-06,
"loss": 1.4775,
"step": 328
},
{
"epoch": 0.8287153652392947,
"grad_norm": 0.44290891289711,
"learning_rate": 1.7380352644836274e-06,
"loss": 1.5037,
"step": 329
},
{
"epoch": 0.8312342569269522,
"grad_norm": 0.4844774603843689,
"learning_rate": 1.712846347607053e-06,
"loss": 1.5179,
"step": 330
},
{
"epoch": 0.8337531486146096,
"grad_norm": 0.4434620440006256,
"learning_rate": 1.6876574307304788e-06,
"loss": 1.494,
"step": 331
},
{
"epoch": 0.836272040302267,
"grad_norm": 0.46283698081970215,
"learning_rate": 1.6624685138539043e-06,
"loss": 1.4889,
"step": 332
},
{
"epoch": 0.8387909319899244,
"grad_norm": 0.471802681684494,
"learning_rate": 1.63727959697733e-06,
"loss": 1.4558,
"step": 333
},
{
"epoch": 0.8413098236775819,
"grad_norm": 0.4605620205402374,
"learning_rate": 1.6120906801007557e-06,
"loss": 1.5238,
"step": 334
},
{
"epoch": 0.8438287153652393,
"grad_norm": 0.6928207874298096,
"learning_rate": 1.5869017632241814e-06,
"loss": 1.51,
"step": 335
},
{
"epoch": 0.8463476070528967,
"grad_norm": 0.48179909586906433,
"learning_rate": 1.5617128463476073e-06,
"loss": 1.5368,
"step": 336
},
{
"epoch": 0.8488664987405542,
"grad_norm": 0.5029130578041077,
"learning_rate": 1.536523929471033e-06,
"loss": 1.4563,
"step": 337
},
{
"epoch": 0.8513853904282116,
"grad_norm": 0.4608486294746399,
"learning_rate": 1.5113350125944587e-06,
"loss": 1.463,
"step": 338
},
{
"epoch": 0.853904282115869,
"grad_norm": 0.5182480216026306,
"learning_rate": 1.4861460957178842e-06,
"loss": 1.465,
"step": 339
},
{
"epoch": 0.8564231738035264,
"grad_norm": 0.4644806385040283,
"learning_rate": 1.46095717884131e-06,
"loss": 1.4987,
"step": 340
},
{
"epoch": 0.8589420654911839,
"grad_norm": 0.4732770323753357,
"learning_rate": 1.4357682619647356e-06,
"loss": 1.5133,
"step": 341
},
{
"epoch": 0.8614609571788413,
"grad_norm": 0.5835548043251038,
"learning_rate": 1.4105793450881613e-06,
"loss": 1.5233,
"step": 342
},
{
"epoch": 0.8639798488664987,
"grad_norm": 0.45620298385620117,
"learning_rate": 1.385390428211587e-06,
"loss": 1.4727,
"step": 343
},
{
"epoch": 0.8664987405541562,
"grad_norm": 0.4693787395954132,
"learning_rate": 1.3602015113350127e-06,
"loss": 1.4706,
"step": 344
},
{
"epoch": 0.8690176322418136,
"grad_norm": 0.6238934993743896,
"learning_rate": 1.3350125944584382e-06,
"loss": 1.5022,
"step": 345
},
{
"epoch": 0.871536523929471,
"grad_norm": 0.5140495896339417,
"learning_rate": 1.309823677581864e-06,
"loss": 1.4581,
"step": 346
},
{
"epoch": 0.8740554156171285,
"grad_norm": 0.6451770663261414,
"learning_rate": 1.2846347607052897e-06,
"loss": 1.523,
"step": 347
},
{
"epoch": 0.8765743073047859,
"grad_norm": 0.5394758582115173,
"learning_rate": 1.2594458438287156e-06,
"loss": 1.4815,
"step": 348
},
{
"epoch": 0.8790931989924433,
"grad_norm": 0.4751567840576172,
"learning_rate": 1.2342569269521413e-06,
"loss": 1.4666,
"step": 349
},
{
"epoch": 0.8816120906801007,
"grad_norm": 0.5158999562263489,
"learning_rate": 1.2090680100755668e-06,
"loss": 1.477,
"step": 350
},
{
"epoch": 0.8841309823677582,
"grad_norm": 0.47987380623817444,
"learning_rate": 1.1838790931989925e-06,
"loss": 1.4751,
"step": 351
},
{
"epoch": 0.8866498740554156,
"grad_norm": 0.45010906457901,
"learning_rate": 1.1586901763224184e-06,
"loss": 1.4935,
"step": 352
},
{
"epoch": 0.889168765743073,
"grad_norm": 0.4675264060497284,
"learning_rate": 1.1335012594458439e-06,
"loss": 1.4767,
"step": 353
},
{
"epoch": 0.8916876574307305,
"grad_norm": 0.4817536175251007,
"learning_rate": 1.1083123425692696e-06,
"loss": 1.5079,
"step": 354
},
{
"epoch": 0.8942065491183879,
"grad_norm": 0.5326683521270752,
"learning_rate": 1.0831234256926953e-06,
"loss": 1.4643,
"step": 355
},
{
"epoch": 0.8967254408060453,
"grad_norm": 0.45862582325935364,
"learning_rate": 1.057934508816121e-06,
"loss": 1.4784,
"step": 356
},
{
"epoch": 0.8992443324937027,
"grad_norm": 0.4639340937137604,
"learning_rate": 1.0327455919395467e-06,
"loss": 1.4669,
"step": 357
},
{
"epoch": 0.9017632241813602,
"grad_norm": 0.5519356727600098,
"learning_rate": 1.0075566750629724e-06,
"loss": 1.4962,
"step": 358
},
{
"epoch": 0.9042821158690176,
"grad_norm": 0.5423635244369507,
"learning_rate": 9.823677581863981e-07,
"loss": 1.5149,
"step": 359
},
{
"epoch": 0.906801007556675,
"grad_norm": 0.4961482286453247,
"learning_rate": 9.571788413098238e-07,
"loss": 1.4841,
"step": 360
},
{
"epoch": 0.9093198992443325,
"grad_norm": 0.5558215379714966,
"learning_rate": 9.319899244332495e-07,
"loss": 1.4672,
"step": 361
},
{
"epoch": 0.9118387909319899,
"grad_norm": 0.47575876116752625,
"learning_rate": 9.068010075566751e-07,
"loss": 1.5035,
"step": 362
},
{
"epoch": 0.9143576826196473,
"grad_norm": 0.44151756167411804,
"learning_rate": 8.816120906801008e-07,
"loss": 1.4923,
"step": 363
},
{
"epoch": 0.9168765743073047,
"grad_norm": 0.49502983689308167,
"learning_rate": 8.564231738035265e-07,
"loss": 1.4872,
"step": 364
},
{
"epoch": 0.9193954659949622,
"grad_norm": 0.4563881456851959,
"learning_rate": 8.312342569269521e-07,
"loss": 1.5022,
"step": 365
},
{
"epoch": 0.9219143576826196,
"grad_norm": 0.4814889132976532,
"learning_rate": 8.060453400503778e-07,
"loss": 1.4922,
"step": 366
},
{
"epoch": 0.924433249370277,
"grad_norm": 0.44825509190559387,
"learning_rate": 7.808564231738037e-07,
"loss": 1.4695,
"step": 367
},
{
"epoch": 0.9269521410579346,
"grad_norm": 0.46482357382774353,
"learning_rate": 7.556675062972294e-07,
"loss": 1.4943,
"step": 368
},
{
"epoch": 0.929471032745592,
"grad_norm": 0.5883563160896301,
"learning_rate": 7.30478589420655e-07,
"loss": 1.4658,
"step": 369
},
{
"epoch": 0.9319899244332494,
"grad_norm": 0.6148042678833008,
"learning_rate": 7.052896725440807e-07,
"loss": 1.4528,
"step": 370
},
{
"epoch": 0.9345088161209067,
"grad_norm": 0.4770396649837494,
"learning_rate": 6.801007556675064e-07,
"loss": 1.4914,
"step": 371
},
{
"epoch": 0.9370277078085643,
"grad_norm": 0.46335241198539734,
"learning_rate": 6.54911838790932e-07,
"loss": 1.5172,
"step": 372
},
{
"epoch": 0.9395465994962217,
"grad_norm": 0.46679455041885376,
"learning_rate": 6.297229219143578e-07,
"loss": 1.4426,
"step": 373
},
{
"epoch": 0.9420654911838791,
"grad_norm": 0.5507463216781616,
"learning_rate": 6.045340050377834e-07,
"loss": 1.5067,
"step": 374
},
{
"epoch": 0.9445843828715366,
"grad_norm": 0.468250572681427,
"learning_rate": 5.793450881612092e-07,
"loss": 1.5105,
"step": 375
},
{
"epoch": 0.947103274559194,
"grad_norm": 0.6048943996429443,
"learning_rate": 5.541561712846348e-07,
"loss": 1.4814,
"step": 376
},
{
"epoch": 0.9496221662468514,
"grad_norm": 0.4735409617424011,
"learning_rate": 5.289672544080605e-07,
"loss": 1.4739,
"step": 377
},
{
"epoch": 0.9521410579345088,
"grad_norm": 0.5519718527793884,
"learning_rate": 5.037783375314862e-07,
"loss": 1.5022,
"step": 378
},
{
"epoch": 0.9546599496221663,
"grad_norm": 0.4825071692466736,
"learning_rate": 4.785894206549119e-07,
"loss": 1.4977,
"step": 379
},
{
"epoch": 0.9571788413098237,
"grad_norm": 0.44791093468666077,
"learning_rate": 4.5340050377833756e-07,
"loss": 1.4912,
"step": 380
},
{
"epoch": 0.9596977329974811,
"grad_norm": 0.6440786719322205,
"learning_rate": 4.2821158690176327e-07,
"loss": 1.4602,
"step": 381
},
{
"epoch": 0.9622166246851386,
"grad_norm": 0.4575777053833008,
"learning_rate": 4.030226700251889e-07,
"loss": 1.4833,
"step": 382
},
{
"epoch": 0.964735516372796,
"grad_norm": 0.47071707248687744,
"learning_rate": 3.778337531486147e-07,
"loss": 1.4963,
"step": 383
},
{
"epoch": 0.9672544080604534,
"grad_norm": 0.6902024745941162,
"learning_rate": 3.5264483627204033e-07,
"loss": 1.4699,
"step": 384
},
{
"epoch": 0.9697732997481109,
"grad_norm": 0.48268118500709534,
"learning_rate": 3.27455919395466e-07,
"loss": 1.472,
"step": 385
},
{
"epoch": 0.9722921914357683,
"grad_norm": 0.4497368335723877,
"learning_rate": 3.022670025188917e-07,
"loss": 1.4654,
"step": 386
},
{
"epoch": 0.9748110831234257,
"grad_norm": 0.5587329864501953,
"learning_rate": 2.770780856423174e-07,
"loss": 1.5351,
"step": 387
},
{
"epoch": 0.9773299748110831,
"grad_norm": 0.5236759185791016,
"learning_rate": 2.518891687657431e-07,
"loss": 1.4955,
"step": 388
},
{
"epoch": 0.9798488664987406,
"grad_norm": 0.4622642397880554,
"learning_rate": 2.2670025188916878e-07,
"loss": 1.4956,
"step": 389
},
{
"epoch": 0.982367758186398,
"grad_norm": 0.4652063548564911,
"learning_rate": 2.0151133501259446e-07,
"loss": 1.4875,
"step": 390
},
{
"epoch": 0.9848866498740554,
"grad_norm": 0.44629859924316406,
"learning_rate": 1.7632241813602017e-07,
"loss": 1.4543,
"step": 391
},
{
"epoch": 0.9874055415617129,
"grad_norm": 0.45472198724746704,
"learning_rate": 1.5113350125944585e-07,
"loss": 1.5048,
"step": 392
},
{
"epoch": 0.9899244332493703,
"grad_norm": 0.4791916608810425,
"learning_rate": 1.2594458438287155e-07,
"loss": 1.4998,
"step": 393
},
{
"epoch": 0.9924433249370277,
"grad_norm": 0.45487239956855774,
"learning_rate": 1.0075566750629723e-07,
"loss": 1.5058,
"step": 394
},
{
"epoch": 0.9949622166246851,
"grad_norm": 0.5730354189872742,
"learning_rate": 7.556675062972292e-08,
"loss": 1.5314,
"step": 395
},
{
"epoch": 0.9974811083123426,
"grad_norm": 0.47194746136665344,
"learning_rate": 5.0377833753148615e-08,
"loss": 1.5077,
"step": 396
},
{
"epoch": 1.0,
"grad_norm": 0.5024914741516113,
"learning_rate": 2.5188916876574308e-08,
"loss": 1.4974,
"step": 397
}
],
"logging_steps": 1.0,
"max_steps": 397,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.840368526032896e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}