mbaza_bert / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 538240,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018579072532699168,
"grad_norm": 3.261758327484131,
"learning_rate": 4.995355231866826e-05,
"loss": 7.9049,
"step": 500
},
{
"epoch": 0.037158145065398336,
"grad_norm": 2.7945966720581055,
"learning_rate": 4.99071046373365e-05,
"loss": 7.313,
"step": 1000
},
{
"epoch": 0.0557372175980975,
"grad_norm": 3.66495680809021,
"learning_rate": 4.986065695600476e-05,
"loss": 7.0529,
"step": 1500
},
{
"epoch": 0.07431629013079667,
"grad_norm": 3.696575880050659,
"learning_rate": 4.981420927467301e-05,
"loss": 6.8319,
"step": 2000
},
{
"epoch": 0.09289536266349584,
"grad_norm": 4.190186977386475,
"learning_rate": 4.976776159334126e-05,
"loss": 6.5888,
"step": 2500
},
{
"epoch": 0.111474435196195,
"grad_norm": 4.945929050445557,
"learning_rate": 4.972131391200951e-05,
"loss": 6.3799,
"step": 3000
},
{
"epoch": 0.13005350772889418,
"grad_norm": 4.898506164550781,
"learning_rate": 4.967486623067777e-05,
"loss": 6.1872,
"step": 3500
},
{
"epoch": 0.14863258026159334,
"grad_norm": 4.006326675415039,
"learning_rate": 4.962841854934602e-05,
"loss": 6.0257,
"step": 4000
},
{
"epoch": 0.1672116527942925,
"grad_norm": 5.2170209884643555,
"learning_rate": 4.958197086801427e-05,
"loss": 5.8457,
"step": 4500
},
{
"epoch": 0.18579072532699167,
"grad_norm": 5.515870094299316,
"learning_rate": 4.953552318668253e-05,
"loss": 5.7231,
"step": 5000
},
{
"epoch": 0.20436979785969084,
"grad_norm": 6.801781177520752,
"learning_rate": 4.948907550535077e-05,
"loss": 5.5989,
"step": 5500
},
{
"epoch": 0.22294887039239,
"grad_norm": 5.6380205154418945,
"learning_rate": 4.944262782401903e-05,
"loss": 5.4627,
"step": 6000
},
{
"epoch": 0.24152794292508917,
"grad_norm": 4.960023880004883,
"learning_rate": 4.939618014268728e-05,
"loss": 5.3395,
"step": 6500
},
{
"epoch": 0.26010701545778836,
"grad_norm": 4.814383506774902,
"learning_rate": 4.934973246135553e-05,
"loss": 5.2441,
"step": 7000
},
{
"epoch": 0.2786860879904875,
"grad_norm": 5.070895195007324,
"learning_rate": 4.930328478002379e-05,
"loss": 5.1584,
"step": 7500
},
{
"epoch": 0.2972651605231867,
"grad_norm": 4.370517730712891,
"learning_rate": 4.925683709869204e-05,
"loss": 5.08,
"step": 8000
},
{
"epoch": 0.3158442330558858,
"grad_norm": 5.01335334777832,
"learning_rate": 4.921038941736029e-05,
"loss": 5.0119,
"step": 8500
},
{
"epoch": 0.334423305588585,
"grad_norm": 5.798189163208008,
"learning_rate": 4.916394173602854e-05,
"loss": 4.9469,
"step": 9000
},
{
"epoch": 0.3530023781212842,
"grad_norm": 5.567455291748047,
"learning_rate": 4.9117494054696796e-05,
"loss": 4.8906,
"step": 9500
},
{
"epoch": 0.37158145065398335,
"grad_norm": 5.719528675079346,
"learning_rate": 4.907104637336504e-05,
"loss": 4.826,
"step": 10000
},
{
"epoch": 0.39016052318668254,
"grad_norm": 5.381674289703369,
"learning_rate": 4.90245986920333e-05,
"loss": 4.7627,
"step": 10500
},
{
"epoch": 0.4087395957193817,
"grad_norm": 5.749002933502197,
"learning_rate": 4.8978151010701554e-05,
"loss": 4.7247,
"step": 11000
},
{
"epoch": 0.42731866825208087,
"grad_norm": 5.553402423858643,
"learning_rate": 4.89317033293698e-05,
"loss": 4.648,
"step": 11500
},
{
"epoch": 0.44589774078478,
"grad_norm": 5.483209133148193,
"learning_rate": 4.8885255648038055e-05,
"loss": 4.5921,
"step": 12000
},
{
"epoch": 0.4644768133174792,
"grad_norm": 5.7984747886657715,
"learning_rate": 4.8838807966706305e-05,
"loss": 4.5489,
"step": 12500
},
{
"epoch": 0.48305588585017833,
"grad_norm": 5.168997287750244,
"learning_rate": 4.8792360285374556e-05,
"loss": 4.5258,
"step": 13000
},
{
"epoch": 0.5016349583828775,
"grad_norm": 5.243541240692139,
"learning_rate": 4.8745912604042806e-05,
"loss": 4.4826,
"step": 13500
},
{
"epoch": 0.5202140309155767,
"grad_norm": 5.172119140625,
"learning_rate": 4.8699464922711064e-05,
"loss": 4.4446,
"step": 14000
},
{
"epoch": 0.5387931034482759,
"grad_norm": 4.617650032043457,
"learning_rate": 4.8653017241379314e-05,
"loss": 4.4262,
"step": 14500
},
{
"epoch": 0.557372175980975,
"grad_norm": 5.853124618530273,
"learning_rate": 4.8606569560047565e-05,
"loss": 4.3786,
"step": 15000
},
{
"epoch": 0.5759512485136742,
"grad_norm": 4.6473493576049805,
"learning_rate": 4.8560121878715815e-05,
"loss": 4.3497,
"step": 15500
},
{
"epoch": 0.5945303210463734,
"grad_norm": 5.469015121459961,
"learning_rate": 4.8513674197384066e-05,
"loss": 4.2946,
"step": 16000
},
{
"epoch": 0.6131093935790726,
"grad_norm": 4.9087371826171875,
"learning_rate": 4.846722651605232e-05,
"loss": 4.2847,
"step": 16500
},
{
"epoch": 0.6316884661117717,
"grad_norm": 5.479755878448486,
"learning_rate": 4.842077883472057e-05,
"loss": 4.2635,
"step": 17000
},
{
"epoch": 0.6502675386444708,
"grad_norm": 5.843453407287598,
"learning_rate": 4.8374331153388824e-05,
"loss": 4.2108,
"step": 17500
},
{
"epoch": 0.66884661117717,
"grad_norm": 5.5419535636901855,
"learning_rate": 4.832788347205708e-05,
"loss": 4.2187,
"step": 18000
},
{
"epoch": 0.6874256837098692,
"grad_norm": 5.9003376960754395,
"learning_rate": 4.8281435790725325e-05,
"loss": 4.16,
"step": 18500
},
{
"epoch": 0.7060047562425684,
"grad_norm": 5.591574668884277,
"learning_rate": 4.823498810939358e-05,
"loss": 4.1433,
"step": 19000
},
{
"epoch": 0.7245838287752675,
"grad_norm": 5.295060634613037,
"learning_rate": 4.818854042806183e-05,
"loss": 4.1182,
"step": 19500
},
{
"epoch": 0.7431629013079667,
"grad_norm": 5.101735591888428,
"learning_rate": 4.814209274673008e-05,
"loss": 4.1155,
"step": 20000
},
{
"epoch": 0.7617419738406659,
"grad_norm": 5.852224349975586,
"learning_rate": 4.809564506539834e-05,
"loss": 4.0749,
"step": 20500
},
{
"epoch": 0.7803210463733651,
"grad_norm": 5.916712760925293,
"learning_rate": 4.804919738406659e-05,
"loss": 4.0554,
"step": 21000
},
{
"epoch": 0.7989001189060642,
"grad_norm": 5.017261505126953,
"learning_rate": 4.800274970273484e-05,
"loss": 4.0196,
"step": 21500
},
{
"epoch": 0.8174791914387634,
"grad_norm": 5.785404682159424,
"learning_rate": 4.795630202140309e-05,
"loss": 4.0196,
"step": 22000
},
{
"epoch": 0.8360582639714625,
"grad_norm": 5.758474826812744,
"learning_rate": 4.790985434007135e-05,
"loss": 4.0027,
"step": 22500
},
{
"epoch": 0.8546373365041617,
"grad_norm": 6.120078086853027,
"learning_rate": 4.786340665873959e-05,
"loss": 3.9577,
"step": 23000
},
{
"epoch": 0.8732164090368609,
"grad_norm": 6.130490779876709,
"learning_rate": 4.781695897740785e-05,
"loss": 3.9507,
"step": 23500
},
{
"epoch": 0.89179548156956,
"grad_norm": 4.95521354675293,
"learning_rate": 4.77705112960761e-05,
"loss": 3.9216,
"step": 24000
},
{
"epoch": 0.9103745541022592,
"grad_norm": 5.775145530700684,
"learning_rate": 4.772406361474435e-05,
"loss": 3.9177,
"step": 24500
},
{
"epoch": 0.9289536266349584,
"grad_norm": 5.804774761199951,
"learning_rate": 4.767761593341261e-05,
"loss": 3.8925,
"step": 25000
},
{
"epoch": 0.9475326991676576,
"grad_norm": 5.883671760559082,
"learning_rate": 4.763116825208086e-05,
"loss": 3.8722,
"step": 25500
},
{
"epoch": 0.9661117717003567,
"grad_norm": 5.462569236755371,
"learning_rate": 4.758472057074911e-05,
"loss": 3.8502,
"step": 26000
},
{
"epoch": 0.9846908442330559,
"grad_norm": 5.68014669418335,
"learning_rate": 4.753827288941736e-05,
"loss": 3.8339,
"step": 26500
},
{
"epoch": 1.003269916765755,
"grad_norm": 5.658189296722412,
"learning_rate": 4.749182520808562e-05,
"loss": 3.8221,
"step": 27000
},
{
"epoch": 1.0218489892984541,
"grad_norm": 5.337306976318359,
"learning_rate": 4.744537752675387e-05,
"loss": 3.8099,
"step": 27500
},
{
"epoch": 1.0404280618311534,
"grad_norm": 5.810146808624268,
"learning_rate": 4.739892984542212e-05,
"loss": 3.7751,
"step": 28000
},
{
"epoch": 1.0590071343638525,
"grad_norm": 5.76528263092041,
"learning_rate": 4.7352482164090375e-05,
"loss": 3.7551,
"step": 28500
},
{
"epoch": 1.0775862068965518,
"grad_norm": 6.346560955047607,
"learning_rate": 4.730603448275862e-05,
"loss": 3.7454,
"step": 29000
},
{
"epoch": 1.096165279429251,
"grad_norm": 5.019473552703857,
"learning_rate": 4.7259586801426876e-05,
"loss": 3.74,
"step": 29500
},
{
"epoch": 1.11474435196195,
"grad_norm": 5.211459636688232,
"learning_rate": 4.721313912009513e-05,
"loss": 3.7018,
"step": 30000
},
{
"epoch": 1.1333234244946493,
"grad_norm": 5.713869571685791,
"learning_rate": 4.716669143876338e-05,
"loss": 3.6834,
"step": 30500
},
{
"epoch": 1.1519024970273484,
"grad_norm": 5.4642744064331055,
"learning_rate": 4.7120243757431635e-05,
"loss": 3.7053,
"step": 31000
},
{
"epoch": 1.1704815695600477,
"grad_norm": 6.932915687561035,
"learning_rate": 4.7073796076099885e-05,
"loss": 3.6719,
"step": 31500
},
{
"epoch": 1.1890606420927468,
"grad_norm": 5.861956596374512,
"learning_rate": 4.7027348394768136e-05,
"loss": 3.6613,
"step": 32000
},
{
"epoch": 1.2076397146254458,
"grad_norm": 5.654363632202148,
"learning_rate": 4.6980900713436386e-05,
"loss": 3.6413,
"step": 32500
},
{
"epoch": 1.2262187871581451,
"grad_norm": 5.2097392082214355,
"learning_rate": 4.693445303210464e-05,
"loss": 3.6603,
"step": 33000
},
{
"epoch": 1.2447978596908442,
"grad_norm": 5.455073833465576,
"learning_rate": 4.688800535077289e-05,
"loss": 3.6455,
"step": 33500
},
{
"epoch": 1.2633769322235433,
"grad_norm": 5.670316219329834,
"learning_rate": 4.6841557669441144e-05,
"loss": 3.6029,
"step": 34000
},
{
"epoch": 1.2819560047562426,
"grad_norm": 6.064113140106201,
"learning_rate": 4.6795109988109395e-05,
"loss": 3.5978,
"step": 34500
},
{
"epoch": 1.3005350772889417,
"grad_norm": 5.650447368621826,
"learning_rate": 4.6748662306777645e-05,
"loss": 3.5817,
"step": 35000
},
{
"epoch": 1.3191141498216408,
"grad_norm": 7.115864276885986,
"learning_rate": 4.67022146254459e-05,
"loss": 3.5769,
"step": 35500
},
{
"epoch": 1.33769322235434,
"grad_norm": 6.497522354125977,
"learning_rate": 4.665576694411415e-05,
"loss": 3.5825,
"step": 36000
},
{
"epoch": 1.3562722948870394,
"grad_norm": 5.834658622741699,
"learning_rate": 4.6609319262782404e-05,
"loss": 3.555,
"step": 36500
},
{
"epoch": 1.3748513674197385,
"grad_norm": 5.968478679656982,
"learning_rate": 4.6562871581450654e-05,
"loss": 3.5476,
"step": 37000
},
{
"epoch": 1.3934304399524375,
"grad_norm": 5.435029983520508,
"learning_rate": 4.651642390011891e-05,
"loss": 3.518,
"step": 37500
},
{
"epoch": 1.4120095124851368,
"grad_norm": 5.952866554260254,
"learning_rate": 4.646997621878716e-05,
"loss": 3.5201,
"step": 38000
},
{
"epoch": 1.430588585017836,
"grad_norm": 6.440069675445557,
"learning_rate": 4.642352853745541e-05,
"loss": 3.5159,
"step": 38500
},
{
"epoch": 1.449167657550535,
"grad_norm": 5.686422824859619,
"learning_rate": 4.637708085612367e-05,
"loss": 3.5054,
"step": 39000
},
{
"epoch": 1.4677467300832343,
"grad_norm": 6.039205551147461,
"learning_rate": 4.633063317479191e-05,
"loss": 3.499,
"step": 39500
},
{
"epoch": 1.4863258026159334,
"grad_norm": 5.929929256439209,
"learning_rate": 4.628418549346017e-05,
"loss": 3.5125,
"step": 40000
},
{
"epoch": 1.5049048751486325,
"grad_norm": 6.127495288848877,
"learning_rate": 4.623773781212842e-05,
"loss": 3.4683,
"step": 40500
},
{
"epoch": 1.5234839476813318,
"grad_norm": 5.202524662017822,
"learning_rate": 4.619129013079667e-05,
"loss": 3.4457,
"step": 41000
},
{
"epoch": 1.542063020214031,
"grad_norm": 6.199319839477539,
"learning_rate": 4.614484244946493e-05,
"loss": 3.4489,
"step": 41500
},
{
"epoch": 1.56064209274673,
"grad_norm": 5.948836803436279,
"learning_rate": 4.609839476813318e-05,
"loss": 3.4412,
"step": 42000
},
{
"epoch": 1.5792211652794292,
"grad_norm": 6.036122798919678,
"learning_rate": 4.605194708680143e-05,
"loss": 3.4259,
"step": 42500
},
{
"epoch": 1.5978002378121285,
"grad_norm": 7.213582992553711,
"learning_rate": 4.600549940546968e-05,
"loss": 3.425,
"step": 43000
},
{
"epoch": 1.6163793103448276,
"grad_norm": 5.579181671142578,
"learning_rate": 4.595905172413794e-05,
"loss": 3.4094,
"step": 43500
},
{
"epoch": 1.6349583828775267,
"grad_norm": 6.071746349334717,
"learning_rate": 4.591260404280618e-05,
"loss": 3.3886,
"step": 44000
},
{
"epoch": 1.653537455410226,
"grad_norm": 6.017687797546387,
"learning_rate": 4.586615636147444e-05,
"loss": 3.3987,
"step": 44500
},
{
"epoch": 1.672116527942925,
"grad_norm": 6.2989349365234375,
"learning_rate": 4.581970868014269e-05,
"loss": 3.3987,
"step": 45000
},
{
"epoch": 1.6906956004756242,
"grad_norm": 5.678338527679443,
"learning_rate": 4.577326099881094e-05,
"loss": 3.3755,
"step": 45500
},
{
"epoch": 1.7092746730083235,
"grad_norm": 6.020495891571045,
"learning_rate": 4.57268133174792e-05,
"loss": 3.3591,
"step": 46000
},
{
"epoch": 1.7278537455410226,
"grad_norm": 5.941638946533203,
"learning_rate": 4.568036563614744e-05,
"loss": 3.3574,
"step": 46500
},
{
"epoch": 1.7464328180737216,
"grad_norm": 6.722168922424316,
"learning_rate": 4.56339179548157e-05,
"loss": 3.3746,
"step": 47000
},
{
"epoch": 1.765011890606421,
"grad_norm": 6.632647514343262,
"learning_rate": 4.558747027348395e-05,
"loss": 3.3535,
"step": 47500
},
{
"epoch": 1.7835909631391202,
"grad_norm": 6.448876857757568,
"learning_rate": 4.55410225921522e-05,
"loss": 3.3581,
"step": 48000
},
{
"epoch": 1.802170035671819,
"grad_norm": 5.348858833312988,
"learning_rate": 4.5494574910820456e-05,
"loss": 3.3432,
"step": 48500
},
{
"epoch": 1.8207491082045184,
"grad_norm": 6.1672186851501465,
"learning_rate": 4.5448127229488707e-05,
"loss": 3.3422,
"step": 49000
},
{
"epoch": 1.8393281807372177,
"grad_norm": 5.889304161071777,
"learning_rate": 4.540167954815696e-05,
"loss": 3.3224,
"step": 49500
},
{
"epoch": 1.8579072532699168,
"grad_norm": 6.291742324829102,
"learning_rate": 4.535523186682521e-05,
"loss": 3.3133,
"step": 50000
},
{
"epoch": 1.8764863258026159,
"grad_norm": 5.806668758392334,
"learning_rate": 4.5308784185493465e-05,
"loss": 3.3243,
"step": 50500
},
{
"epoch": 1.8950653983353152,
"grad_norm": 6.051152229309082,
"learning_rate": 4.5262336504161715e-05,
"loss": 3.2947,
"step": 51000
},
{
"epoch": 1.9136444708680143,
"grad_norm": 6.568633079528809,
"learning_rate": 4.5215888822829966e-05,
"loss": 3.2911,
"step": 51500
},
{
"epoch": 1.9322235434007133,
"grad_norm": 6.369818210601807,
"learning_rate": 4.516944114149822e-05,
"loss": 3.2676,
"step": 52000
},
{
"epoch": 1.9508026159334126,
"grad_norm": 5.4059014320373535,
"learning_rate": 4.512299346016647e-05,
"loss": 3.2633,
"step": 52500
},
{
"epoch": 1.9693816884661117,
"grad_norm": 5.883274078369141,
"learning_rate": 4.5076545778834724e-05,
"loss": 3.2759,
"step": 53000
},
{
"epoch": 1.9879607609988108,
"grad_norm": 6.672674179077148,
"learning_rate": 4.5030098097502975e-05,
"loss": 3.2794,
"step": 53500
},
{
"epoch": 2.00653983353151,
"grad_norm": 6.007123947143555,
"learning_rate": 4.4983650416171225e-05,
"loss": 3.2476,
"step": 54000
},
{
"epoch": 2.0251189060642094,
"grad_norm": 5.490503787994385,
"learning_rate": 4.4937202734839475e-05,
"loss": 3.2269,
"step": 54500
},
{
"epoch": 2.0436979785969083,
"grad_norm": 6.020650386810303,
"learning_rate": 4.489075505350773e-05,
"loss": 3.2368,
"step": 55000
},
{
"epoch": 2.0622770511296076,
"grad_norm": 6.575464248657227,
"learning_rate": 4.484430737217598e-05,
"loss": 3.1994,
"step": 55500
},
{
"epoch": 2.080856123662307,
"grad_norm": 5.844559192657471,
"learning_rate": 4.4797859690844234e-05,
"loss": 3.1938,
"step": 56000
},
{
"epoch": 2.0994351961950057,
"grad_norm": 5.892060279846191,
"learning_rate": 4.475141200951249e-05,
"loss": 3.205,
"step": 56500
},
{
"epoch": 2.118014268727705,
"grad_norm": 5.993409156799316,
"learning_rate": 4.4704964328180735e-05,
"loss": 3.212,
"step": 57000
},
{
"epoch": 2.1365933412604043,
"grad_norm": 6.564383029937744,
"learning_rate": 4.465851664684899e-05,
"loss": 3.1842,
"step": 57500
},
{
"epoch": 2.1551724137931036,
"grad_norm": 5.989982604980469,
"learning_rate": 4.461206896551724e-05,
"loss": 3.1859,
"step": 58000
},
{
"epoch": 2.1737514863258025,
"grad_norm": 5.895771503448486,
"learning_rate": 4.456562128418549e-05,
"loss": 3.1839,
"step": 58500
},
{
"epoch": 2.192330558858502,
"grad_norm": 5.832091331481934,
"learning_rate": 4.451917360285375e-05,
"loss": 3.1728,
"step": 59000
},
{
"epoch": 2.210909631391201,
"grad_norm": 5.480752468109131,
"learning_rate": 4.4472725921522e-05,
"loss": 3.1711,
"step": 59500
},
{
"epoch": 2.2294887039239,
"grad_norm": 5.683358192443848,
"learning_rate": 4.442627824019025e-05,
"loss": 3.1669,
"step": 60000
},
{
"epoch": 2.2480677764565993,
"grad_norm": 6.638919830322266,
"learning_rate": 4.43798305588585e-05,
"loss": 3.1677,
"step": 60500
},
{
"epoch": 2.2666468489892986,
"grad_norm": 5.941629886627197,
"learning_rate": 4.433338287752676e-05,
"loss": 3.1506,
"step": 61000
},
{
"epoch": 2.2852259215219974,
"grad_norm": 6.227372169494629,
"learning_rate": 4.428693519619501e-05,
"loss": 3.1607,
"step": 61500
},
{
"epoch": 2.3038049940546967,
"grad_norm": 6.063544750213623,
"learning_rate": 4.424048751486326e-05,
"loss": 3.1477,
"step": 62000
},
{
"epoch": 2.322384066587396,
"grad_norm": 5.8914618492126465,
"learning_rate": 4.419403983353152e-05,
"loss": 3.1362,
"step": 62500
},
{
"epoch": 2.3409631391200953,
"grad_norm": 5.964859962463379,
"learning_rate": 4.414759215219976e-05,
"loss": 3.1282,
"step": 63000
},
{
"epoch": 2.359542211652794,
"grad_norm": 5.622297763824463,
"learning_rate": 4.410114447086802e-05,
"loss": 3.1386,
"step": 63500
},
{
"epoch": 2.3781212841854935,
"grad_norm": 6.728824138641357,
"learning_rate": 4.405469678953627e-05,
"loss": 3.1202,
"step": 64000
},
{
"epoch": 2.396700356718193,
"grad_norm": 6.513198375701904,
"learning_rate": 4.400824910820452e-05,
"loss": 3.1455,
"step": 64500
},
{
"epoch": 2.4152794292508917,
"grad_norm": 6.273243427276611,
"learning_rate": 4.396180142687277e-05,
"loss": 3.1143,
"step": 65000
},
{
"epoch": 2.433858501783591,
"grad_norm": 5.384542465209961,
"learning_rate": 4.391535374554103e-05,
"loss": 3.1111,
"step": 65500
},
{
"epoch": 2.4524375743162903,
"grad_norm": 5.742457866668701,
"learning_rate": 4.386890606420928e-05,
"loss": 3.1146,
"step": 66000
},
{
"epoch": 2.471016646848989,
"grad_norm": 6.236721038818359,
"learning_rate": 4.382245838287753e-05,
"loss": 3.111,
"step": 66500
},
{
"epoch": 2.4895957193816884,
"grad_norm": 6.027072429656982,
"learning_rate": 4.3776010701545785e-05,
"loss": 3.088,
"step": 67000
},
{
"epoch": 2.5081747919143877,
"grad_norm": 6.66511869430542,
"learning_rate": 4.372956302021403e-05,
"loss": 3.1021,
"step": 67500
},
{
"epoch": 2.5267538644470866,
"grad_norm": 5.52970552444458,
"learning_rate": 4.3683115338882286e-05,
"loss": 3.1001,
"step": 68000
},
{
"epoch": 2.545332936979786,
"grad_norm": 6.7701897621154785,
"learning_rate": 4.363666765755054e-05,
"loss": 3.0905,
"step": 68500
},
{
"epoch": 2.563912009512485,
"grad_norm": 5.972938537597656,
"learning_rate": 4.359021997621879e-05,
"loss": 3.0665,
"step": 69000
},
{
"epoch": 2.582491082045184,
"grad_norm": 6.33815860748291,
"learning_rate": 4.3543772294887044e-05,
"loss": 3.0703,
"step": 69500
},
{
"epoch": 2.6010701545778834,
"grad_norm": 5.83467435836792,
"learning_rate": 4.3497324613555295e-05,
"loss": 3.0804,
"step": 70000
},
{
"epoch": 2.6196492271105827,
"grad_norm": 6.139744758605957,
"learning_rate": 4.3450876932223545e-05,
"loss": 3.0668,
"step": 70500
},
{
"epoch": 2.6382282996432815,
"grad_norm": 7.028213977813721,
"learning_rate": 4.3404429250891796e-05,
"loss": 3.0549,
"step": 71000
},
{
"epoch": 2.656807372175981,
"grad_norm": 5.353559970855713,
"learning_rate": 4.335798156956005e-05,
"loss": 3.0684,
"step": 71500
},
{
"epoch": 2.67538644470868,
"grad_norm": 6.900554656982422,
"learning_rate": 4.3311533888228304e-05,
"loss": 3.035,
"step": 72000
},
{
"epoch": 2.6939655172413794,
"grad_norm": 6.68520450592041,
"learning_rate": 4.3265086206896554e-05,
"loss": 3.0307,
"step": 72500
},
{
"epoch": 2.7125445897740788,
"grad_norm": 6.080930233001709,
"learning_rate": 4.321863852556481e-05,
"loss": 3.0379,
"step": 73000
},
{
"epoch": 2.7311236623067776,
"grad_norm": 5.922386646270752,
"learning_rate": 4.3172190844233055e-05,
"loss": 3.0393,
"step": 73500
},
{
"epoch": 2.749702734839477,
"grad_norm": 6.372087001800537,
"learning_rate": 4.312574316290131e-05,
"loss": 3.0243,
"step": 74000
},
{
"epoch": 2.768281807372176,
"grad_norm": 6.071821689605713,
"learning_rate": 4.307929548156956e-05,
"loss": 3.0283,
"step": 74500
},
{
"epoch": 2.786860879904875,
"grad_norm": 6.012415409088135,
"learning_rate": 4.3032847800237813e-05,
"loss": 3.025,
"step": 75000
},
{
"epoch": 2.8054399524375744,
"grad_norm": 6.770437717437744,
"learning_rate": 4.2986400118906064e-05,
"loss": 3.0242,
"step": 75500
},
{
"epoch": 2.8240190249702737,
"grad_norm": 6.748111724853516,
"learning_rate": 4.2939952437574314e-05,
"loss": 3.016,
"step": 76000
},
{
"epoch": 2.8425980975029725,
"grad_norm": 6.000352382659912,
"learning_rate": 4.289350475624257e-05,
"loss": 3.0208,
"step": 76500
},
{
"epoch": 2.861177170035672,
"grad_norm": 6.079233646392822,
"learning_rate": 4.284705707491082e-05,
"loss": 3.0062,
"step": 77000
},
{
"epoch": 2.879756242568371,
"grad_norm": 5.8158040046691895,
"learning_rate": 4.280060939357907e-05,
"loss": 3.0162,
"step": 77500
},
{
"epoch": 2.89833531510107,
"grad_norm": 7.081645965576172,
"learning_rate": 4.275416171224732e-05,
"loss": 2.9933,
"step": 78000
},
{
"epoch": 2.9169143876337693,
"grad_norm": 7.042798042297363,
"learning_rate": 4.270771403091558e-05,
"loss": 3.0055,
"step": 78500
},
{
"epoch": 2.9354934601664686,
"grad_norm": 6.736599445343018,
"learning_rate": 4.266126634958383e-05,
"loss": 2.9964,
"step": 79000
},
{
"epoch": 2.9540725326991675,
"grad_norm": 6.28444242477417,
"learning_rate": 4.261481866825208e-05,
"loss": 2.9943,
"step": 79500
},
{
"epoch": 2.972651605231867,
"grad_norm": 6.56734561920166,
"learning_rate": 4.256837098692034e-05,
"loss": 2.9807,
"step": 80000
},
{
"epoch": 2.991230677764566,
"grad_norm": 6.312285423278809,
"learning_rate": 4.252192330558858e-05,
"loss": 2.9975,
"step": 80500
},
{
"epoch": 3.009809750297265,
"grad_norm": 5.604495048522949,
"learning_rate": 4.247547562425684e-05,
"loss": 2.964,
"step": 81000
},
{
"epoch": 3.0283888228299642,
"grad_norm": 6.2611083984375,
"learning_rate": 4.242902794292509e-05,
"loss": 2.9411,
"step": 81500
},
{
"epoch": 3.0469678953626635,
"grad_norm": 6.149163246154785,
"learning_rate": 4.238258026159334e-05,
"loss": 2.9448,
"step": 82000
},
{
"epoch": 3.065546967895363,
"grad_norm": 6.137192249298096,
"learning_rate": 4.23361325802616e-05,
"loss": 2.9208,
"step": 82500
},
{
"epoch": 3.0841260404280617,
"grad_norm": 5.697031497955322,
"learning_rate": 4.228968489892985e-05,
"loss": 2.9352,
"step": 83000
},
{
"epoch": 3.102705112960761,
"grad_norm": 6.298037528991699,
"learning_rate": 4.22432372175981e-05,
"loss": 2.9318,
"step": 83500
},
{
"epoch": 3.1212841854934603,
"grad_norm": 6.293707370758057,
"learning_rate": 4.219678953626635e-05,
"loss": 2.9484,
"step": 84000
},
{
"epoch": 3.139863258026159,
"grad_norm": 6.098722457885742,
"learning_rate": 4.215034185493461e-05,
"loss": 2.923,
"step": 84500
},
{
"epoch": 3.1584423305588585,
"grad_norm": 5.604320526123047,
"learning_rate": 4.210389417360285e-05,
"loss": 2.932,
"step": 85000
},
{
"epoch": 3.177021403091558,
"grad_norm": 6.741579055786133,
"learning_rate": 4.205744649227111e-05,
"loss": 2.911,
"step": 85500
},
{
"epoch": 3.1956004756242566,
"grad_norm": 6.246683120727539,
"learning_rate": 4.201099881093936e-05,
"loss": 2.9139,
"step": 86000
},
{
"epoch": 3.214179548156956,
"grad_norm": 6.600460052490234,
"learning_rate": 4.196455112960761e-05,
"loss": 2.9329,
"step": 86500
},
{
"epoch": 3.2327586206896552,
"grad_norm": 6.846024990081787,
"learning_rate": 4.1918103448275866e-05,
"loss": 2.9189,
"step": 87000
},
{
"epoch": 3.2513376932223546,
"grad_norm": 6.301860332489014,
"learning_rate": 4.1871655766944116e-05,
"loss": 2.9191,
"step": 87500
},
{
"epoch": 3.2699167657550534,
"grad_norm": 5.542537689208984,
"learning_rate": 4.182520808561237e-05,
"loss": 2.8991,
"step": 88000
},
{
"epoch": 3.2884958382877527,
"grad_norm": 6.527828216552734,
"learning_rate": 4.177876040428062e-05,
"loss": 2.8959,
"step": 88500
},
{
"epoch": 3.307074910820452,
"grad_norm": 6.696499824523926,
"learning_rate": 4.1732312722948875e-05,
"loss": 2.904,
"step": 89000
},
{
"epoch": 3.325653983353151,
"grad_norm": 6.901641368865967,
"learning_rate": 4.1685865041617125e-05,
"loss": 2.8918,
"step": 89500
},
{
"epoch": 3.34423305588585,
"grad_norm": 5.950034141540527,
"learning_rate": 4.1639417360285376e-05,
"loss": 2.8953,
"step": 90000
},
{
"epoch": 3.3628121284185495,
"grad_norm": 6.489218235015869,
"learning_rate": 4.159296967895363e-05,
"loss": 2.8983,
"step": 90500
},
{
"epoch": 3.3813912009512483,
"grad_norm": 7.06480073928833,
"learning_rate": 4.154652199762188e-05,
"loss": 2.9144,
"step": 91000
},
{
"epoch": 3.3999702734839476,
"grad_norm": 6.297793388366699,
"learning_rate": 4.1500074316290134e-05,
"loss": 2.8853,
"step": 91500
},
{
"epoch": 3.418549346016647,
"grad_norm": 6.4150919914245605,
"learning_rate": 4.1453626634958384e-05,
"loss": 2.8906,
"step": 92000
},
{
"epoch": 3.437128418549346,
"grad_norm": 7.301102638244629,
"learning_rate": 4.1407178953626635e-05,
"loss": 2.8706,
"step": 92500
},
{
"epoch": 3.455707491082045,
"grad_norm": 6.061220645904541,
"learning_rate": 4.136073127229489e-05,
"loss": 2.8732,
"step": 93000
},
{
"epoch": 3.4742865636147444,
"grad_norm": 6.419704914093018,
"learning_rate": 4.131428359096314e-05,
"loss": 2.861,
"step": 93500
},
{
"epoch": 3.4928656361474433,
"grad_norm": 7.272397994995117,
"learning_rate": 4.126783590963139e-05,
"loss": 2.8942,
"step": 94000
},
{
"epoch": 3.5114447086801426,
"grad_norm": 6.250875949859619,
"learning_rate": 4.1221388228299644e-05,
"loss": 2.8639,
"step": 94500
},
{
"epoch": 3.530023781212842,
"grad_norm": 6.624760150909424,
"learning_rate": 4.11749405469679e-05,
"loss": 2.8706,
"step": 95000
},
{
"epoch": 3.548602853745541,
"grad_norm": 7.242002487182617,
"learning_rate": 4.1128492865636145e-05,
"loss": 2.8549,
"step": 95500
},
{
"epoch": 3.56718192627824,
"grad_norm": 6.070115089416504,
"learning_rate": 4.10820451843044e-05,
"loss": 2.8564,
"step": 96000
},
{
"epoch": 3.5857609988109393,
"grad_norm": 6.022694110870361,
"learning_rate": 4.103559750297266e-05,
"loss": 2.8637,
"step": 96500
},
{
"epoch": 3.6043400713436387,
"grad_norm": 5.543400287628174,
"learning_rate": 4.09891498216409e-05,
"loss": 2.8219,
"step": 97000
},
{
"epoch": 3.622919143876338,
"grad_norm": 6.441455841064453,
"learning_rate": 4.094270214030916e-05,
"loss": 2.844,
"step": 97500
},
{
"epoch": 3.641498216409037,
"grad_norm": 6.443978786468506,
"learning_rate": 4.089625445897741e-05,
"loss": 2.8337,
"step": 98000
},
{
"epoch": 3.660077288941736,
"grad_norm": 7.063666820526123,
"learning_rate": 4.084980677764566e-05,
"loss": 2.869,
"step": 98500
},
{
"epoch": 3.6786563614744354,
"grad_norm": 6.34807825088501,
"learning_rate": 4.080335909631391e-05,
"loss": 2.8303,
"step": 99000
},
{
"epoch": 3.6972354340071343,
"grad_norm": 6.020463466644287,
"learning_rate": 4.075691141498217e-05,
"loss": 2.8442,
"step": 99500
},
{
"epoch": 3.7158145065398336,
"grad_norm": 6.808725357055664,
"learning_rate": 4.071046373365042e-05,
"loss": 2.8432,
"step": 100000
},
{
"epoch": 3.734393579072533,
"grad_norm": 6.207636833190918,
"learning_rate": 4.066401605231867e-05,
"loss": 2.859,
"step": 100500
},
{
"epoch": 3.7529726516052317,
"grad_norm": 6.0236616134643555,
"learning_rate": 4.061756837098693e-05,
"loss": 2.847,
"step": 101000
},
{
"epoch": 3.771551724137931,
"grad_norm": 5.8015241622924805,
"learning_rate": 4.057112068965517e-05,
"loss": 2.8239,
"step": 101500
},
{
"epoch": 3.7901307966706304,
"grad_norm": 6.354222297668457,
"learning_rate": 4.052467300832343e-05,
"loss": 2.8574,
"step": 102000
},
{
"epoch": 3.808709869203329,
"grad_norm": 6.587215900421143,
"learning_rate": 4.047822532699168e-05,
"loss": 2.8354,
"step": 102500
},
{
"epoch": 3.8272889417360285,
"grad_norm": 7.283754825592041,
"learning_rate": 4.043177764565993e-05,
"loss": 2.8218,
"step": 103000
},
{
"epoch": 3.845868014268728,
"grad_norm": 6.165238857269287,
"learning_rate": 4.0385329964328186e-05,
"loss": 2.841,
"step": 103500
},
{
"epoch": 3.8644470868014267,
"grad_norm": 6.120512008666992,
"learning_rate": 4.033888228299644e-05,
"loss": 2.8195,
"step": 104000
},
{
"epoch": 3.883026159334126,
"grad_norm": 6.8183698654174805,
"learning_rate": 4.029243460166469e-05,
"loss": 2.8044,
"step": 104500
},
{
"epoch": 3.9016052318668253,
"grad_norm": 5.847311973571777,
"learning_rate": 4.024598692033294e-05,
"loss": 2.8056,
"step": 105000
},
{
"epoch": 3.920184304399524,
"grad_norm": 7.423314571380615,
"learning_rate": 4.019953923900119e-05,
"loss": 2.8164,
"step": 105500
},
{
"epoch": 3.9387633769322234,
"grad_norm": 6.736742973327637,
"learning_rate": 4.015309155766944e-05,
"loss": 2.8099,
"step": 106000
},
{
"epoch": 3.9573424494649228,
"grad_norm": 6.3364644050598145,
"learning_rate": 4.0106643876337696e-05,
"loss": 2.8138,
"step": 106500
},
{
"epoch": 3.9759215219976216,
"grad_norm": 6.03089714050293,
"learning_rate": 4.006019619500595e-05,
"loss": 2.8161,
"step": 107000
},
{
"epoch": 3.994500594530321,
"grad_norm": 7.099618911743164,
"learning_rate": 4.00137485136742e-05,
"loss": 2.7899,
"step": 107500
},
{
"epoch": 4.01307966706302,
"grad_norm": 6.682999134063721,
"learning_rate": 3.9967300832342454e-05,
"loss": 2.7953,
"step": 108000
},
{
"epoch": 4.031658739595719,
"grad_norm": 5.30817985534668,
"learning_rate": 3.99208531510107e-05,
"loss": 2.7662,
"step": 108500
},
{
"epoch": 4.050237812128419,
"grad_norm": 6.152495384216309,
"learning_rate": 3.9874405469678955e-05,
"loss": 2.7628,
"step": 109000
},
{
"epoch": 4.068816884661118,
"grad_norm": 6.075979232788086,
"learning_rate": 3.9827957788347206e-05,
"loss": 2.7805,
"step": 109500
},
{
"epoch": 4.0873959571938165,
"grad_norm": 6.708266258239746,
"learning_rate": 3.9781510107015456e-05,
"loss": 2.7607,
"step": 110000
},
{
"epoch": 4.105975029726516,
"grad_norm": 6.425528049468994,
"learning_rate": 3.9735062425683714e-05,
"loss": 2.7738,
"step": 110500
},
{
"epoch": 4.124554102259215,
"grad_norm": 6.978008270263672,
"learning_rate": 3.9688614744351964e-05,
"loss": 2.7654,
"step": 111000
},
{
"epoch": 4.143133174791914,
"grad_norm": 6.780577182769775,
"learning_rate": 3.9642167063020215e-05,
"loss": 2.7632,
"step": 111500
},
{
"epoch": 4.161712247324614,
"grad_norm": 5.834601879119873,
"learning_rate": 3.9595719381688465e-05,
"loss": 2.7671,
"step": 112000
},
{
"epoch": 4.180291319857313,
"grad_norm": 7.513933181762695,
"learning_rate": 3.954927170035672e-05,
"loss": 2.7738,
"step": 112500
},
{
"epoch": 4.1988703923900115,
"grad_norm": 6.303833484649658,
"learning_rate": 3.950282401902497e-05,
"loss": 2.782,
"step": 113000
},
{
"epoch": 4.217449464922711,
"grad_norm": 5.807947158813477,
"learning_rate": 3.945637633769322e-05,
"loss": 2.7434,
"step": 113500
},
{
"epoch": 4.23602853745541,
"grad_norm": 6.926473617553711,
"learning_rate": 3.940992865636148e-05,
"loss": 2.7458,
"step": 114000
},
{
"epoch": 4.25460760998811,
"grad_norm": 6.764691352844238,
"learning_rate": 3.9363480975029724e-05,
"loss": 2.7357,
"step": 114500
},
{
"epoch": 4.273186682520809,
"grad_norm": 5.976272106170654,
"learning_rate": 3.931703329369798e-05,
"loss": 2.7473,
"step": 115000
},
{
"epoch": 4.2917657550535075,
"grad_norm": 6.0660810470581055,
"learning_rate": 3.927058561236623e-05,
"loss": 2.7387,
"step": 115500
},
{
"epoch": 4.310344827586207,
"grad_norm": 6.600549221038818,
"learning_rate": 3.922413793103448e-05,
"loss": 2.7409,
"step": 116000
},
{
"epoch": 4.328923900118906,
"grad_norm": 7.705731391906738,
"learning_rate": 3.917769024970274e-05,
"loss": 2.7408,
"step": 116500
},
{
"epoch": 4.347502972651605,
"grad_norm": 6.347229957580566,
"learning_rate": 3.913124256837099e-05,
"loss": 2.7517,
"step": 117000
},
{
"epoch": 4.366082045184305,
"grad_norm": 7.695369243621826,
"learning_rate": 3.908479488703924e-05,
"loss": 2.7443,
"step": 117500
},
{
"epoch": 4.384661117717004,
"grad_norm": 6.612791538238525,
"learning_rate": 3.903834720570749e-05,
"loss": 2.7378,
"step": 118000
},
{
"epoch": 4.4032401902497025,
"grad_norm": 6.125636577606201,
"learning_rate": 3.899189952437575e-05,
"loss": 2.7224,
"step": 118500
},
{
"epoch": 4.421819262782402,
"grad_norm": 6.215822696685791,
"learning_rate": 3.894545184304399e-05,
"loss": 2.7311,
"step": 119000
},
{
"epoch": 4.440398335315101,
"grad_norm": 6.436295032501221,
"learning_rate": 3.889900416171225e-05,
"loss": 2.722,
"step": 119500
},
{
"epoch": 4.4589774078478,
"grad_norm": 6.271787166595459,
"learning_rate": 3.88525564803805e-05,
"loss": 2.7421,
"step": 120000
},
{
"epoch": 4.4775564803805,
"grad_norm": 5.990880012512207,
"learning_rate": 3.880610879904875e-05,
"loss": 2.7284,
"step": 120500
},
{
"epoch": 4.4961355529131986,
"grad_norm": 6.567028999328613,
"learning_rate": 3.875966111771701e-05,
"loss": 2.7244,
"step": 121000
},
{
"epoch": 4.514714625445897,
"grad_norm": 6.399959087371826,
"learning_rate": 3.871321343638526e-05,
"loss": 2.7139,
"step": 121500
},
{
"epoch": 4.533293697978597,
"grad_norm": 6.813540458679199,
"learning_rate": 3.866676575505351e-05,
"loss": 2.7177,
"step": 122000
},
{
"epoch": 4.551872770511296,
"grad_norm": 6.263701438903809,
"learning_rate": 3.862031807372176e-05,
"loss": 2.7245,
"step": 122500
},
{
"epoch": 4.570451843043995,
"grad_norm": 6.281601428985596,
"learning_rate": 3.8573870392390017e-05,
"loss": 2.728,
"step": 123000
},
{
"epoch": 4.589030915576695,
"grad_norm": 6.198410511016846,
"learning_rate": 3.852742271105827e-05,
"loss": 2.7187,
"step": 123500
},
{
"epoch": 4.6076099881093935,
"grad_norm": 7.052980899810791,
"learning_rate": 3.848097502972652e-05,
"loss": 2.7294,
"step": 124000
},
{
"epoch": 4.626189060642092,
"grad_norm": 6.8353776931762695,
"learning_rate": 3.8434527348394775e-05,
"loss": 2.7366,
"step": 124500
},
{
"epoch": 4.644768133174792,
"grad_norm": 6.245896816253662,
"learning_rate": 3.838807966706302e-05,
"loss": 2.7061,
"step": 125000
},
{
"epoch": 4.663347205707491,
"grad_norm": 5.742074489593506,
"learning_rate": 3.8341631985731276e-05,
"loss": 2.7031,
"step": 125500
},
{
"epoch": 4.681926278240191,
"grad_norm": 6.55544376373291,
"learning_rate": 3.8295184304399526e-05,
"loss": 2.6768,
"step": 126000
},
{
"epoch": 4.70050535077289,
"grad_norm": 5.943970203399658,
"learning_rate": 3.824873662306778e-05,
"loss": 2.6986,
"step": 126500
},
{
"epoch": 4.719084423305588,
"grad_norm": 7.413682460784912,
"learning_rate": 3.8202288941736034e-05,
"loss": 2.7229,
"step": 127000
},
{
"epoch": 4.737663495838287,
"grad_norm": 7.258702278137207,
"learning_rate": 3.8155841260404285e-05,
"loss": 2.7006,
"step": 127500
},
{
"epoch": 4.756242568370987,
"grad_norm": 6.239523887634277,
"learning_rate": 3.8109393579072535e-05,
"loss": 2.7015,
"step": 128000
},
{
"epoch": 4.774821640903686,
"grad_norm": 6.031528949737549,
"learning_rate": 3.8062945897740786e-05,
"loss": 2.6794,
"step": 128500
},
{
"epoch": 4.793400713436386,
"grad_norm": 6.504217624664307,
"learning_rate": 3.801649821640904e-05,
"loss": 2.6756,
"step": 129000
},
{
"epoch": 4.8119797859690845,
"grad_norm": 6.261529922485352,
"learning_rate": 3.7970050535077287e-05,
"loss": 2.6825,
"step": 129500
},
{
"epoch": 4.830558858501783,
"grad_norm": 5.9492292404174805,
"learning_rate": 3.7923602853745544e-05,
"loss": 2.6894,
"step": 130000
},
{
"epoch": 4.849137931034483,
"grad_norm": 5.7504706382751465,
"learning_rate": 3.7877155172413794e-05,
"loss": 2.693,
"step": 130500
},
{
"epoch": 4.867717003567182,
"grad_norm": 6.545624256134033,
"learning_rate": 3.7830707491082045e-05,
"loss": 2.6888,
"step": 131000
},
{
"epoch": 4.886296076099881,
"grad_norm": 6.274423599243164,
"learning_rate": 3.77842598097503e-05,
"loss": 2.6884,
"step": 131500
},
{
"epoch": 4.904875148632581,
"grad_norm": 5.632358074188232,
"learning_rate": 3.773781212841855e-05,
"loss": 2.678,
"step": 132000
},
{
"epoch": 4.923454221165279,
"grad_norm": 6.883337020874023,
"learning_rate": 3.76913644470868e-05,
"loss": 2.677,
"step": 132500
},
{
"epoch": 4.942033293697978,
"grad_norm": 6.676144123077393,
"learning_rate": 3.7644916765755054e-05,
"loss": 2.6788,
"step": 133000
},
{
"epoch": 4.960612366230678,
"grad_norm": 8.354021072387695,
"learning_rate": 3.759846908442331e-05,
"loss": 2.6885,
"step": 133500
},
{
"epoch": 4.979191438763377,
"grad_norm": 6.048637866973877,
"learning_rate": 3.755202140309156e-05,
"loss": 2.6636,
"step": 134000
},
{
"epoch": 4.997770511296076,
"grad_norm": 5.709485054016113,
"learning_rate": 3.750557372175981e-05,
"loss": 2.6601,
"step": 134500
},
{
"epoch": 5.0163495838287755,
"grad_norm": 7.082876682281494,
"learning_rate": 3.745912604042807e-05,
"loss": 2.6477,
"step": 135000
},
{
"epoch": 5.034928656361474,
"grad_norm": 6.736342430114746,
"learning_rate": 3.741267835909631e-05,
"loss": 2.6357,
"step": 135500
},
{
"epoch": 5.053507728894173,
"grad_norm": 6.7299580574035645,
"learning_rate": 3.736623067776457e-05,
"loss": 2.6532,
"step": 136000
},
{
"epoch": 5.072086801426873,
"grad_norm": 6.488595008850098,
"learning_rate": 3.731978299643282e-05,
"loss": 2.6478,
"step": 136500
},
{
"epoch": 5.090665873959572,
"grad_norm": 6.1401262283325195,
"learning_rate": 3.727333531510107e-05,
"loss": 2.6271,
"step": 137000
},
{
"epoch": 5.109244946492271,
"grad_norm": 6.6415300369262695,
"learning_rate": 3.722688763376933e-05,
"loss": 2.6347,
"step": 137500
},
{
"epoch": 5.12782401902497,
"grad_norm": 6.715450286865234,
"learning_rate": 3.718043995243757e-05,
"loss": 2.6377,
"step": 138000
},
{
"epoch": 5.146403091557669,
"grad_norm": 6.399317741394043,
"learning_rate": 3.713399227110583e-05,
"loss": 2.6348,
"step": 138500
},
{
"epoch": 5.164982164090369,
"grad_norm": 7.233635902404785,
"learning_rate": 3.708754458977408e-05,
"loss": 2.6411,
"step": 139000
},
{
"epoch": 5.183561236623068,
"grad_norm": 6.6088433265686035,
"learning_rate": 3.704109690844233e-05,
"loss": 2.62,
"step": 139500
},
{
"epoch": 5.202140309155767,
"grad_norm": 6.2975287437438965,
"learning_rate": 3.699464922711058e-05,
"loss": 2.6337,
"step": 140000
},
{
"epoch": 5.2207193816884665,
"grad_norm": 5.73189115524292,
"learning_rate": 3.694820154577884e-05,
"loss": 2.6347,
"step": 140500
},
{
"epoch": 5.239298454221165,
"grad_norm": 6.38447904586792,
"learning_rate": 3.690175386444709e-05,
"loss": 2.6338,
"step": 141000
},
{
"epoch": 5.257877526753864,
"grad_norm": 6.772334098815918,
"learning_rate": 3.685530618311534e-05,
"loss": 2.6371,
"step": 141500
},
{
"epoch": 5.276456599286564,
"grad_norm": 6.404881477355957,
"learning_rate": 3.6808858501783596e-05,
"loss": 2.6223,
"step": 142000
},
{
"epoch": 5.295035671819263,
"grad_norm": 6.889057159423828,
"learning_rate": 3.676241082045184e-05,
"loss": 2.624,
"step": 142500
},
{
"epoch": 5.313614744351962,
"grad_norm": 6.435732841491699,
"learning_rate": 3.67159631391201e-05,
"loss": 2.6408,
"step": 143000
},
{
"epoch": 5.332193816884661,
"grad_norm": 6.9687418937683105,
"learning_rate": 3.666951545778835e-05,
"loss": 2.6239,
"step": 143500
},
{
"epoch": 5.35077288941736,
"grad_norm": 6.787994861602783,
"learning_rate": 3.66230677764566e-05,
"loss": 2.6285,
"step": 144000
},
{
"epoch": 5.369351961950059,
"grad_norm": 6.9550700187683105,
"learning_rate": 3.6576620095124856e-05,
"loss": 2.6173,
"step": 144500
},
{
"epoch": 5.387931034482759,
"grad_norm": 6.4159345626831055,
"learning_rate": 3.6530172413793106e-05,
"loss": 2.6186,
"step": 145000
},
{
"epoch": 5.406510107015458,
"grad_norm": 6.8777995109558105,
"learning_rate": 3.6483724732461357e-05,
"loss": 2.6239,
"step": 145500
},
{
"epoch": 5.425089179548157,
"grad_norm": 6.115660667419434,
"learning_rate": 3.643727705112961e-05,
"loss": 2.6243,
"step": 146000
},
{
"epoch": 5.443668252080856,
"grad_norm": 7.484089374542236,
"learning_rate": 3.6390829369797864e-05,
"loss": 2.6211,
"step": 146500
},
{
"epoch": 5.462247324613555,
"grad_norm": 6.411886692047119,
"learning_rate": 3.6344381688466115e-05,
"loss": 2.61,
"step": 147000
},
{
"epoch": 5.480826397146254,
"grad_norm": 6.482817649841309,
"learning_rate": 3.6297934007134365e-05,
"loss": 2.5962,
"step": 147500
},
{
"epoch": 5.499405469678954,
"grad_norm": 5.861370086669922,
"learning_rate": 3.625148632580262e-05,
"loss": 2.6081,
"step": 148000
},
{
"epoch": 5.517984542211653,
"grad_norm": 7.179725170135498,
"learning_rate": 3.6205038644470866e-05,
"loss": 2.6138,
"step": 148500
},
{
"epoch": 5.536563614744352,
"grad_norm": 6.607731819152832,
"learning_rate": 3.6158590963139124e-05,
"loss": 2.6335,
"step": 149000
},
{
"epoch": 5.555142687277051,
"grad_norm": 7.58914041519165,
"learning_rate": 3.6112143281807374e-05,
"loss": 2.6188,
"step": 149500
},
{
"epoch": 5.57372175980975,
"grad_norm": 6.815672397613525,
"learning_rate": 3.6065695600475625e-05,
"loss": 2.5999,
"step": 150000
},
{
"epoch": 5.592300832342449,
"grad_norm": 7.304187297821045,
"learning_rate": 3.6019247919143875e-05,
"loss": 2.61,
"step": 150500
},
{
"epoch": 5.610879904875149,
"grad_norm": 6.256832599639893,
"learning_rate": 3.597280023781213e-05,
"loss": 2.5896,
"step": 151000
},
{
"epoch": 5.629458977407848,
"grad_norm": 6.603561878204346,
"learning_rate": 3.592635255648038e-05,
"loss": 2.5937,
"step": 151500
},
{
"epoch": 5.648038049940547,
"grad_norm": 6.757023334503174,
"learning_rate": 3.587990487514863e-05,
"loss": 2.6102,
"step": 152000
},
{
"epoch": 5.666617122473246,
"grad_norm": 6.520168304443359,
"learning_rate": 3.583345719381689e-05,
"loss": 2.5915,
"step": 152500
},
{
"epoch": 5.685196195005945,
"grad_norm": 6.486233234405518,
"learning_rate": 3.5787009512485134e-05,
"loss": 2.5833,
"step": 153000
},
{
"epoch": 5.703775267538645,
"grad_norm": 5.79095458984375,
"learning_rate": 3.574056183115339e-05,
"loss": 2.5862,
"step": 153500
},
{
"epoch": 5.722354340071344,
"grad_norm": 6.390963077545166,
"learning_rate": 3.569411414982164e-05,
"loss": 2.5867,
"step": 154000
},
{
"epoch": 5.7409334126040426,
"grad_norm": 6.4793548583984375,
"learning_rate": 3.564766646848989e-05,
"loss": 2.5983,
"step": 154500
},
{
"epoch": 5.759512485136742,
"grad_norm": 6.3781585693359375,
"learning_rate": 3.560121878715815e-05,
"loss": 2.6013,
"step": 155000
},
{
"epoch": 5.778091557669441,
"grad_norm": 6.004998207092285,
"learning_rate": 3.55547711058264e-05,
"loss": 2.5908,
"step": 155500
},
{
"epoch": 5.79667063020214,
"grad_norm": 7.406827926635742,
"learning_rate": 3.550832342449465e-05,
"loss": 2.6126,
"step": 156000
},
{
"epoch": 5.81524970273484,
"grad_norm": 6.263004302978516,
"learning_rate": 3.54618757431629e-05,
"loss": 2.5957,
"step": 156500
},
{
"epoch": 5.833828775267539,
"grad_norm": 6.236379623413086,
"learning_rate": 3.541542806183116e-05,
"loss": 2.5819,
"step": 157000
},
{
"epoch": 5.8524078478002375,
"grad_norm": 7.537994861602783,
"learning_rate": 3.536898038049941e-05,
"loss": 2.574,
"step": 157500
},
{
"epoch": 5.870986920332937,
"grad_norm": 5.823127269744873,
"learning_rate": 3.532253269916766e-05,
"loss": 2.5702,
"step": 158000
},
{
"epoch": 5.889565992865636,
"grad_norm": 5.820526123046875,
"learning_rate": 3.527608501783592e-05,
"loss": 2.5799,
"step": 158500
},
{
"epoch": 5.908145065398335,
"grad_norm": 6.082313060760498,
"learning_rate": 3.522963733650416e-05,
"loss": 2.5835,
"step": 159000
},
{
"epoch": 5.926724137931035,
"grad_norm": 6.335425853729248,
"learning_rate": 3.518318965517242e-05,
"loss": 2.5823,
"step": 159500
},
{
"epoch": 5.945303210463734,
"grad_norm": 6.930739879608154,
"learning_rate": 3.513674197384067e-05,
"loss": 2.5829,
"step": 160000
},
{
"epoch": 5.963882282996433,
"grad_norm": 6.215325832366943,
"learning_rate": 3.509029429250892e-05,
"loss": 2.5851,
"step": 160500
},
{
"epoch": 5.982461355529132,
"grad_norm": 5.954530239105225,
"learning_rate": 3.504384661117717e-05,
"loss": 2.5843,
"step": 161000
},
{
"epoch": 6.001040428061831,
"grad_norm": 6.572493076324463,
"learning_rate": 3.4997398929845426e-05,
"loss": 2.5786,
"step": 161500
},
{
"epoch": 6.01961950059453,
"grad_norm": 6.725315093994141,
"learning_rate": 3.495095124851368e-05,
"loss": 2.5384,
"step": 162000
},
{
"epoch": 6.03819857312723,
"grad_norm": 6.982800483703613,
"learning_rate": 3.490450356718193e-05,
"loss": 2.5501,
"step": 162500
},
{
"epoch": 6.0567776456599285,
"grad_norm": 6.240696430206299,
"learning_rate": 3.4858055885850185e-05,
"loss": 2.5446,
"step": 163000
},
{
"epoch": 6.075356718192628,
"grad_norm": 6.301703453063965,
"learning_rate": 3.481160820451843e-05,
"loss": 2.5471,
"step": 163500
},
{
"epoch": 6.093935790725327,
"grad_norm": 7.478944778442383,
"learning_rate": 3.4765160523186686e-05,
"loss": 2.5473,
"step": 164000
},
{
"epoch": 6.112514863258026,
"grad_norm": 6.435521125793457,
"learning_rate": 3.4718712841854936e-05,
"loss": 2.5417,
"step": 164500
},
{
"epoch": 6.131093935790726,
"grad_norm": 7.630947589874268,
"learning_rate": 3.467226516052319e-05,
"loss": 2.5365,
"step": 165000
},
{
"epoch": 6.149673008323425,
"grad_norm": 7.021152973175049,
"learning_rate": 3.4625817479191444e-05,
"loss": 2.5493,
"step": 165500
},
{
"epoch": 6.168252080856123,
"grad_norm": 6.182572841644287,
"learning_rate": 3.457936979785969e-05,
"loss": 2.5435,
"step": 166000
},
{
"epoch": 6.186831153388823,
"grad_norm": 7.1868767738342285,
"learning_rate": 3.4532922116527945e-05,
"loss": 2.5461,
"step": 166500
},
{
"epoch": 6.205410225921522,
"grad_norm": 6.992275714874268,
"learning_rate": 3.4486474435196195e-05,
"loss": 2.5243,
"step": 167000
},
{
"epoch": 6.223989298454221,
"grad_norm": 6.819701194763184,
"learning_rate": 3.4440026753864446e-05,
"loss": 2.533,
"step": 167500
},
{
"epoch": 6.242568370986921,
"grad_norm": 7.018156051635742,
"learning_rate": 3.43935790725327e-05,
"loss": 2.5373,
"step": 168000
},
{
"epoch": 6.2611474435196195,
"grad_norm": 6.9675188064575195,
"learning_rate": 3.4347131391200954e-05,
"loss": 2.5354,
"step": 168500
},
{
"epoch": 6.279726516052318,
"grad_norm": 6.449595928192139,
"learning_rate": 3.4300683709869204e-05,
"loss": 2.5198,
"step": 169000
},
{
"epoch": 6.298305588585018,
"grad_norm": 6.839005470275879,
"learning_rate": 3.4254236028537455e-05,
"loss": 2.529,
"step": 169500
},
{
"epoch": 6.316884661117717,
"grad_norm": 6.704039096832275,
"learning_rate": 3.420778834720571e-05,
"loss": 2.5365,
"step": 170000
},
{
"epoch": 6.335463733650416,
"grad_norm": 6.419273853302002,
"learning_rate": 3.4161340665873956e-05,
"loss": 2.5441,
"step": 170500
},
{
"epoch": 6.354042806183116,
"grad_norm": 7.052849292755127,
"learning_rate": 3.411489298454221e-05,
"loss": 2.5277,
"step": 171000
},
{
"epoch": 6.372621878715814,
"grad_norm": 7.161109447479248,
"learning_rate": 3.4068445303210463e-05,
"loss": 2.522,
"step": 171500
},
{
"epoch": 6.391200951248513,
"grad_norm": 6.348012447357178,
"learning_rate": 3.4021997621878714e-05,
"loss": 2.524,
"step": 172000
},
{
"epoch": 6.409780023781213,
"grad_norm": 6.336347579956055,
"learning_rate": 3.397554994054697e-05,
"loss": 2.5222,
"step": 172500
},
{
"epoch": 6.428359096313912,
"grad_norm": 5.670961380004883,
"learning_rate": 3.392910225921522e-05,
"loss": 2.5421,
"step": 173000
},
{
"epoch": 6.446938168846611,
"grad_norm": 6.4542717933654785,
"learning_rate": 3.388265457788347e-05,
"loss": 2.5347,
"step": 173500
},
{
"epoch": 6.4655172413793105,
"grad_norm": 6.445559024810791,
"learning_rate": 3.383620689655172e-05,
"loss": 2.5161,
"step": 174000
},
{
"epoch": 6.484096313912009,
"grad_norm": 6.366390228271484,
"learning_rate": 3.378975921521998e-05,
"loss": 2.5254,
"step": 174500
},
{
"epoch": 6.502675386444709,
"grad_norm": 5.990453720092773,
"learning_rate": 3.374331153388823e-05,
"loss": 2.5291,
"step": 175000
},
{
"epoch": 6.521254458977408,
"grad_norm": 7.384641170501709,
"learning_rate": 3.369686385255648e-05,
"loss": 2.5273,
"step": 175500
},
{
"epoch": 6.539833531510107,
"grad_norm": 7.899537563323975,
"learning_rate": 3.365041617122474e-05,
"loss": 2.5217,
"step": 176000
},
{
"epoch": 6.558412604042807,
"grad_norm": 7.456251621246338,
"learning_rate": 3.360396848989298e-05,
"loss": 2.5183,
"step": 176500
},
{
"epoch": 6.576991676575505,
"grad_norm": 7.728662967681885,
"learning_rate": 3.355752080856124e-05,
"loss": 2.5242,
"step": 177000
},
{
"epoch": 6.595570749108204,
"grad_norm": 6.4795708656311035,
"learning_rate": 3.351107312722949e-05,
"loss": 2.5148,
"step": 177500
},
{
"epoch": 6.614149821640904,
"grad_norm": 6.569213390350342,
"learning_rate": 3.346462544589774e-05,
"loss": 2.513,
"step": 178000
},
{
"epoch": 6.632728894173603,
"grad_norm": 5.95412015914917,
"learning_rate": 3.3418177764566e-05,
"loss": 2.5141,
"step": 178500
},
{
"epoch": 6.651307966706302,
"grad_norm": 6.39993143081665,
"learning_rate": 3.337173008323425e-05,
"loss": 2.5209,
"step": 179000
},
{
"epoch": 6.6698870392390015,
"grad_norm": 6.558811664581299,
"learning_rate": 3.33252824019025e-05,
"loss": 2.5168,
"step": 179500
},
{
"epoch": 6.6884661117717,
"grad_norm": 6.443490982055664,
"learning_rate": 3.327883472057075e-05,
"loss": 2.5122,
"step": 180000
},
{
"epoch": 6.707045184304399,
"grad_norm": 6.475789546966553,
"learning_rate": 3.3232387039239006e-05,
"loss": 2.5265,
"step": 180500
},
{
"epoch": 6.725624256837099,
"grad_norm": 6.097142219543457,
"learning_rate": 3.318593935790725e-05,
"loss": 2.4953,
"step": 181000
},
{
"epoch": 6.744203329369798,
"grad_norm": 6.5415849685668945,
"learning_rate": 3.313949167657551e-05,
"loss": 2.504,
"step": 181500
},
{
"epoch": 6.762782401902497,
"grad_norm": 6.81630277633667,
"learning_rate": 3.309304399524376e-05,
"loss": 2.515,
"step": 182000
},
{
"epoch": 6.781361474435196,
"grad_norm": 6.129889488220215,
"learning_rate": 3.304659631391201e-05,
"loss": 2.4993,
"step": 182500
},
{
"epoch": 6.799940546967895,
"grad_norm": 6.960236072540283,
"learning_rate": 3.3000148632580265e-05,
"loss": 2.5146,
"step": 183000
},
{
"epoch": 6.818519619500594,
"grad_norm": 7.540809154510498,
"learning_rate": 3.2953700951248516e-05,
"loss": 2.4899,
"step": 183500
},
{
"epoch": 6.837098692033294,
"grad_norm": 6.699360370635986,
"learning_rate": 3.2907253269916766e-05,
"loss": 2.5032,
"step": 184000
},
{
"epoch": 6.855677764565993,
"grad_norm": 6.967233180999756,
"learning_rate": 3.286080558858502e-05,
"loss": 2.5079,
"step": 184500
},
{
"epoch": 6.874256837098692,
"grad_norm": 6.475770473480225,
"learning_rate": 3.2814357907253274e-05,
"loss": 2.5278,
"step": 185000
},
{
"epoch": 6.892835909631391,
"grad_norm": 7.317842483520508,
"learning_rate": 3.2767910225921525e-05,
"loss": 2.4923,
"step": 185500
},
{
"epoch": 6.91141498216409,
"grad_norm": 6.920095443725586,
"learning_rate": 3.2721462544589775e-05,
"loss": 2.5094,
"step": 186000
},
{
"epoch": 6.92999405469679,
"grad_norm": 6.825646877288818,
"learning_rate": 3.267501486325803e-05,
"loss": 2.4832,
"step": 186500
},
{
"epoch": 6.948573127229489,
"grad_norm": 7.122133255004883,
"learning_rate": 3.2628567181926276e-05,
"loss": 2.5033,
"step": 187000
},
{
"epoch": 6.967152199762188,
"grad_norm": 6.76582145690918,
"learning_rate": 3.2582119500594533e-05,
"loss": 2.4841,
"step": 187500
},
{
"epoch": 6.9857312722948866,
"grad_norm": 6.667826175689697,
"learning_rate": 3.2535671819262784e-05,
"loss": 2.5107,
"step": 188000
},
{
"epoch": 7.004310344827586,
"grad_norm": 6.997557640075684,
"learning_rate": 3.2489224137931034e-05,
"loss": 2.4932,
"step": 188500
},
{
"epoch": 7.022889417360285,
"grad_norm": 7.383888244628906,
"learning_rate": 3.244277645659929e-05,
"loss": 2.4599,
"step": 189000
},
{
"epoch": 7.041468489892985,
"grad_norm": 6.721455097198486,
"learning_rate": 3.239632877526754e-05,
"loss": 2.461,
"step": 189500
},
{
"epoch": 7.060047562425684,
"grad_norm": 5.605399131774902,
"learning_rate": 3.234988109393579e-05,
"loss": 2.4733,
"step": 190000
},
{
"epoch": 7.078626634958383,
"grad_norm": 6.753121852874756,
"learning_rate": 3.230343341260404e-05,
"loss": 2.4739,
"step": 190500
},
{
"epoch": 7.097205707491082,
"grad_norm": 6.254253387451172,
"learning_rate": 3.22569857312723e-05,
"loss": 2.4702,
"step": 191000
},
{
"epoch": 7.115784780023781,
"grad_norm": 7.067044258117676,
"learning_rate": 3.2210538049940544e-05,
"loss": 2.4565,
"step": 191500
},
{
"epoch": 7.13436385255648,
"grad_norm": 6.651601791381836,
"learning_rate": 3.21640903686088e-05,
"loss": 2.4591,
"step": 192000
},
{
"epoch": 7.15294292508918,
"grad_norm": 7.131402015686035,
"learning_rate": 3.211764268727705e-05,
"loss": 2.4699,
"step": 192500
},
{
"epoch": 7.171521997621879,
"grad_norm": 6.57224702835083,
"learning_rate": 3.20711950059453e-05,
"loss": 2.4652,
"step": 193000
},
{
"epoch": 7.190101070154578,
"grad_norm": 6.226948261260986,
"learning_rate": 3.202474732461356e-05,
"loss": 2.4553,
"step": 193500
},
{
"epoch": 7.208680142687277,
"grad_norm": 6.283173561096191,
"learning_rate": 3.197829964328181e-05,
"loss": 2.4666,
"step": 194000
},
{
"epoch": 7.227259215219976,
"grad_norm": 6.692994117736816,
"learning_rate": 3.193185196195006e-05,
"loss": 2.4634,
"step": 194500
},
{
"epoch": 7.245838287752675,
"grad_norm": 5.542157173156738,
"learning_rate": 3.188540428061831e-05,
"loss": 2.4483,
"step": 195000
},
{
"epoch": 7.264417360285375,
"grad_norm": 7.492745876312256,
"learning_rate": 3.183895659928656e-05,
"loss": 2.467,
"step": 195500
},
{
"epoch": 7.282996432818074,
"grad_norm": 6.997331619262695,
"learning_rate": 3.179250891795482e-05,
"loss": 2.4562,
"step": 196000
},
{
"epoch": 7.3015755053507725,
"grad_norm": 7.160475730895996,
"learning_rate": 3.174606123662307e-05,
"loss": 2.4645,
"step": 196500
},
{
"epoch": 7.320154577883472,
"grad_norm": 6.583847522735596,
"learning_rate": 3.169961355529132e-05,
"loss": 2.4591,
"step": 197000
},
{
"epoch": 7.338733650416171,
"grad_norm": 7.247707366943359,
"learning_rate": 3.165316587395957e-05,
"loss": 2.447,
"step": 197500
},
{
"epoch": 7.357312722948871,
"grad_norm": 6.818671226501465,
"learning_rate": 3.160671819262783e-05,
"loss": 2.4524,
"step": 198000
},
{
"epoch": 7.37589179548157,
"grad_norm": 6.533426284790039,
"learning_rate": 3.156027051129608e-05,
"loss": 2.4614,
"step": 198500
},
{
"epoch": 7.394470868014269,
"grad_norm": 6.117506504058838,
"learning_rate": 3.151382282996433e-05,
"loss": 2.4523,
"step": 199000
},
{
"epoch": 7.413049940546968,
"grad_norm": 6.545726776123047,
"learning_rate": 3.1467375148632586e-05,
"loss": 2.4413,
"step": 199500
},
{
"epoch": 7.431629013079667,
"grad_norm": 6.267510414123535,
"learning_rate": 3.142092746730083e-05,
"loss": 2.4626,
"step": 200000
},
{
"epoch": 7.450208085612366,
"grad_norm": 6.45424222946167,
"learning_rate": 3.137447978596909e-05,
"loss": 2.4525,
"step": 200500
},
{
"epoch": 7.468787158145066,
"grad_norm": 6.826467990875244,
"learning_rate": 3.132803210463734e-05,
"loss": 2.4535,
"step": 201000
},
{
"epoch": 7.487366230677765,
"grad_norm": 6.419857501983643,
"learning_rate": 3.128158442330559e-05,
"loss": 2.4567,
"step": 201500
},
{
"epoch": 7.5059453032104635,
"grad_norm": 7.912742614746094,
"learning_rate": 3.123513674197384e-05,
"loss": 2.4352,
"step": 202000
},
{
"epoch": 7.524524375743163,
"grad_norm": 6.15361213684082,
"learning_rate": 3.1188689060642096e-05,
"loss": 2.4525,
"step": 202500
},
{
"epoch": 7.543103448275862,
"grad_norm": 6.077796936035156,
"learning_rate": 3.1142241379310346e-05,
"loss": 2.4621,
"step": 203000
},
{
"epoch": 7.561682520808561,
"grad_norm": 6.890556335449219,
"learning_rate": 3.10957936979786e-05,
"loss": 2.4454,
"step": 203500
},
{
"epoch": 7.580261593341261,
"grad_norm": 7.002103328704834,
"learning_rate": 3.1049346016646854e-05,
"loss": 2.4737,
"step": 204000
},
{
"epoch": 7.59884066587396,
"grad_norm": 7.24050760269165,
"learning_rate": 3.10028983353151e-05,
"loss": 2.4369,
"step": 204500
},
{
"epoch": 7.617419738406658,
"grad_norm": 7.357000827789307,
"learning_rate": 3.0956450653983355e-05,
"loss": 2.4338,
"step": 205000
},
{
"epoch": 7.635998810939358,
"grad_norm": 6.06101131439209,
"learning_rate": 3.0910002972651605e-05,
"loss": 2.4378,
"step": 205500
},
{
"epoch": 7.654577883472057,
"grad_norm": 7.14568567276001,
"learning_rate": 3.0863555291319856e-05,
"loss": 2.4448,
"step": 206000
},
{
"epoch": 7.673156956004756,
"grad_norm": 6.747462272644043,
"learning_rate": 3.081710760998811e-05,
"loss": 2.4604,
"step": 206500
},
{
"epoch": 7.691736028537456,
"grad_norm": 7.445852756500244,
"learning_rate": 3.0770659928656364e-05,
"loss": 2.4454,
"step": 207000
},
{
"epoch": 7.7103151010701545,
"grad_norm": 6.196556568145752,
"learning_rate": 3.0724212247324614e-05,
"loss": 2.4378,
"step": 207500
},
{
"epoch": 7.728894173602853,
"grad_norm": 6.7122039794921875,
"learning_rate": 3.0677764565992865e-05,
"loss": 2.4286,
"step": 208000
},
{
"epoch": 7.747473246135553,
"grad_norm": 7.239169120788574,
"learning_rate": 3.063131688466112e-05,
"loss": 2.4459,
"step": 208500
},
{
"epoch": 7.766052318668252,
"grad_norm": 5.942273139953613,
"learning_rate": 3.058486920332937e-05,
"loss": 2.4554,
"step": 209000
},
{
"epoch": 7.784631391200952,
"grad_norm": 6.494337558746338,
"learning_rate": 3.053842152199762e-05,
"loss": 2.4404,
"step": 209500
},
{
"epoch": 7.803210463733651,
"grad_norm": 7.0354084968566895,
"learning_rate": 3.0491973840665877e-05,
"loss": 2.4454,
"step": 210000
},
{
"epoch": 7.821789536266349,
"grad_norm": 6.828258037567139,
"learning_rate": 3.0445526159334127e-05,
"loss": 2.4325,
"step": 210500
},
{
"epoch": 7.840368608799048,
"grad_norm": 7.0825724601745605,
"learning_rate": 3.039907847800238e-05,
"loss": 2.4289,
"step": 211000
},
{
"epoch": 7.858947681331748,
"grad_norm": 7.346935749053955,
"learning_rate": 3.0352630796670635e-05,
"loss": 2.4548,
"step": 211500
},
{
"epoch": 7.877526753864447,
"grad_norm": 6.80324125289917,
"learning_rate": 3.0306183115338882e-05,
"loss": 2.4507,
"step": 212000
},
{
"epoch": 7.896105826397147,
"grad_norm": 6.606076717376709,
"learning_rate": 3.0259735434007136e-05,
"loss": 2.43,
"step": 212500
},
{
"epoch": 7.9146848989298455,
"grad_norm": 7.006173133850098,
"learning_rate": 3.021328775267539e-05,
"loss": 2.4376,
"step": 213000
},
{
"epoch": 7.933263971462544,
"grad_norm": 6.683685779571533,
"learning_rate": 3.0166840071343637e-05,
"loss": 2.4173,
"step": 213500
},
{
"epoch": 7.951843043995244,
"grad_norm": 7.217254161834717,
"learning_rate": 3.012039239001189e-05,
"loss": 2.4465,
"step": 214000
},
{
"epoch": 7.970422116527943,
"grad_norm": 6.2831573486328125,
"learning_rate": 3.0073944708680145e-05,
"loss": 2.4347,
"step": 214500
},
{
"epoch": 7.989001189060642,
"grad_norm": 7.447052955627441,
"learning_rate": 3.0027497027348395e-05,
"loss": 2.4346,
"step": 215000
},
{
"epoch": 8.007580261593342,
"grad_norm": 6.3113884925842285,
"learning_rate": 2.998104934601665e-05,
"loss": 2.4264,
"step": 215500
},
{
"epoch": 8.02615933412604,
"grad_norm": 7.349926948547363,
"learning_rate": 2.9934601664684903e-05,
"loss": 2.409,
"step": 216000
},
{
"epoch": 8.04473840665874,
"grad_norm": 7.6226959228515625,
"learning_rate": 2.988815398335315e-05,
"loss": 2.4047,
"step": 216500
},
{
"epoch": 8.063317479191438,
"grad_norm": 7.297638893127441,
"learning_rate": 2.9841706302021404e-05,
"loss": 2.4064,
"step": 217000
},
{
"epoch": 8.081896551724139,
"grad_norm": 6.703174114227295,
"learning_rate": 2.9795258620689658e-05,
"loss": 2.3948,
"step": 217500
},
{
"epoch": 8.100475624256838,
"grad_norm": 7.86271858215332,
"learning_rate": 2.974881093935791e-05,
"loss": 2.4033,
"step": 218000
},
{
"epoch": 8.119054696789537,
"grad_norm": 6.666792392730713,
"learning_rate": 2.9702363258026162e-05,
"loss": 2.4189,
"step": 218500
},
{
"epoch": 8.137633769322235,
"grad_norm": 7.112173557281494,
"learning_rate": 2.9655915576694416e-05,
"loss": 2.4139,
"step": 219000
},
{
"epoch": 8.156212841854934,
"grad_norm": 7.117358684539795,
"learning_rate": 2.9609467895362663e-05,
"loss": 2.4092,
"step": 219500
},
{
"epoch": 8.174791914387633,
"grad_norm": 5.946983337402344,
"learning_rate": 2.9563020214030917e-05,
"loss": 2.403,
"step": 220000
},
{
"epoch": 8.193370986920334,
"grad_norm": 6.8523030281066895,
"learning_rate": 2.951657253269917e-05,
"loss": 2.3906,
"step": 220500
},
{
"epoch": 8.211950059453033,
"grad_norm": 6.419975280761719,
"learning_rate": 2.947012485136742e-05,
"loss": 2.4058,
"step": 221000
},
{
"epoch": 8.230529131985731,
"grad_norm": 7.008522033691406,
"learning_rate": 2.9423677170035675e-05,
"loss": 2.3949,
"step": 221500
},
{
"epoch": 8.24910820451843,
"grad_norm": 6.398033618927002,
"learning_rate": 2.937722948870393e-05,
"loss": 2.3894,
"step": 222000
},
{
"epoch": 8.26768727705113,
"grad_norm": 6.911588668823242,
"learning_rate": 2.9330781807372176e-05,
"loss": 2.4026,
"step": 222500
},
{
"epoch": 8.286266349583828,
"grad_norm": 6.391911029815674,
"learning_rate": 2.928433412604043e-05,
"loss": 2.388,
"step": 223000
},
{
"epoch": 8.304845422116529,
"grad_norm": 6.878973484039307,
"learning_rate": 2.9237886444708684e-05,
"loss": 2.3964,
"step": 223500
},
{
"epoch": 8.323424494649228,
"grad_norm": 6.681369781494141,
"learning_rate": 2.919143876337693e-05,
"loss": 2.4049,
"step": 224000
},
{
"epoch": 8.342003567181926,
"grad_norm": 6.652570724487305,
"learning_rate": 2.9144991082045185e-05,
"loss": 2.3935,
"step": 224500
},
{
"epoch": 8.360582639714625,
"grad_norm": 6.757369041442871,
"learning_rate": 2.909854340071344e-05,
"loss": 2.4024,
"step": 225000
},
{
"epoch": 8.379161712247324,
"grad_norm": 7.730061054229736,
"learning_rate": 2.905209571938169e-05,
"loss": 2.3815,
"step": 225500
},
{
"epoch": 8.397740784780023,
"grad_norm": 7.044541358947754,
"learning_rate": 2.9005648038049943e-05,
"loss": 2.3872,
"step": 226000
},
{
"epoch": 8.416319857312724,
"grad_norm": 7.104819297790527,
"learning_rate": 2.895920035671819e-05,
"loss": 2.3888,
"step": 226500
},
{
"epoch": 8.434898929845422,
"grad_norm": 6.207997798919678,
"learning_rate": 2.8912752675386444e-05,
"loss": 2.3901,
"step": 227000
},
{
"epoch": 8.453478002378121,
"grad_norm": 6.412841796875,
"learning_rate": 2.8866304994054698e-05,
"loss": 2.404,
"step": 227500
},
{
"epoch": 8.47205707491082,
"grad_norm": 7.31563663482666,
"learning_rate": 2.881985731272295e-05,
"loss": 2.3952,
"step": 228000
},
{
"epoch": 8.490636147443519,
"grad_norm": 6.783107757568359,
"learning_rate": 2.8773409631391203e-05,
"loss": 2.369,
"step": 228500
},
{
"epoch": 8.50921521997622,
"grad_norm": 7.456410884857178,
"learning_rate": 2.8726961950059456e-05,
"loss": 2.3955,
"step": 229000
},
{
"epoch": 8.527794292508919,
"grad_norm": 6.817208766937256,
"learning_rate": 2.8680514268727704e-05,
"loss": 2.3777,
"step": 229500
},
{
"epoch": 8.546373365041617,
"grad_norm": 6.829710483551025,
"learning_rate": 2.8634066587395957e-05,
"loss": 2.3803,
"step": 230000
},
{
"epoch": 8.564952437574316,
"grad_norm": 6.171419620513916,
"learning_rate": 2.858761890606421e-05,
"loss": 2.3867,
"step": 230500
},
{
"epoch": 8.583531510107015,
"grad_norm": 7.179515361785889,
"learning_rate": 2.8541171224732462e-05,
"loss": 2.3819,
"step": 231000
},
{
"epoch": 8.602110582639714,
"grad_norm": 7.424422264099121,
"learning_rate": 2.8494723543400716e-05,
"loss": 2.3892,
"step": 231500
},
{
"epoch": 8.620689655172415,
"grad_norm": 6.56906795501709,
"learning_rate": 2.844827586206897e-05,
"loss": 2.3875,
"step": 232000
},
{
"epoch": 8.639268727705113,
"grad_norm": 5.986749649047852,
"learning_rate": 2.8401828180737217e-05,
"loss": 2.3881,
"step": 232500
},
{
"epoch": 8.657847800237812,
"grad_norm": 7.885437965393066,
"learning_rate": 2.835538049940547e-05,
"loss": 2.3898,
"step": 233000
},
{
"epoch": 8.676426872770511,
"grad_norm": 8.217313766479492,
"learning_rate": 2.8308932818073724e-05,
"loss": 2.3853,
"step": 233500
},
{
"epoch": 8.69500594530321,
"grad_norm": 7.467879295349121,
"learning_rate": 2.826248513674197e-05,
"loss": 2.3894,
"step": 234000
},
{
"epoch": 8.713585017835909,
"grad_norm": 6.856407642364502,
"learning_rate": 2.8216037455410225e-05,
"loss": 2.3884,
"step": 234500
},
{
"epoch": 8.73216409036861,
"grad_norm": 7.717813014984131,
"learning_rate": 2.816958977407848e-05,
"loss": 2.3735,
"step": 235000
},
{
"epoch": 8.750743162901308,
"grad_norm": 6.215982913970947,
"learning_rate": 2.812314209274673e-05,
"loss": 2.3704,
"step": 235500
},
{
"epoch": 8.769322235434007,
"grad_norm": 5.821375370025635,
"learning_rate": 2.8076694411414984e-05,
"loss": 2.3821,
"step": 236000
},
{
"epoch": 8.787901307966706,
"grad_norm": 5.752195358276367,
"learning_rate": 2.8030246730083238e-05,
"loss": 2.362,
"step": 236500
},
{
"epoch": 8.806480380499405,
"grad_norm": 7.1153693199157715,
"learning_rate": 2.7983799048751485e-05,
"loss": 2.3804,
"step": 237000
},
{
"epoch": 8.825059453032104,
"grad_norm": 7.165075302124023,
"learning_rate": 2.793735136741974e-05,
"loss": 2.3749,
"step": 237500
},
{
"epoch": 8.843638525564804,
"grad_norm": 7.609332084655762,
"learning_rate": 2.7890903686087992e-05,
"loss": 2.3783,
"step": 238000
},
{
"epoch": 8.862217598097503,
"grad_norm": 7.269701957702637,
"learning_rate": 2.7844456004756243e-05,
"loss": 2.3612,
"step": 238500
},
{
"epoch": 8.880796670630202,
"grad_norm": 6.229999542236328,
"learning_rate": 2.7798008323424497e-05,
"loss": 2.3674,
"step": 239000
},
{
"epoch": 8.899375743162901,
"grad_norm": 6.712778568267822,
"learning_rate": 2.775156064209275e-05,
"loss": 2.4045,
"step": 239500
},
{
"epoch": 8.9179548156956,
"grad_norm": 6.752030372619629,
"learning_rate": 2.7705112960760998e-05,
"loss": 2.3665,
"step": 240000
},
{
"epoch": 8.9365338882283,
"grad_norm": 7.107761383056641,
"learning_rate": 2.765866527942925e-05,
"loss": 2.3757,
"step": 240500
},
{
"epoch": 8.955112960761,
"grad_norm": 6.4916300773620605,
"learning_rate": 2.7612217598097506e-05,
"loss": 2.364,
"step": 241000
},
{
"epoch": 8.973692033293698,
"grad_norm": 6.902660846710205,
"learning_rate": 2.7565769916765756e-05,
"loss": 2.3945,
"step": 241500
},
{
"epoch": 8.992271105826397,
"grad_norm": 6.676261901855469,
"learning_rate": 2.751932223543401e-05,
"loss": 2.368,
"step": 242000
},
{
"epoch": 9.010850178359096,
"grad_norm": 7.0637125968933105,
"learning_rate": 2.7472874554102264e-05,
"loss": 2.3526,
"step": 242500
},
{
"epoch": 9.029429250891795,
"grad_norm": 6.886041164398193,
"learning_rate": 2.742642687277051e-05,
"loss": 2.3409,
"step": 243000
},
{
"epoch": 9.048008323424495,
"grad_norm": 6.17530632019043,
"learning_rate": 2.7379979191438765e-05,
"loss": 2.3368,
"step": 243500
},
{
"epoch": 9.066587395957194,
"grad_norm": 6.835616588592529,
"learning_rate": 2.733353151010702e-05,
"loss": 2.3521,
"step": 244000
},
{
"epoch": 9.085166468489893,
"grad_norm": 7.837756156921387,
"learning_rate": 2.7287083828775266e-05,
"loss": 2.3517,
"step": 244500
},
{
"epoch": 9.103745541022592,
"grad_norm": 7.3295793533325195,
"learning_rate": 2.724063614744352e-05,
"loss": 2.3351,
"step": 245000
},
{
"epoch": 9.122324613555291,
"grad_norm": 6.278160095214844,
"learning_rate": 2.7194188466111774e-05,
"loss": 2.3544,
"step": 245500
},
{
"epoch": 9.14090368608799,
"grad_norm": 6.8166823387146,
"learning_rate": 2.7147740784780024e-05,
"loss": 2.3562,
"step": 246000
},
{
"epoch": 9.15948275862069,
"grad_norm": 6.9190473556518555,
"learning_rate": 2.7101293103448278e-05,
"loss": 2.3707,
"step": 246500
},
{
"epoch": 9.17806183115339,
"grad_norm": 8.471137046813965,
"learning_rate": 2.7054845422116532e-05,
"loss": 2.3348,
"step": 247000
},
{
"epoch": 9.196640903686088,
"grad_norm": 7.1549553871154785,
"learning_rate": 2.700839774078478e-05,
"loss": 2.3482,
"step": 247500
},
{
"epoch": 9.215219976218787,
"grad_norm": 7.972681999206543,
"learning_rate": 2.6961950059453033e-05,
"loss": 2.3327,
"step": 248000
},
{
"epoch": 9.233799048751486,
"grad_norm": 6.290485858917236,
"learning_rate": 2.6915502378121287e-05,
"loss": 2.3344,
"step": 248500
},
{
"epoch": 9.252378121284185,
"grad_norm": 7.835150718688965,
"learning_rate": 2.6869054696789537e-05,
"loss": 2.3523,
"step": 249000
},
{
"epoch": 9.270957193816885,
"grad_norm": 6.171538829803467,
"learning_rate": 2.682260701545779e-05,
"loss": 2.3439,
"step": 249500
},
{
"epoch": 9.289536266349584,
"grad_norm": 6.854957580566406,
"learning_rate": 2.6776159334126045e-05,
"loss": 2.348,
"step": 250000
},
{
"epoch": 9.308115338882283,
"grad_norm": 6.949794769287109,
"learning_rate": 2.6729711652794292e-05,
"loss": 2.3416,
"step": 250500
},
{
"epoch": 9.326694411414982,
"grad_norm": 7.924169540405273,
"learning_rate": 2.6683263971462546e-05,
"loss": 2.341,
"step": 251000
},
{
"epoch": 9.34527348394768,
"grad_norm": 6.802456378936768,
"learning_rate": 2.66368162901308e-05,
"loss": 2.3373,
"step": 251500
},
{
"epoch": 9.363852556480381,
"grad_norm": 5.974133491516113,
"learning_rate": 2.659036860879905e-05,
"loss": 2.3447,
"step": 252000
},
{
"epoch": 9.38243162901308,
"grad_norm": 7.3315277099609375,
"learning_rate": 2.6543920927467304e-05,
"loss": 2.345,
"step": 252500
},
{
"epoch": 9.40101070154578,
"grad_norm": 7.01455020904541,
"learning_rate": 2.6497473246135558e-05,
"loss": 2.3354,
"step": 253000
},
{
"epoch": 9.419589774078478,
"grad_norm": 6.553669452667236,
"learning_rate": 2.6451025564803805e-05,
"loss": 2.3505,
"step": 253500
},
{
"epoch": 9.438168846611177,
"grad_norm": 7.384204387664795,
"learning_rate": 2.640457788347206e-05,
"loss": 2.3406,
"step": 254000
},
{
"epoch": 9.456747919143876,
"grad_norm": 7.899343490600586,
"learning_rate": 2.6358130202140313e-05,
"loss": 2.3534,
"step": 254500
},
{
"epoch": 9.475326991676576,
"grad_norm": 6.718962669372559,
"learning_rate": 2.631168252080856e-05,
"loss": 2.3447,
"step": 255000
},
{
"epoch": 9.493906064209275,
"grad_norm": 7.7100629806518555,
"learning_rate": 2.6265234839476814e-05,
"loss": 2.3378,
"step": 255500
},
{
"epoch": 9.512485136741974,
"grad_norm": 6.307003974914551,
"learning_rate": 2.6218787158145064e-05,
"loss": 2.3673,
"step": 256000
},
{
"epoch": 9.531064209274673,
"grad_norm": 6.968733787536621,
"learning_rate": 2.6172339476813318e-05,
"loss": 2.3502,
"step": 256500
},
{
"epoch": 9.549643281807372,
"grad_norm": 7.223754405975342,
"learning_rate": 2.6125891795481572e-05,
"loss": 2.3397,
"step": 257000
},
{
"epoch": 9.56822235434007,
"grad_norm": 7.984851360321045,
"learning_rate": 2.607944411414982e-05,
"loss": 2.3394,
"step": 257500
},
{
"epoch": 9.586801426872771,
"grad_norm": 6.745290279388428,
"learning_rate": 2.6032996432818073e-05,
"loss": 2.357,
"step": 258000
},
{
"epoch": 9.60538049940547,
"grad_norm": 6.241764068603516,
"learning_rate": 2.5986548751486327e-05,
"loss": 2.3294,
"step": 258500
},
{
"epoch": 9.623959571938169,
"grad_norm": 6.849953651428223,
"learning_rate": 2.5940101070154577e-05,
"loss": 2.319,
"step": 259000
},
{
"epoch": 9.642538644470868,
"grad_norm": 6.786033630371094,
"learning_rate": 2.589365338882283e-05,
"loss": 2.3381,
"step": 259500
},
{
"epoch": 9.661117717003567,
"grad_norm": 6.5294952392578125,
"learning_rate": 2.5847205707491085e-05,
"loss": 2.3292,
"step": 260000
},
{
"epoch": 9.679696789536266,
"grad_norm": 6.852995872497559,
"learning_rate": 2.5800758026159332e-05,
"loss": 2.3513,
"step": 260500
},
{
"epoch": 9.698275862068966,
"grad_norm": 7.107331275939941,
"learning_rate": 2.5754310344827586e-05,
"loss": 2.342,
"step": 261000
},
{
"epoch": 9.716854934601665,
"grad_norm": 6.497838020324707,
"learning_rate": 2.570786266349584e-05,
"loss": 2.3518,
"step": 261500
},
{
"epoch": 9.735434007134364,
"grad_norm": 7.103449821472168,
"learning_rate": 2.566141498216409e-05,
"loss": 2.3243,
"step": 262000
},
{
"epoch": 9.754013079667063,
"grad_norm": 6.207728862762451,
"learning_rate": 2.5614967300832344e-05,
"loss": 2.3295,
"step": 262500
},
{
"epoch": 9.772592152199762,
"grad_norm": 6.938514232635498,
"learning_rate": 2.55685196195006e-05,
"loss": 2.3378,
"step": 263000
},
{
"epoch": 9.791171224732462,
"grad_norm": 8.32728385925293,
"learning_rate": 2.5522071938168845e-05,
"loss": 2.3397,
"step": 263500
},
{
"epoch": 9.809750297265161,
"grad_norm": 7.170902729034424,
"learning_rate": 2.54756242568371e-05,
"loss": 2.3152,
"step": 264000
},
{
"epoch": 9.82832936979786,
"grad_norm": 6.303475856781006,
"learning_rate": 2.5429176575505353e-05,
"loss": 2.335,
"step": 264500
},
{
"epoch": 9.846908442330559,
"grad_norm": 6.429758548736572,
"learning_rate": 2.53827288941736e-05,
"loss": 2.3193,
"step": 265000
},
{
"epoch": 9.865487514863258,
"grad_norm": 7.365509986877441,
"learning_rate": 2.5336281212841854e-05,
"loss": 2.3291,
"step": 265500
},
{
"epoch": 9.884066587395957,
"grad_norm": 6.403247356414795,
"learning_rate": 2.528983353151011e-05,
"loss": 2.3289,
"step": 266000
},
{
"epoch": 9.902645659928655,
"grad_norm": 6.402617454528809,
"learning_rate": 2.524338585017836e-05,
"loss": 2.3383,
"step": 266500
},
{
"epoch": 9.921224732461356,
"grad_norm": 8.039521217346191,
"learning_rate": 2.5196938168846612e-05,
"loss": 2.3535,
"step": 267000
},
{
"epoch": 9.939803804994055,
"grad_norm": 6.797732830047607,
"learning_rate": 2.5150490487514866e-05,
"loss": 2.3102,
"step": 267500
},
{
"epoch": 9.958382877526754,
"grad_norm": 6.878042221069336,
"learning_rate": 2.5104042806183113e-05,
"loss": 2.3252,
"step": 268000
},
{
"epoch": 9.976961950059453,
"grad_norm": 7.837581634521484,
"learning_rate": 2.5057595124851367e-05,
"loss": 2.3259,
"step": 268500
},
{
"epoch": 9.995541022592151,
"grad_norm": 7.878035545349121,
"learning_rate": 2.501114744351962e-05,
"loss": 2.3206,
"step": 269000
},
{
"epoch": 10.014120095124852,
"grad_norm": 7.06614875793457,
"learning_rate": 2.4964699762187875e-05,
"loss": 2.3031,
"step": 269500
},
{
"epoch": 10.032699167657551,
"grad_norm": 6.305147647857666,
"learning_rate": 2.4918252080856126e-05,
"loss": 2.2958,
"step": 270000
},
{
"epoch": 10.05127824019025,
"grad_norm": 7.321694374084473,
"learning_rate": 2.4871804399524376e-05,
"loss": 2.3102,
"step": 270500
},
{
"epoch": 10.069857312722949,
"grad_norm": 6.2910356521606445,
"learning_rate": 2.482535671819263e-05,
"loss": 2.3087,
"step": 271000
},
{
"epoch": 10.088436385255648,
"grad_norm": 6.352067470550537,
"learning_rate": 2.477890903686088e-05,
"loss": 2.2997,
"step": 271500
},
{
"epoch": 10.107015457788346,
"grad_norm": 7.583943843841553,
"learning_rate": 2.473246135552913e-05,
"loss": 2.2976,
"step": 272000
},
{
"epoch": 10.125594530321047,
"grad_norm": 6.128369331359863,
"learning_rate": 2.4686013674197385e-05,
"loss": 2.3184,
"step": 272500
},
{
"epoch": 10.144173602853746,
"grad_norm": 7.117658615112305,
"learning_rate": 2.463956599286564e-05,
"loss": 2.297,
"step": 273000
},
{
"epoch": 10.162752675386445,
"grad_norm": 6.37664270401001,
"learning_rate": 2.459311831153389e-05,
"loss": 2.3054,
"step": 273500
},
{
"epoch": 10.181331747919144,
"grad_norm": 8.254295349121094,
"learning_rate": 2.4546670630202143e-05,
"loss": 2.2856,
"step": 274000
},
{
"epoch": 10.199910820451842,
"grad_norm": 7.399996757507324,
"learning_rate": 2.4500222948870394e-05,
"loss": 2.3191,
"step": 274500
},
{
"epoch": 10.218489892984541,
"grad_norm": 7.4784464836120605,
"learning_rate": 2.4453775267538644e-05,
"loss": 2.2994,
"step": 275000
},
{
"epoch": 10.237068965517242,
"grad_norm": 7.332183837890625,
"learning_rate": 2.4407327586206898e-05,
"loss": 2.3022,
"step": 275500
},
{
"epoch": 10.25564803804994,
"grad_norm": 6.316469192504883,
"learning_rate": 2.4360879904875152e-05,
"loss": 2.306,
"step": 276000
},
{
"epoch": 10.27422711058264,
"grad_norm": 7.272724628448486,
"learning_rate": 2.4314432223543402e-05,
"loss": 2.293,
"step": 276500
},
{
"epoch": 10.292806183115339,
"grad_norm": 7.283202171325684,
"learning_rate": 2.4267984542211656e-05,
"loss": 2.3086,
"step": 277000
},
{
"epoch": 10.311385255648037,
"grad_norm": 6.357330799102783,
"learning_rate": 2.4221536860879907e-05,
"loss": 2.2958,
"step": 277500
},
{
"epoch": 10.329964328180738,
"grad_norm": 6.361136436462402,
"learning_rate": 2.4175089179548157e-05,
"loss": 2.2856,
"step": 278000
},
{
"epoch": 10.348543400713437,
"grad_norm": 7.32297420501709,
"learning_rate": 2.4128641498216408e-05,
"loss": 2.2904,
"step": 278500
},
{
"epoch": 10.367122473246136,
"grad_norm": 7.6246161460876465,
"learning_rate": 2.408219381688466e-05,
"loss": 2.2872,
"step": 279000
},
{
"epoch": 10.385701545778835,
"grad_norm": 6.27332067489624,
"learning_rate": 2.4035746135552915e-05,
"loss": 2.2829,
"step": 279500
},
{
"epoch": 10.404280618311534,
"grad_norm": 7.062289714813232,
"learning_rate": 2.3989298454221166e-05,
"loss": 2.2938,
"step": 280000
},
{
"epoch": 10.422859690844232,
"grad_norm": 8.132457733154297,
"learning_rate": 2.394285077288942e-05,
"loss": 2.2994,
"step": 280500
},
{
"epoch": 10.441438763376933,
"grad_norm": 6.456370830535889,
"learning_rate": 2.389640309155767e-05,
"loss": 2.2845,
"step": 281000
},
{
"epoch": 10.460017835909632,
"grad_norm": 8.033242225646973,
"learning_rate": 2.384995541022592e-05,
"loss": 2.2907,
"step": 281500
},
{
"epoch": 10.47859690844233,
"grad_norm": 7.318391799926758,
"learning_rate": 2.3803507728894175e-05,
"loss": 2.288,
"step": 282000
},
{
"epoch": 10.49717598097503,
"grad_norm": 6.92618465423584,
"learning_rate": 2.3757060047562425e-05,
"loss": 2.2875,
"step": 282500
},
{
"epoch": 10.515755053507728,
"grad_norm": 6.721688747406006,
"learning_rate": 2.371061236623068e-05,
"loss": 2.295,
"step": 283000
},
{
"epoch": 10.534334126040427,
"grad_norm": 7.079250335693359,
"learning_rate": 2.3664164684898933e-05,
"loss": 2.2806,
"step": 283500
},
{
"epoch": 10.552913198573128,
"grad_norm": 7.229697227478027,
"learning_rate": 2.3617717003567183e-05,
"loss": 2.2828,
"step": 284000
},
{
"epoch": 10.571492271105827,
"grad_norm": 6.85770845413208,
"learning_rate": 2.3571269322235434e-05,
"loss": 2.3038,
"step": 284500
},
{
"epoch": 10.590071343638526,
"grad_norm": 7.07368803024292,
"learning_rate": 2.3524821640903688e-05,
"loss": 2.2918,
"step": 285000
},
{
"epoch": 10.608650416171225,
"grad_norm": 7.446401119232178,
"learning_rate": 2.3478373959571938e-05,
"loss": 2.3097,
"step": 285500
},
{
"epoch": 10.627229488703923,
"grad_norm": 7.388403415679932,
"learning_rate": 2.3431926278240192e-05,
"loss": 2.2753,
"step": 286000
},
{
"epoch": 10.645808561236624,
"grad_norm": 7.510107517242432,
"learning_rate": 2.3385478596908446e-05,
"loss": 2.2592,
"step": 286500
},
{
"epoch": 10.664387633769323,
"grad_norm": 6.856348514556885,
"learning_rate": 2.3339030915576697e-05,
"loss": 2.3018,
"step": 287000
},
{
"epoch": 10.682966706302022,
"grad_norm": 5.952792644500732,
"learning_rate": 2.3292583234244947e-05,
"loss": 2.293,
"step": 287500
},
{
"epoch": 10.70154577883472,
"grad_norm": 6.156429290771484,
"learning_rate": 2.32461355529132e-05,
"loss": 2.2794,
"step": 288000
},
{
"epoch": 10.72012485136742,
"grad_norm": 7.464205741882324,
"learning_rate": 2.319968787158145e-05,
"loss": 2.2819,
"step": 288500
},
{
"epoch": 10.738703923900118,
"grad_norm": 6.248416423797607,
"learning_rate": 2.3153240190249702e-05,
"loss": 2.2841,
"step": 289000
},
{
"epoch": 10.757282996432817,
"grad_norm": 6.5093183517456055,
"learning_rate": 2.3106792508917956e-05,
"loss": 2.2974,
"step": 289500
},
{
"epoch": 10.775862068965518,
"grad_norm": 6.669436454772949,
"learning_rate": 2.306034482758621e-05,
"loss": 2.2823,
"step": 290000
},
{
"epoch": 10.794441141498217,
"grad_norm": 6.547306537628174,
"learning_rate": 2.301389714625446e-05,
"loss": 2.2783,
"step": 290500
},
{
"epoch": 10.813020214030916,
"grad_norm": 7.420673847198486,
"learning_rate": 2.2967449464922714e-05,
"loss": 2.2803,
"step": 291000
},
{
"epoch": 10.831599286563614,
"grad_norm": 7.08470344543457,
"learning_rate": 2.2921001783590965e-05,
"loss": 2.2897,
"step": 291500
},
{
"epoch": 10.850178359096313,
"grad_norm": 7.092275142669678,
"learning_rate": 2.2874554102259215e-05,
"loss": 2.2842,
"step": 292000
},
{
"epoch": 10.868757431629014,
"grad_norm": 6.814739227294922,
"learning_rate": 2.282810642092747e-05,
"loss": 2.2637,
"step": 292500
},
{
"epoch": 10.887336504161713,
"grad_norm": 6.778537750244141,
"learning_rate": 2.278165873959572e-05,
"loss": 2.2802,
"step": 293000
},
{
"epoch": 10.905915576694412,
"grad_norm": 6.529074668884277,
"learning_rate": 2.2735211058263973e-05,
"loss": 2.282,
"step": 293500
},
{
"epoch": 10.92449464922711,
"grad_norm": 7.486764430999756,
"learning_rate": 2.2688763376932224e-05,
"loss": 2.2964,
"step": 294000
},
{
"epoch": 10.94307372175981,
"grad_norm": 9.576150894165039,
"learning_rate": 2.2642315695600478e-05,
"loss": 2.2853,
"step": 294500
},
{
"epoch": 10.961652794292508,
"grad_norm": 7.3996429443359375,
"learning_rate": 2.2595868014268728e-05,
"loss": 2.278,
"step": 295000
},
{
"epoch": 10.980231866825209,
"grad_norm": 6.478265762329102,
"learning_rate": 2.254942033293698e-05,
"loss": 2.2857,
"step": 295500
},
{
"epoch": 10.998810939357908,
"grad_norm": 7.264919757843018,
"learning_rate": 2.2502972651605233e-05,
"loss": 2.2638,
"step": 296000
},
{
"epoch": 11.017390011890607,
"grad_norm": 6.449435234069824,
"learning_rate": 2.2456524970273486e-05,
"loss": 2.2538,
"step": 296500
},
{
"epoch": 11.035969084423305,
"grad_norm": 8.838685035705566,
"learning_rate": 2.2410077288941737e-05,
"loss": 2.2658,
"step": 297000
},
{
"epoch": 11.054548156956004,
"grad_norm": 7.12150764465332,
"learning_rate": 2.236362960760999e-05,
"loss": 2.2582,
"step": 297500
},
{
"epoch": 11.073127229488703,
"grad_norm": 7.337321758270264,
"learning_rate": 2.231718192627824e-05,
"loss": 2.2531,
"step": 298000
},
{
"epoch": 11.091706302021404,
"grad_norm": 7.290600776672363,
"learning_rate": 2.2270734244946492e-05,
"loss": 2.2607,
"step": 298500
},
{
"epoch": 11.110285374554103,
"grad_norm": 6.834112644195557,
"learning_rate": 2.2224286563614746e-05,
"loss": 2.2593,
"step": 299000
},
{
"epoch": 11.128864447086801,
"grad_norm": 7.174058437347412,
"learning_rate": 2.2177838882282996e-05,
"loss": 2.2584,
"step": 299500
},
{
"epoch": 11.1474435196195,
"grad_norm": 6.08710241317749,
"learning_rate": 2.213139120095125e-05,
"loss": 2.2572,
"step": 300000
},
{
"epoch": 11.1660225921522,
"grad_norm": 7.66245174407959,
"learning_rate": 2.2084943519619504e-05,
"loss": 2.2597,
"step": 300500
},
{
"epoch": 11.1846016646849,
"grad_norm": 6.607715606689453,
"learning_rate": 2.2038495838287754e-05,
"loss": 2.2383,
"step": 301000
},
{
"epoch": 11.203180737217599,
"grad_norm": 6.562816143035889,
"learning_rate": 2.1992048156956005e-05,
"loss": 2.2497,
"step": 301500
},
{
"epoch": 11.221759809750298,
"grad_norm": 6.655299186706543,
"learning_rate": 2.194560047562426e-05,
"loss": 2.2628,
"step": 302000
},
{
"epoch": 11.240338882282996,
"grad_norm": 6.629017353057861,
"learning_rate": 2.189915279429251e-05,
"loss": 2.2568,
"step": 302500
},
{
"epoch": 11.258917954815695,
"grad_norm": 7.567939281463623,
"learning_rate": 2.185270511296076e-05,
"loss": 2.2732,
"step": 303000
},
{
"epoch": 11.277497027348394,
"grad_norm": 8.384344100952148,
"learning_rate": 2.1806257431629014e-05,
"loss": 2.2702,
"step": 303500
},
{
"epoch": 11.296076099881095,
"grad_norm": 7.6042914390563965,
"learning_rate": 2.1759809750297268e-05,
"loss": 2.2671,
"step": 304000
},
{
"epoch": 11.314655172413794,
"grad_norm": 6.45172643661499,
"learning_rate": 2.1713362068965518e-05,
"loss": 2.2531,
"step": 304500
},
{
"epoch": 11.333234244946492,
"grad_norm": 6.863234519958496,
"learning_rate": 2.1666914387633772e-05,
"loss": 2.2627,
"step": 305000
},
{
"epoch": 11.351813317479191,
"grad_norm": 8.442804336547852,
"learning_rate": 2.1620466706302022e-05,
"loss": 2.2551,
"step": 305500
},
{
"epoch": 11.37039239001189,
"grad_norm": 8.2174072265625,
"learning_rate": 2.1574019024970273e-05,
"loss": 2.2559,
"step": 306000
},
{
"epoch": 11.388971462544589,
"grad_norm": 7.479830265045166,
"learning_rate": 2.1527571343638527e-05,
"loss": 2.257,
"step": 306500
},
{
"epoch": 11.40755053507729,
"grad_norm": 6.4119744300842285,
"learning_rate": 2.148112366230678e-05,
"loss": 2.2577,
"step": 307000
},
{
"epoch": 11.426129607609989,
"grad_norm": 7.141465187072754,
"learning_rate": 2.143467598097503e-05,
"loss": 2.2583,
"step": 307500
},
{
"epoch": 11.444708680142687,
"grad_norm": 7.255865097045898,
"learning_rate": 2.138822829964328e-05,
"loss": 2.2549,
"step": 308000
},
{
"epoch": 11.463287752675386,
"grad_norm": 6.533185958862305,
"learning_rate": 2.1341780618311535e-05,
"loss": 2.2563,
"step": 308500
},
{
"epoch": 11.481866825208085,
"grad_norm": 5.948304176330566,
"learning_rate": 2.1295332936979786e-05,
"loss": 2.2629,
"step": 309000
},
{
"epoch": 11.500445897740784,
"grad_norm": 7.485329627990723,
"learning_rate": 2.1248885255648036e-05,
"loss": 2.243,
"step": 309500
},
{
"epoch": 11.519024970273485,
"grad_norm": 7.400222301483154,
"learning_rate": 2.120243757431629e-05,
"loss": 2.2436,
"step": 310000
},
{
"epoch": 11.537604042806183,
"grad_norm": 7.361048221588135,
"learning_rate": 2.1155989892984544e-05,
"loss": 2.2597,
"step": 310500
},
{
"epoch": 11.556183115338882,
"grad_norm": 7.483823776245117,
"learning_rate": 2.1109542211652795e-05,
"loss": 2.2643,
"step": 311000
},
{
"epoch": 11.574762187871581,
"grad_norm": 7.027825832366943,
"learning_rate": 2.106309453032105e-05,
"loss": 2.2455,
"step": 311500
},
{
"epoch": 11.59334126040428,
"grad_norm": 6.856015205383301,
"learning_rate": 2.10166468489893e-05,
"loss": 2.2351,
"step": 312000
},
{
"epoch": 11.611920332936979,
"grad_norm": 7.20182991027832,
"learning_rate": 2.097019916765755e-05,
"loss": 2.2472,
"step": 312500
},
{
"epoch": 11.63049940546968,
"grad_norm": 6.145348072052002,
"learning_rate": 2.0923751486325803e-05,
"loss": 2.2493,
"step": 313000
},
{
"epoch": 11.649078478002378,
"grad_norm": 7.6849517822265625,
"learning_rate": 2.0877303804994054e-05,
"loss": 2.2468,
"step": 313500
},
{
"epoch": 11.667657550535077,
"grad_norm": 7.373369216918945,
"learning_rate": 2.0830856123662308e-05,
"loss": 2.252,
"step": 314000
},
{
"epoch": 11.686236623067776,
"grad_norm": 7.262668132781982,
"learning_rate": 2.0784408442330562e-05,
"loss": 2.2411,
"step": 314500
},
{
"epoch": 11.704815695600475,
"grad_norm": 6.475069999694824,
"learning_rate": 2.0737960760998812e-05,
"loss": 2.245,
"step": 315000
},
{
"epoch": 11.723394768133176,
"grad_norm": 6.434516906738281,
"learning_rate": 2.0691513079667063e-05,
"loss": 2.2459,
"step": 315500
},
{
"epoch": 11.741973840665874,
"grad_norm": 7.697376251220703,
"learning_rate": 2.0645065398335317e-05,
"loss": 2.2556,
"step": 316000
},
{
"epoch": 11.760552913198573,
"grad_norm": 7.839350700378418,
"learning_rate": 2.0598617717003567e-05,
"loss": 2.2431,
"step": 316500
},
{
"epoch": 11.779131985731272,
"grad_norm": 7.546802997589111,
"learning_rate": 2.055217003567182e-05,
"loss": 2.2627,
"step": 317000
},
{
"epoch": 11.797711058263971,
"grad_norm": 6.828023910522461,
"learning_rate": 2.0505722354340075e-05,
"loss": 2.2353,
"step": 317500
},
{
"epoch": 11.81629013079667,
"grad_norm": 6.4239935874938965,
"learning_rate": 2.0459274673008325e-05,
"loss": 2.2472,
"step": 318000
},
{
"epoch": 11.83486920332937,
"grad_norm": 6.941580772399902,
"learning_rate": 2.0412826991676576e-05,
"loss": 2.2423,
"step": 318500
},
{
"epoch": 11.85344827586207,
"grad_norm": 7.385081768035889,
"learning_rate": 2.036637931034483e-05,
"loss": 2.2332,
"step": 319000
},
{
"epoch": 11.872027348394768,
"grad_norm": 7.3545613288879395,
"learning_rate": 2.031993162901308e-05,
"loss": 2.2592,
"step": 319500
},
{
"epoch": 11.890606420927467,
"grad_norm": 6.4375104904174805,
"learning_rate": 2.027348394768133e-05,
"loss": 2.2352,
"step": 320000
},
{
"epoch": 11.909185493460166,
"grad_norm": 6.863650798797607,
"learning_rate": 2.0227036266349585e-05,
"loss": 2.2622,
"step": 320500
},
{
"epoch": 11.927764565992865,
"grad_norm": 6.5175275802612305,
"learning_rate": 2.018058858501784e-05,
"loss": 2.2421,
"step": 321000
},
{
"epoch": 11.946343638525565,
"grad_norm": 7.415239334106445,
"learning_rate": 2.013414090368609e-05,
"loss": 2.2483,
"step": 321500
},
{
"epoch": 11.964922711058264,
"grad_norm": 8.416884422302246,
"learning_rate": 2.0087693222354343e-05,
"loss": 2.245,
"step": 322000
},
{
"epoch": 11.983501783590963,
"grad_norm": 6.286489009857178,
"learning_rate": 2.0041245541022593e-05,
"loss": 2.2409,
"step": 322500
},
{
"epoch": 12.002080856123662,
"grad_norm": 7.4863080978393555,
"learning_rate": 1.9994797859690844e-05,
"loss": 2.2337,
"step": 323000
},
{
"epoch": 12.020659928656361,
"grad_norm": 6.175674915313721,
"learning_rate": 1.9948350178359094e-05,
"loss": 2.2054,
"step": 323500
},
{
"epoch": 12.03923900118906,
"grad_norm": 7.600936412811279,
"learning_rate": 1.9901902497027348e-05,
"loss": 2.2135,
"step": 324000
},
{
"epoch": 12.05781807372176,
"grad_norm": 7.510547637939453,
"learning_rate": 1.9855454815695602e-05,
"loss": 2.231,
"step": 324500
},
{
"epoch": 12.07639714625446,
"grad_norm": 6.505836009979248,
"learning_rate": 1.9809007134363853e-05,
"loss": 2.2123,
"step": 325000
},
{
"epoch": 12.094976218787158,
"grad_norm": 7.495330333709717,
"learning_rate": 1.9762559453032106e-05,
"loss": 2.2048,
"step": 325500
},
{
"epoch": 12.113555291319857,
"grad_norm": 7.062661170959473,
"learning_rate": 1.9716111771700357e-05,
"loss": 2.2055,
"step": 326000
},
{
"epoch": 12.132134363852556,
"grad_norm": 7.220265865325928,
"learning_rate": 1.9669664090368607e-05,
"loss": 2.2333,
"step": 326500
},
{
"epoch": 12.150713436385256,
"grad_norm": 6.432553768157959,
"learning_rate": 1.962321640903686e-05,
"loss": 2.2274,
"step": 327000
},
{
"epoch": 12.169292508917955,
"grad_norm": 7.610962390899658,
"learning_rate": 1.9576768727705115e-05,
"loss": 2.2108,
"step": 327500
},
{
"epoch": 12.187871581450654,
"grad_norm": 8.169533729553223,
"learning_rate": 1.9530321046373366e-05,
"loss": 2.1948,
"step": 328000
},
{
"epoch": 12.206450653983353,
"grad_norm": 6.529592037200928,
"learning_rate": 1.948387336504162e-05,
"loss": 2.2195,
"step": 328500
},
{
"epoch": 12.225029726516052,
"grad_norm": 7.463806629180908,
"learning_rate": 1.943742568370987e-05,
"loss": 2.221,
"step": 329000
},
{
"epoch": 12.24360879904875,
"grad_norm": 7.339646816253662,
"learning_rate": 1.939097800237812e-05,
"loss": 2.2126,
"step": 329500
},
{
"epoch": 12.262187871581451,
"grad_norm": 7.518458366394043,
"learning_rate": 1.9344530321046374e-05,
"loss": 2.2102,
"step": 330000
},
{
"epoch": 12.28076694411415,
"grad_norm": 7.828365325927734,
"learning_rate": 1.9298082639714625e-05,
"loss": 2.2335,
"step": 330500
},
{
"epoch": 12.29934601664685,
"grad_norm": 7.198127269744873,
"learning_rate": 1.925163495838288e-05,
"loss": 2.2214,
"step": 331000
},
{
"epoch": 12.317925089179548,
"grad_norm": 6.6039533615112305,
"learning_rate": 1.9205187277051133e-05,
"loss": 2.1985,
"step": 331500
},
{
"epoch": 12.336504161712247,
"grad_norm": 7.200562477111816,
"learning_rate": 1.9158739595719383e-05,
"loss": 2.211,
"step": 332000
},
{
"epoch": 12.355083234244946,
"grad_norm": 7.252729892730713,
"learning_rate": 1.9112291914387634e-05,
"loss": 2.2302,
"step": 332500
},
{
"epoch": 12.373662306777646,
"grad_norm": 7.972862243652344,
"learning_rate": 1.9065844233055888e-05,
"loss": 2.2095,
"step": 333000
},
{
"epoch": 12.392241379310345,
"grad_norm": 8.594975471496582,
"learning_rate": 1.9019396551724138e-05,
"loss": 2.2204,
"step": 333500
},
{
"epoch": 12.410820451843044,
"grad_norm": 7.73285436630249,
"learning_rate": 1.897294887039239e-05,
"loss": 2.2174,
"step": 334000
},
{
"epoch": 12.429399524375743,
"grad_norm": 6.429736614227295,
"learning_rate": 1.8926501189060646e-05,
"loss": 2.2236,
"step": 334500
},
{
"epoch": 12.447978596908442,
"grad_norm": 6.68847131729126,
"learning_rate": 1.8880053507728896e-05,
"loss": 2.2293,
"step": 335000
},
{
"epoch": 12.46655766944114,
"grad_norm": 6.902133464813232,
"learning_rate": 1.8833605826397147e-05,
"loss": 2.215,
"step": 335500
},
{
"epoch": 12.485136741973841,
"grad_norm": 6.436554908752441,
"learning_rate": 1.87871581450654e-05,
"loss": 2.2269,
"step": 336000
},
{
"epoch": 12.50371581450654,
"grad_norm": 6.80860710144043,
"learning_rate": 1.874071046373365e-05,
"loss": 2.2223,
"step": 336500
},
{
"epoch": 12.522294887039239,
"grad_norm": 7.977982044219971,
"learning_rate": 1.86942627824019e-05,
"loss": 2.2155,
"step": 337000
},
{
"epoch": 12.540873959571938,
"grad_norm": 7.9569478034973145,
"learning_rate": 1.8647815101070156e-05,
"loss": 2.2305,
"step": 337500
},
{
"epoch": 12.559453032104637,
"grad_norm": 6.445404529571533,
"learning_rate": 1.860136741973841e-05,
"loss": 2.2075,
"step": 338000
},
{
"epoch": 12.578032104637337,
"grad_norm": 7.153224468231201,
"learning_rate": 1.855491973840666e-05,
"loss": 2.2246,
"step": 338500
},
{
"epoch": 12.596611177170036,
"grad_norm": 7.287299633026123,
"learning_rate": 1.850847205707491e-05,
"loss": 2.1913,
"step": 339000
},
{
"epoch": 12.615190249702735,
"grad_norm": 6.6666107177734375,
"learning_rate": 1.8462024375743164e-05,
"loss": 2.2267,
"step": 339500
},
{
"epoch": 12.633769322235434,
"grad_norm": 7.024231433868408,
"learning_rate": 1.8415576694411415e-05,
"loss": 2.2106,
"step": 340000
},
{
"epoch": 12.652348394768133,
"grad_norm": 6.549313068389893,
"learning_rate": 1.8369129013079665e-05,
"loss": 2.2208,
"step": 340500
},
{
"epoch": 12.670927467300832,
"grad_norm": 6.641164302825928,
"learning_rate": 1.832268133174792e-05,
"loss": 2.2157,
"step": 341000
},
{
"epoch": 12.689506539833532,
"grad_norm": 7.615879535675049,
"learning_rate": 1.8276233650416173e-05,
"loss": 2.217,
"step": 341500
},
{
"epoch": 12.708085612366231,
"grad_norm": 7.870852470397949,
"learning_rate": 1.8229785969084424e-05,
"loss": 2.2254,
"step": 342000
},
{
"epoch": 12.72666468489893,
"grad_norm": 5.989630222320557,
"learning_rate": 1.8183338287752677e-05,
"loss": 2.2129,
"step": 342500
},
{
"epoch": 12.745243757431629,
"grad_norm": 6.8082594871521,
"learning_rate": 1.8136890606420928e-05,
"loss": 2.2229,
"step": 343000
},
{
"epoch": 12.763822829964328,
"grad_norm": 7.244877338409424,
"learning_rate": 1.809044292508918e-05,
"loss": 2.2127,
"step": 343500
},
{
"epoch": 12.782401902497027,
"grad_norm": 7.6857008934021,
"learning_rate": 1.8043995243757432e-05,
"loss": 2.2235,
"step": 344000
},
{
"epoch": 12.800980975029727,
"grad_norm": 7.00359582901001,
"learning_rate": 1.7997547562425686e-05,
"loss": 2.2211,
"step": 344500
},
{
"epoch": 12.819560047562426,
"grad_norm": 7.0071187019348145,
"learning_rate": 1.7951099881093937e-05,
"loss": 2.2182,
"step": 345000
},
{
"epoch": 12.838139120095125,
"grad_norm": 6.9319634437561035,
"learning_rate": 1.790465219976219e-05,
"loss": 2.2043,
"step": 345500
},
{
"epoch": 12.856718192627824,
"grad_norm": 6.487482070922852,
"learning_rate": 1.785820451843044e-05,
"loss": 2.2139,
"step": 346000
},
{
"epoch": 12.875297265160523,
"grad_norm": 7.508727550506592,
"learning_rate": 1.781175683709869e-05,
"loss": 2.2243,
"step": 346500
},
{
"epoch": 12.893876337693222,
"grad_norm": 6.555574893951416,
"learning_rate": 1.7765309155766945e-05,
"loss": 2.2167,
"step": 347000
},
{
"epoch": 12.912455410225922,
"grad_norm": 7.410988807678223,
"learning_rate": 1.7718861474435196e-05,
"loss": 2.2158,
"step": 347500
},
{
"epoch": 12.931034482758621,
"grad_norm": 8.217428207397461,
"learning_rate": 1.767241379310345e-05,
"loss": 2.2031,
"step": 348000
},
{
"epoch": 12.94961355529132,
"grad_norm": 6.6040754318237305,
"learning_rate": 1.7625966111771704e-05,
"loss": 2.1866,
"step": 348500
},
{
"epoch": 12.968192627824019,
"grad_norm": 6.99837064743042,
"learning_rate": 1.7579518430439954e-05,
"loss": 2.1899,
"step": 349000
},
{
"epoch": 12.986771700356718,
"grad_norm": 6.531412124633789,
"learning_rate": 1.7533070749108205e-05,
"loss": 2.2244,
"step": 349500
},
{
"epoch": 13.005350772889418,
"grad_norm": 7.704728126525879,
"learning_rate": 1.748662306777646e-05,
"loss": 2.18,
"step": 350000
},
{
"epoch": 13.023929845422117,
"grad_norm": 6.77532434463501,
"learning_rate": 1.744017538644471e-05,
"loss": 2.1789,
"step": 350500
},
{
"epoch": 13.042508917954816,
"grad_norm": 6.446128845214844,
"learning_rate": 1.739372770511296e-05,
"loss": 2.1707,
"step": 351000
},
{
"epoch": 13.061087990487515,
"grad_norm": 7.576733589172363,
"learning_rate": 1.7347280023781213e-05,
"loss": 2.1768,
"step": 351500
},
{
"epoch": 13.079667063020214,
"grad_norm": 7.239291191101074,
"learning_rate": 1.7300832342449467e-05,
"loss": 2.2011,
"step": 352000
},
{
"epoch": 13.098246135552913,
"grad_norm": 6.936691761016846,
"learning_rate": 1.7254384661117718e-05,
"loss": 2.1767,
"step": 352500
},
{
"epoch": 13.116825208085613,
"grad_norm": 7.205715179443359,
"learning_rate": 1.7207936979785968e-05,
"loss": 2.1581,
"step": 353000
},
{
"epoch": 13.135404280618312,
"grad_norm": 6.61326789855957,
"learning_rate": 1.7161489298454222e-05,
"loss": 2.185,
"step": 353500
},
{
"epoch": 13.15398335315101,
"grad_norm": 7.715660572052002,
"learning_rate": 1.7115041617122473e-05,
"loss": 2.1924,
"step": 354000
},
{
"epoch": 13.17256242568371,
"grad_norm": 6.543544769287109,
"learning_rate": 1.7068593935790726e-05,
"loss": 2.1998,
"step": 354500
},
{
"epoch": 13.191141498216409,
"grad_norm": 8.281086921691895,
"learning_rate": 1.702214625445898e-05,
"loss": 2.1787,
"step": 355000
},
{
"epoch": 13.209720570749107,
"grad_norm": 6.323915481567383,
"learning_rate": 1.697569857312723e-05,
"loss": 2.1834,
"step": 355500
},
{
"epoch": 13.228299643281808,
"grad_norm": 8.45340347290039,
"learning_rate": 1.692925089179548e-05,
"loss": 2.1806,
"step": 356000
},
{
"epoch": 13.246878715814507,
"grad_norm": 8.1563720703125,
"learning_rate": 1.6882803210463735e-05,
"loss": 2.1708,
"step": 356500
},
{
"epoch": 13.265457788347206,
"grad_norm": 7.083395481109619,
"learning_rate": 1.6836355529131986e-05,
"loss": 2.1866,
"step": 357000
},
{
"epoch": 13.284036860879905,
"grad_norm": 6.55299186706543,
"learning_rate": 1.6789907847800236e-05,
"loss": 2.1723,
"step": 357500
},
{
"epoch": 13.302615933412604,
"grad_norm": 6.710261821746826,
"learning_rate": 1.674346016646849e-05,
"loss": 2.1977,
"step": 358000
},
{
"epoch": 13.321195005945302,
"grad_norm": 7.0249738693237305,
"learning_rate": 1.6697012485136744e-05,
"loss": 2.2007,
"step": 358500
},
{
"epoch": 13.339774078478003,
"grad_norm": 7.835285663604736,
"learning_rate": 1.6650564803804994e-05,
"loss": 2.1959,
"step": 359000
},
{
"epoch": 13.358353151010702,
"grad_norm": 8.400995254516602,
"learning_rate": 1.660411712247325e-05,
"loss": 2.1991,
"step": 359500
},
{
"epoch": 13.3769322235434,
"grad_norm": 6.235854148864746,
"learning_rate": 1.65576694411415e-05,
"loss": 2.174,
"step": 360000
},
{
"epoch": 13.3955112960761,
"grad_norm": 6.741766929626465,
"learning_rate": 1.651122175980975e-05,
"loss": 2.1777,
"step": 360500
},
{
"epoch": 13.414090368608798,
"grad_norm": 8.243950843811035,
"learning_rate": 1.6464774078478003e-05,
"loss": 2.1841,
"step": 361000
},
{
"epoch": 13.432669441141499,
"grad_norm": 6.43676233291626,
"learning_rate": 1.6418326397146254e-05,
"loss": 2.1769,
"step": 361500
},
{
"epoch": 13.451248513674198,
"grad_norm": 6.800743579864502,
"learning_rate": 1.6371878715814508e-05,
"loss": 2.2043,
"step": 362000
},
{
"epoch": 13.469827586206897,
"grad_norm": 6.082602500915527,
"learning_rate": 1.632543103448276e-05,
"loss": 2.167,
"step": 362500
},
{
"epoch": 13.488406658739596,
"grad_norm": 7.768115520477295,
"learning_rate": 1.6278983353151012e-05,
"loss": 2.1623,
"step": 363000
},
{
"epoch": 13.506985731272295,
"grad_norm": 6.893867492675781,
"learning_rate": 1.6232535671819262e-05,
"loss": 2.1835,
"step": 363500
},
{
"epoch": 13.525564803804993,
"grad_norm": 6.749509811401367,
"learning_rate": 1.6186087990487516e-05,
"loss": 2.1659,
"step": 364000
},
{
"epoch": 13.544143876337694,
"grad_norm": 6.05668306350708,
"learning_rate": 1.6139640309155767e-05,
"loss": 2.1703,
"step": 364500
},
{
"epoch": 13.562722948870393,
"grad_norm": 7.0912251472473145,
"learning_rate": 1.609319262782402e-05,
"loss": 2.1919,
"step": 365000
},
{
"epoch": 13.581302021403092,
"grad_norm": 6.6050310134887695,
"learning_rate": 1.6046744946492275e-05,
"loss": 2.1756,
"step": 365500
},
{
"epoch": 13.59988109393579,
"grad_norm": 6.950946807861328,
"learning_rate": 1.6000297265160525e-05,
"loss": 2.1825,
"step": 366000
},
{
"epoch": 13.61846016646849,
"grad_norm": 7.240453243255615,
"learning_rate": 1.5953849583828776e-05,
"loss": 2.1837,
"step": 366500
},
{
"epoch": 13.637039239001188,
"grad_norm": 8.0787935256958,
"learning_rate": 1.590740190249703e-05,
"loss": 2.1747,
"step": 367000
},
{
"epoch": 13.655618311533889,
"grad_norm": 6.953646659851074,
"learning_rate": 1.586095422116528e-05,
"loss": 2.1821,
"step": 367500
},
{
"epoch": 13.674197384066588,
"grad_norm": 6.981358051300049,
"learning_rate": 1.581450653983353e-05,
"loss": 2.1751,
"step": 368000
},
{
"epoch": 13.692776456599287,
"grad_norm": 7.580711841583252,
"learning_rate": 1.5768058858501784e-05,
"loss": 2.1685,
"step": 368500
},
{
"epoch": 13.711355529131986,
"grad_norm": 7.360109806060791,
"learning_rate": 1.5721611177170038e-05,
"loss": 2.1566,
"step": 369000
},
{
"epoch": 13.729934601664684,
"grad_norm": 6.589022636413574,
"learning_rate": 1.567516349583829e-05,
"loss": 2.1725,
"step": 369500
},
{
"epoch": 13.748513674197383,
"grad_norm": 7.376802444458008,
"learning_rate": 1.562871581450654e-05,
"loss": 2.1814,
"step": 370000
},
{
"epoch": 13.767092746730084,
"grad_norm": 7.36546516418457,
"learning_rate": 1.5582268133174793e-05,
"loss": 2.1824,
"step": 370500
},
{
"epoch": 13.785671819262783,
"grad_norm": 7.832765579223633,
"learning_rate": 1.5535820451843044e-05,
"loss": 2.1651,
"step": 371000
},
{
"epoch": 13.804250891795482,
"grad_norm": 7.414605617523193,
"learning_rate": 1.5489372770511294e-05,
"loss": 2.1488,
"step": 371500
},
{
"epoch": 13.82282996432818,
"grad_norm": 7.148501873016357,
"learning_rate": 1.5442925089179548e-05,
"loss": 2.185,
"step": 372000
},
{
"epoch": 13.84140903686088,
"grad_norm": 6.733073711395264,
"learning_rate": 1.5396477407847802e-05,
"loss": 2.1645,
"step": 372500
},
{
"epoch": 13.85998810939358,
"grad_norm": 7.812681198120117,
"learning_rate": 1.5350029726516052e-05,
"loss": 2.1836,
"step": 373000
},
{
"epoch": 13.878567181926279,
"grad_norm": 6.853206634521484,
"learning_rate": 1.5303582045184306e-05,
"loss": 2.1778,
"step": 373500
},
{
"epoch": 13.897146254458978,
"grad_norm": 7.234543323516846,
"learning_rate": 1.5257134363852557e-05,
"loss": 2.1759,
"step": 374000
},
{
"epoch": 13.915725326991677,
"grad_norm": 7.433253765106201,
"learning_rate": 1.5210686682520809e-05,
"loss": 2.1722,
"step": 374500
},
{
"epoch": 13.934304399524375,
"grad_norm": 7.073111534118652,
"learning_rate": 1.5164239001189063e-05,
"loss": 2.1855,
"step": 375000
},
{
"epoch": 13.952883472057074,
"grad_norm": 7.280003547668457,
"learning_rate": 1.5117791319857313e-05,
"loss": 2.1819,
"step": 375500
},
{
"epoch": 13.971462544589775,
"grad_norm": 6.7823991775512695,
"learning_rate": 1.5071343638525565e-05,
"loss": 2.1585,
"step": 376000
},
{
"epoch": 13.990041617122474,
"grad_norm": 7.181284427642822,
"learning_rate": 1.502489595719382e-05,
"loss": 2.1689,
"step": 376500
},
{
"epoch": 14.008620689655173,
"grad_norm": 6.957113265991211,
"learning_rate": 1.497844827586207e-05,
"loss": 2.1754,
"step": 377000
},
{
"epoch": 14.027199762187871,
"grad_norm": 7.111293315887451,
"learning_rate": 1.493200059453032e-05,
"loss": 2.1651,
"step": 377500
},
{
"epoch": 14.04577883472057,
"grad_norm": 7.025313854217529,
"learning_rate": 1.4885552913198574e-05,
"loss": 2.1458,
"step": 378000
},
{
"epoch": 14.06435790725327,
"grad_norm": 6.963667869567871,
"learning_rate": 1.4839105231866826e-05,
"loss": 2.1456,
"step": 378500
},
{
"epoch": 14.08293697978597,
"grad_norm": 7.611172199249268,
"learning_rate": 1.4792657550535077e-05,
"loss": 2.158,
"step": 379000
},
{
"epoch": 14.101516052318669,
"grad_norm": 6.874037265777588,
"learning_rate": 1.474620986920333e-05,
"loss": 2.1446,
"step": 379500
},
{
"epoch": 14.120095124851368,
"grad_norm": 7.512300491333008,
"learning_rate": 1.4699762187871583e-05,
"loss": 2.1528,
"step": 380000
},
{
"epoch": 14.138674197384066,
"grad_norm": 6.693312168121338,
"learning_rate": 1.4653314506539833e-05,
"loss": 2.1563,
"step": 380500
},
{
"epoch": 14.157253269916765,
"grad_norm": 6.6438164710998535,
"learning_rate": 1.4606866825208087e-05,
"loss": 2.1383,
"step": 381000
},
{
"epoch": 14.175832342449464,
"grad_norm": 7.537757873535156,
"learning_rate": 1.456041914387634e-05,
"loss": 2.1554,
"step": 381500
},
{
"epoch": 14.194411414982165,
"grad_norm": 8.159100532531738,
"learning_rate": 1.451397146254459e-05,
"loss": 2.1538,
"step": 382000
},
{
"epoch": 14.212990487514864,
"grad_norm": 7.427910327911377,
"learning_rate": 1.4467523781212844e-05,
"loss": 2.1623,
"step": 382500
},
{
"epoch": 14.231569560047562,
"grad_norm": 7.805336952209473,
"learning_rate": 1.4421076099881094e-05,
"loss": 2.1645,
"step": 383000
},
{
"epoch": 14.250148632580261,
"grad_norm": 6.669980525970459,
"learning_rate": 1.4374628418549347e-05,
"loss": 2.1525,
"step": 383500
},
{
"epoch": 14.26872770511296,
"grad_norm": 7.358639240264893,
"learning_rate": 1.4328180737217597e-05,
"loss": 2.1542,
"step": 384000
},
{
"epoch": 14.28730677764566,
"grad_norm": 7.103815078735352,
"learning_rate": 1.4281733055885851e-05,
"loss": 2.1567,
"step": 384500
},
{
"epoch": 14.30588585017836,
"grad_norm": 7.218321800231934,
"learning_rate": 1.4235285374554103e-05,
"loss": 2.1569,
"step": 385000
},
{
"epoch": 14.324464922711059,
"grad_norm": 7.941781520843506,
"learning_rate": 1.4188837693222354e-05,
"loss": 2.1554,
"step": 385500
},
{
"epoch": 14.343043995243757,
"grad_norm": 8.86156940460205,
"learning_rate": 1.4142390011890607e-05,
"loss": 2.1649,
"step": 386000
},
{
"epoch": 14.361623067776456,
"grad_norm": 6.904116153717041,
"learning_rate": 1.409594233055886e-05,
"loss": 2.1486,
"step": 386500
},
{
"epoch": 14.380202140309155,
"grad_norm": 6.8697943687438965,
"learning_rate": 1.404949464922711e-05,
"loss": 2.1686,
"step": 387000
},
{
"epoch": 14.398781212841856,
"grad_norm": 7.536423683166504,
"learning_rate": 1.4003046967895364e-05,
"loss": 2.1534,
"step": 387500
},
{
"epoch": 14.417360285374555,
"grad_norm": 6.2832465171813965,
"learning_rate": 1.3956599286563615e-05,
"loss": 2.1638,
"step": 388000
},
{
"epoch": 14.435939357907253,
"grad_norm": 7.5254926681518555,
"learning_rate": 1.3910151605231867e-05,
"loss": 2.1611,
"step": 388500
},
{
"epoch": 14.454518430439952,
"grad_norm": 6.102006912231445,
"learning_rate": 1.386370392390012e-05,
"loss": 2.1481,
"step": 389000
},
{
"epoch": 14.473097502972651,
"grad_norm": 6.829434871673584,
"learning_rate": 1.3817256242568371e-05,
"loss": 2.1309,
"step": 389500
},
{
"epoch": 14.49167657550535,
"grad_norm": 7.072176456451416,
"learning_rate": 1.3770808561236623e-05,
"loss": 2.1493,
"step": 390000
},
{
"epoch": 14.51025564803805,
"grad_norm": 8.32613754272461,
"learning_rate": 1.3724360879904877e-05,
"loss": 2.128,
"step": 390500
},
{
"epoch": 14.52883472057075,
"grad_norm": 7.587469577789307,
"learning_rate": 1.3677913198573128e-05,
"loss": 2.1446,
"step": 391000
},
{
"epoch": 14.547413793103448,
"grad_norm": 7.003942966461182,
"learning_rate": 1.363146551724138e-05,
"loss": 2.1493,
"step": 391500
},
{
"epoch": 14.565992865636147,
"grad_norm": 6.587801456451416,
"learning_rate": 1.3585017835909634e-05,
"loss": 2.1554,
"step": 392000
},
{
"epoch": 14.584571938168846,
"grad_norm": 6.796844005584717,
"learning_rate": 1.3538570154577884e-05,
"loss": 2.1699,
"step": 392500
},
{
"epoch": 14.603151010701545,
"grad_norm": 6.230968952178955,
"learning_rate": 1.3492122473246135e-05,
"loss": 2.1514,
"step": 393000
},
{
"epoch": 14.621730083234246,
"grad_norm": 7.986715793609619,
"learning_rate": 1.3445674791914389e-05,
"loss": 2.1439,
"step": 393500
},
{
"epoch": 14.640309155766944,
"grad_norm": 6.953087329864502,
"learning_rate": 1.339922711058264e-05,
"loss": 2.1359,
"step": 394000
},
{
"epoch": 14.658888228299643,
"grad_norm": 6.939476490020752,
"learning_rate": 1.3352779429250891e-05,
"loss": 2.1438,
"step": 394500
},
{
"epoch": 14.677467300832342,
"grad_norm": 7.4189229011535645,
"learning_rate": 1.3306331747919145e-05,
"loss": 2.1494,
"step": 395000
},
{
"epoch": 14.696046373365041,
"grad_norm": 6.914766788482666,
"learning_rate": 1.3259884066587397e-05,
"loss": 2.1336,
"step": 395500
},
{
"epoch": 14.714625445897742,
"grad_norm": 6.602614402770996,
"learning_rate": 1.3213436385255648e-05,
"loss": 2.154,
"step": 396000
},
{
"epoch": 14.73320451843044,
"grad_norm": 7.446470260620117,
"learning_rate": 1.3166988703923902e-05,
"loss": 2.1425,
"step": 396500
},
{
"epoch": 14.75178359096314,
"grad_norm": 6.55057430267334,
"learning_rate": 1.3120541022592154e-05,
"loss": 2.1403,
"step": 397000
},
{
"epoch": 14.770362663495838,
"grad_norm": 6.798906326293945,
"learning_rate": 1.3074093341260404e-05,
"loss": 2.1396,
"step": 397500
},
{
"epoch": 14.788941736028537,
"grad_norm": 7.93524169921875,
"learning_rate": 1.3027645659928655e-05,
"loss": 2.1665,
"step": 398000
},
{
"epoch": 14.807520808561236,
"grad_norm": 8.041824340820312,
"learning_rate": 1.2981197978596909e-05,
"loss": 2.1377,
"step": 398500
},
{
"epoch": 14.826099881093937,
"grad_norm": 6.651689529418945,
"learning_rate": 1.2934750297265161e-05,
"loss": 2.1461,
"step": 399000
},
{
"epoch": 14.844678953626635,
"grad_norm": 6.821606636047363,
"learning_rate": 1.2888302615933411e-05,
"loss": 2.158,
"step": 399500
},
{
"epoch": 14.863258026159334,
"grad_norm": 8.040721893310547,
"learning_rate": 1.2841854934601665e-05,
"loss": 2.1466,
"step": 400000
},
{
"epoch": 14.881837098692033,
"grad_norm": 7.286508083343506,
"learning_rate": 1.2795407253269918e-05,
"loss": 2.1428,
"step": 400500
},
{
"epoch": 14.900416171224732,
"grad_norm": 8.08362102508545,
"learning_rate": 1.2748959571938168e-05,
"loss": 2.1515,
"step": 401000
},
{
"epoch": 14.918995243757431,
"grad_norm": 8.438191413879395,
"learning_rate": 1.2702511890606422e-05,
"loss": 2.1446,
"step": 401500
},
{
"epoch": 14.937574316290132,
"grad_norm": 7.372959136962891,
"learning_rate": 1.2656064209274674e-05,
"loss": 2.1496,
"step": 402000
},
{
"epoch": 14.95615338882283,
"grad_norm": 7.080979347229004,
"learning_rate": 1.2609616527942925e-05,
"loss": 2.1579,
"step": 402500
},
{
"epoch": 14.97473246135553,
"grad_norm": 7.254255294799805,
"learning_rate": 1.2563168846611178e-05,
"loss": 2.1414,
"step": 403000
},
{
"epoch": 14.993311533888228,
"grad_norm": 7.761992931365967,
"learning_rate": 1.2516721165279429e-05,
"loss": 2.15,
"step": 403500
},
{
"epoch": 15.011890606420927,
"grad_norm": 7.0644049644470215,
"learning_rate": 1.2470273483947683e-05,
"loss": 2.1474,
"step": 404000
},
{
"epoch": 15.030469678953626,
"grad_norm": 8.067272186279297,
"learning_rate": 1.2423825802615933e-05,
"loss": 2.1103,
"step": 404500
},
{
"epoch": 15.049048751486326,
"grad_norm": 6.896698474884033,
"learning_rate": 1.2377378121284185e-05,
"loss": 2.1159,
"step": 405000
},
{
"epoch": 15.067627824019025,
"grad_norm": 6.983173370361328,
"learning_rate": 1.233093043995244e-05,
"loss": 2.1463,
"step": 405500
},
{
"epoch": 15.086206896551724,
"grad_norm": 8.10067367553711,
"learning_rate": 1.228448275862069e-05,
"loss": 2.1418,
"step": 406000
},
{
"epoch": 15.104785969084423,
"grad_norm": 7.817485332489014,
"learning_rate": 1.2238035077288942e-05,
"loss": 2.1433,
"step": 406500
},
{
"epoch": 15.123365041617122,
"grad_norm": 7.6188578605651855,
"learning_rate": 1.2191587395957194e-05,
"loss": 2.1198,
"step": 407000
},
{
"epoch": 15.14194411414982,
"grad_norm": 7.024149417877197,
"learning_rate": 1.2145139714625446e-05,
"loss": 2.1157,
"step": 407500
},
{
"epoch": 15.160523186682521,
"grad_norm": 6.95907735824585,
"learning_rate": 1.2098692033293699e-05,
"loss": 2.134,
"step": 408000
},
{
"epoch": 15.17910225921522,
"grad_norm": 6.850398540496826,
"learning_rate": 1.205224435196195e-05,
"loss": 2.1178,
"step": 408500
},
{
"epoch": 15.19768133174792,
"grad_norm": 7.054015159606934,
"learning_rate": 1.2005796670630203e-05,
"loss": 2.1353,
"step": 409000
},
{
"epoch": 15.216260404280618,
"grad_norm": 8.049177169799805,
"learning_rate": 1.1959348989298455e-05,
"loss": 2.1175,
"step": 409500
},
{
"epoch": 15.234839476813317,
"grad_norm": 7.3112568855285645,
"learning_rate": 1.1912901307966706e-05,
"loss": 2.1269,
"step": 410000
},
{
"epoch": 15.253418549346017,
"grad_norm": 7.102066516876221,
"learning_rate": 1.186645362663496e-05,
"loss": 2.121,
"step": 410500
},
{
"epoch": 15.271997621878716,
"grad_norm": 7.103978633880615,
"learning_rate": 1.1820005945303212e-05,
"loss": 2.123,
"step": 411000
},
{
"epoch": 15.290576694411415,
"grad_norm": 7.16837215423584,
"learning_rate": 1.1773558263971462e-05,
"loss": 2.1555,
"step": 411500
},
{
"epoch": 15.309155766944114,
"grad_norm": 7.387100696563721,
"learning_rate": 1.1727110582639714e-05,
"loss": 2.1323,
"step": 412000
},
{
"epoch": 15.327734839476813,
"grad_norm": 7.893144607543945,
"learning_rate": 1.1680662901307968e-05,
"loss": 2.1358,
"step": 412500
},
{
"epoch": 15.346313912009512,
"grad_norm": 7.737049579620361,
"learning_rate": 1.1634215219976219e-05,
"loss": 2.122,
"step": 413000
},
{
"epoch": 15.364892984542212,
"grad_norm": 7.758161544799805,
"learning_rate": 1.1587767538644471e-05,
"loss": 2.1262,
"step": 413500
},
{
"epoch": 15.383472057074911,
"grad_norm": 7.8588666915893555,
"learning_rate": 1.1541319857312725e-05,
"loss": 2.1202,
"step": 414000
},
{
"epoch": 15.40205112960761,
"grad_norm": 7.353470325469971,
"learning_rate": 1.1494872175980975e-05,
"loss": 2.1406,
"step": 414500
},
{
"epoch": 15.420630202140309,
"grad_norm": 6.766369819641113,
"learning_rate": 1.1448424494649228e-05,
"loss": 2.1344,
"step": 415000
},
{
"epoch": 15.439209274673008,
"grad_norm": 7.156630992889404,
"learning_rate": 1.140197681331748e-05,
"loss": 2.1232,
"step": 415500
},
{
"epoch": 15.457788347205707,
"grad_norm": 7.754790782928467,
"learning_rate": 1.1355529131985732e-05,
"loss": 2.1154,
"step": 416000
},
{
"epoch": 15.476367419738407,
"grad_norm": 8.716788291931152,
"learning_rate": 1.1309081450653984e-05,
"loss": 2.1236,
"step": 416500
},
{
"epoch": 15.494946492271106,
"grad_norm": 7.345715522766113,
"learning_rate": 1.1262633769322235e-05,
"loss": 2.1299,
"step": 417000
},
{
"epoch": 15.513525564803805,
"grad_norm": 7.088531494140625,
"learning_rate": 1.1216186087990488e-05,
"loss": 2.1168,
"step": 417500
},
{
"epoch": 15.532104637336504,
"grad_norm": 7.417008876800537,
"learning_rate": 1.116973840665874e-05,
"loss": 2.1247,
"step": 418000
},
{
"epoch": 15.550683709869203,
"grad_norm": 7.3177995681762695,
"learning_rate": 1.1123290725326991e-05,
"loss": 2.1198,
"step": 418500
},
{
"epoch": 15.569262782401903,
"grad_norm": 6.9706711769104,
"learning_rate": 1.1076843043995245e-05,
"loss": 2.1228,
"step": 419000
},
{
"epoch": 15.587841854934602,
"grad_norm": 6.97265625,
"learning_rate": 1.1030395362663497e-05,
"loss": 2.1237,
"step": 419500
},
{
"epoch": 15.606420927467301,
"grad_norm": 6.226667404174805,
"learning_rate": 1.0983947681331748e-05,
"loss": 2.1017,
"step": 420000
},
{
"epoch": 15.625,
"grad_norm": 7.427140712738037,
"learning_rate": 1.09375e-05,
"loss": 2.1131,
"step": 420500
},
{
"epoch": 15.643579072532699,
"grad_norm": 8.942204475402832,
"learning_rate": 1.0891052318668254e-05,
"loss": 2.1294,
"step": 421000
},
{
"epoch": 15.662158145065398,
"grad_norm": 7.123710632324219,
"learning_rate": 1.0844604637336504e-05,
"loss": 2.1207,
"step": 421500
},
{
"epoch": 15.680737217598097,
"grad_norm": 6.2210798263549805,
"learning_rate": 1.0798156956004756e-05,
"loss": 2.1222,
"step": 422000
},
{
"epoch": 15.699316290130797,
"grad_norm": 7.38429069519043,
"learning_rate": 1.0751709274673009e-05,
"loss": 2.114,
"step": 422500
},
{
"epoch": 15.717895362663496,
"grad_norm": 6.752946853637695,
"learning_rate": 1.070526159334126e-05,
"loss": 2.1105,
"step": 423000
},
{
"epoch": 15.736474435196195,
"grad_norm": 6.8533406257629395,
"learning_rate": 1.0658813912009513e-05,
"loss": 2.1263,
"step": 423500
},
{
"epoch": 15.755053507728894,
"grad_norm": 8.36920166015625,
"learning_rate": 1.0612366230677765e-05,
"loss": 2.1082,
"step": 424000
},
{
"epoch": 15.773632580261593,
"grad_norm": 6.900448799133301,
"learning_rate": 1.0565918549346017e-05,
"loss": 2.1245,
"step": 424500
},
{
"epoch": 15.792211652794293,
"grad_norm": 7.180041313171387,
"learning_rate": 1.051947086801427e-05,
"loss": 2.1163,
"step": 425000
},
{
"epoch": 15.810790725326992,
"grad_norm": 7.32526159286499,
"learning_rate": 1.047302318668252e-05,
"loss": 2.1344,
"step": 425500
},
{
"epoch": 15.829369797859691,
"grad_norm": 7.500328540802002,
"learning_rate": 1.0426575505350774e-05,
"loss": 2.1127,
"step": 426000
},
{
"epoch": 15.84794887039239,
"grad_norm": 7.36287784576416,
"learning_rate": 1.0380127824019026e-05,
"loss": 2.1104,
"step": 426500
},
{
"epoch": 15.866527942925089,
"grad_norm": 7.004654884338379,
"learning_rate": 1.0333680142687277e-05,
"loss": 2.13,
"step": 427000
},
{
"epoch": 15.885107015457788,
"grad_norm": 6.9634528160095215,
"learning_rate": 1.0287232461355529e-05,
"loss": 2.1196,
"step": 427500
},
{
"epoch": 15.903686087990488,
"grad_norm": 7.970580101013184,
"learning_rate": 1.0240784780023783e-05,
"loss": 2.1144,
"step": 428000
},
{
"epoch": 15.922265160523187,
"grad_norm": 7.777002334594727,
"learning_rate": 1.0194337098692033e-05,
"loss": 2.1226,
"step": 428500
},
{
"epoch": 15.940844233055886,
"grad_norm": 6.956545352935791,
"learning_rate": 1.0147889417360285e-05,
"loss": 2.1353,
"step": 429000
},
{
"epoch": 15.959423305588585,
"grad_norm": 7.85087251663208,
"learning_rate": 1.010144173602854e-05,
"loss": 2.1281,
"step": 429500
},
{
"epoch": 15.978002378121284,
"grad_norm": 8.030372619628906,
"learning_rate": 1.005499405469679e-05,
"loss": 2.1247,
"step": 430000
},
{
"epoch": 15.996581450653984,
"grad_norm": 7.764926433563232,
"learning_rate": 1.0008546373365042e-05,
"loss": 2.1291,
"step": 430500
},
{
"epoch": 16.015160523186683,
"grad_norm": 6.365900039672852,
"learning_rate": 9.962098692033294e-06,
"loss": 2.098,
"step": 431000
},
{
"epoch": 16.033739595719382,
"grad_norm": 7.203670024871826,
"learning_rate": 9.915651010701546e-06,
"loss": 2.1182,
"step": 431500
},
{
"epoch": 16.05231866825208,
"grad_norm": 7.516459941864014,
"learning_rate": 9.869203329369798e-06,
"loss": 2.0847,
"step": 432000
},
{
"epoch": 16.07089774078478,
"grad_norm": 6.9018235206604,
"learning_rate": 9.822755648038049e-06,
"loss": 2.0801,
"step": 432500
},
{
"epoch": 16.08947681331748,
"grad_norm": 7.418632507324219,
"learning_rate": 9.776307966706303e-06,
"loss": 2.0981,
"step": 433000
},
{
"epoch": 16.108055885850177,
"grad_norm": 7.646805763244629,
"learning_rate": 9.729860285374555e-06,
"loss": 2.1049,
"step": 433500
},
{
"epoch": 16.126634958382876,
"grad_norm": 6.691248893737793,
"learning_rate": 9.683412604042806e-06,
"loss": 2.1026,
"step": 434000
},
{
"epoch": 16.145214030915575,
"grad_norm": 8.201228141784668,
"learning_rate": 9.63696492271106e-06,
"loss": 2.1197,
"step": 434500
},
{
"epoch": 16.163793103448278,
"grad_norm": 6.836193561553955,
"learning_rate": 9.590517241379312e-06,
"loss": 2.1048,
"step": 435000
},
{
"epoch": 16.182372175980976,
"grad_norm": 6.607935905456543,
"learning_rate": 9.544069560047562e-06,
"loss": 2.0952,
"step": 435500
},
{
"epoch": 16.200951248513675,
"grad_norm": 7.329438209533691,
"learning_rate": 9.497621878715814e-06,
"loss": 2.1096,
"step": 436000
},
{
"epoch": 16.219530321046374,
"grad_norm": 7.701877117156982,
"learning_rate": 9.451174197384068e-06,
"loss": 2.1061,
"step": 436500
},
{
"epoch": 16.238109393579073,
"grad_norm": 6.743167877197266,
"learning_rate": 9.404726516052319e-06,
"loss": 2.1151,
"step": 437000
},
{
"epoch": 16.256688466111772,
"grad_norm": 7.008676528930664,
"learning_rate": 9.35827883472057e-06,
"loss": 2.1113,
"step": 437500
},
{
"epoch": 16.27526753864447,
"grad_norm": 7.036728858947754,
"learning_rate": 9.311831153388825e-06,
"loss": 2.0898,
"step": 438000
},
{
"epoch": 16.29384661117717,
"grad_norm": 7.374510765075684,
"learning_rate": 9.265383472057075e-06,
"loss": 2.0845,
"step": 438500
},
{
"epoch": 16.31242568370987,
"grad_norm": 7.095835208892822,
"learning_rate": 9.218935790725327e-06,
"loss": 2.117,
"step": 439000
},
{
"epoch": 16.331004756242567,
"grad_norm": 7.737233638763428,
"learning_rate": 9.17248810939358e-06,
"loss": 2.0987,
"step": 439500
},
{
"epoch": 16.349583828775266,
"grad_norm": 7.745171546936035,
"learning_rate": 9.126040428061832e-06,
"loss": 2.0858,
"step": 440000
},
{
"epoch": 16.368162901307965,
"grad_norm": 6.25264835357666,
"learning_rate": 9.079592746730084e-06,
"loss": 2.1014,
"step": 440500
},
{
"epoch": 16.386741973840667,
"grad_norm": 8.324295043945312,
"learning_rate": 9.033145065398334e-06,
"loss": 2.1006,
"step": 441000
},
{
"epoch": 16.405321046373366,
"grad_norm": 7.7967352867126465,
"learning_rate": 8.986697384066588e-06,
"loss": 2.0758,
"step": 441500
},
{
"epoch": 16.423900118906065,
"grad_norm": 7.272579193115234,
"learning_rate": 8.94024970273484e-06,
"loss": 2.1062,
"step": 442000
},
{
"epoch": 16.442479191438764,
"grad_norm": 7.0281195640563965,
"learning_rate": 8.893802021403091e-06,
"loss": 2.0979,
"step": 442500
},
{
"epoch": 16.461058263971463,
"grad_norm": 7.969797611236572,
"learning_rate": 8.847354340071345e-06,
"loss": 2.0782,
"step": 443000
},
{
"epoch": 16.47963733650416,
"grad_norm": 7.431224822998047,
"learning_rate": 8.800906658739597e-06,
"loss": 2.0986,
"step": 443500
},
{
"epoch": 16.49821640903686,
"grad_norm": 7.004672050476074,
"learning_rate": 8.754458977407848e-06,
"loss": 2.1162,
"step": 444000
},
{
"epoch": 16.51679548156956,
"grad_norm": 7.328388214111328,
"learning_rate": 8.7080112960761e-06,
"loss": 2.0903,
"step": 444500
},
{
"epoch": 16.53537455410226,
"grad_norm": 7.997599124908447,
"learning_rate": 8.661563614744354e-06,
"loss": 2.0926,
"step": 445000
},
{
"epoch": 16.553953626634957,
"grad_norm": 6.598504066467285,
"learning_rate": 8.615115933412604e-06,
"loss": 2.103,
"step": 445500
},
{
"epoch": 16.572532699167656,
"grad_norm": 8.041633605957031,
"learning_rate": 8.568668252080856e-06,
"loss": 2.0902,
"step": 446000
},
{
"epoch": 16.591111771700355,
"grad_norm": 6.456114768981934,
"learning_rate": 8.522220570749109e-06,
"loss": 2.1113,
"step": 446500
},
{
"epoch": 16.609690844233057,
"grad_norm": 8.524587631225586,
"learning_rate": 8.47577288941736e-06,
"loss": 2.1034,
"step": 447000
},
{
"epoch": 16.628269916765756,
"grad_norm": 7.4559102058410645,
"learning_rate": 8.429325208085613e-06,
"loss": 2.0911,
"step": 447500
},
{
"epoch": 16.646848989298455,
"grad_norm": 7.678273677825928,
"learning_rate": 8.382877526753865e-06,
"loss": 2.1009,
"step": 448000
},
{
"epoch": 16.665428061831154,
"grad_norm": 6.468957424163818,
"learning_rate": 8.336429845422117e-06,
"loss": 2.104,
"step": 448500
},
{
"epoch": 16.684007134363853,
"grad_norm": 7.746886730194092,
"learning_rate": 8.28998216409037e-06,
"loss": 2.0961,
"step": 449000
},
{
"epoch": 16.70258620689655,
"grad_norm": 6.837747097015381,
"learning_rate": 8.24353448275862e-06,
"loss": 2.0867,
"step": 449500
},
{
"epoch": 16.72116527942925,
"grad_norm": 7.098623275756836,
"learning_rate": 8.197086801426874e-06,
"loss": 2.1093,
"step": 450000
},
{
"epoch": 16.73974435196195,
"grad_norm": 6.478063106536865,
"learning_rate": 8.150639120095126e-06,
"loss": 2.0933,
"step": 450500
},
{
"epoch": 16.758323424494648,
"grad_norm": 7.2032012939453125,
"learning_rate": 8.104191438763376e-06,
"loss": 2.0801,
"step": 451000
},
{
"epoch": 16.776902497027347,
"grad_norm": 6.382145881652832,
"learning_rate": 8.057743757431629e-06,
"loss": 2.1063,
"step": 451500
},
{
"epoch": 16.795481569560046,
"grad_norm": 7.381346702575684,
"learning_rate": 8.011296076099883e-06,
"loss": 2.0992,
"step": 452000
},
{
"epoch": 16.81406064209275,
"grad_norm": 6.544224739074707,
"learning_rate": 7.964848394768133e-06,
"loss": 2.0993,
"step": 452500
},
{
"epoch": 16.832639714625447,
"grad_norm": 7.141576290130615,
"learning_rate": 7.918400713436385e-06,
"loss": 2.0919,
"step": 453000
},
{
"epoch": 16.851218787158146,
"grad_norm": 6.41404914855957,
"learning_rate": 7.871953032104639e-06,
"loss": 2.0961,
"step": 453500
},
{
"epoch": 16.869797859690845,
"grad_norm": 7.792717933654785,
"learning_rate": 7.82550535077289e-06,
"loss": 2.0875,
"step": 454000
},
{
"epoch": 16.888376932223544,
"grad_norm": 8.0609130859375,
"learning_rate": 7.779057669441142e-06,
"loss": 2.0905,
"step": 454500
},
{
"epoch": 16.906956004756243,
"grad_norm": 7.00869083404541,
"learning_rate": 7.732609988109394e-06,
"loss": 2.0877,
"step": 455000
},
{
"epoch": 16.92553507728894,
"grad_norm": 7.780180931091309,
"learning_rate": 7.686162306777646e-06,
"loss": 2.0956,
"step": 455500
},
{
"epoch": 16.94411414982164,
"grad_norm": 7.056099891662598,
"learning_rate": 7.639714625445898e-06,
"loss": 2.0762,
"step": 456000
},
{
"epoch": 16.96269322235434,
"grad_norm": 6.861847877502441,
"learning_rate": 7.59326694411415e-06,
"loss": 2.0946,
"step": 456500
},
{
"epoch": 16.981272294887038,
"grad_norm": 7.449362754821777,
"learning_rate": 7.546819262782402e-06,
"loss": 2.1104,
"step": 457000
},
{
"epoch": 16.999851367419737,
"grad_norm": 7.2395782470703125,
"learning_rate": 7.500371581450655e-06,
"loss": 2.1044,
"step": 457500
},
{
"epoch": 17.01843043995244,
"grad_norm": 7.514138221740723,
"learning_rate": 7.453923900118906e-06,
"loss": 2.0918,
"step": 458000
},
{
"epoch": 17.037009512485138,
"grad_norm": 6.817535877227783,
"learning_rate": 7.4074762187871585e-06,
"loss": 2.1078,
"step": 458500
},
{
"epoch": 17.055588585017837,
"grad_norm": 7.827926158905029,
"learning_rate": 7.3610285374554115e-06,
"loss": 2.0747,
"step": 459000
},
{
"epoch": 17.074167657550536,
"grad_norm": 9.247724533081055,
"learning_rate": 7.314580856123662e-06,
"loss": 2.0964,
"step": 459500
},
{
"epoch": 17.092746730083235,
"grad_norm": 8.57845687866211,
"learning_rate": 7.268133174791915e-06,
"loss": 2.0876,
"step": 460000
},
{
"epoch": 17.111325802615934,
"grad_norm": 7.123178482055664,
"learning_rate": 7.221685493460167e-06,
"loss": 2.0672,
"step": 460500
},
{
"epoch": 17.129904875148632,
"grad_norm": 7.820250034332275,
"learning_rate": 7.1752378121284185e-06,
"loss": 2.0708,
"step": 461000
},
{
"epoch": 17.14848394768133,
"grad_norm": 7.021051406860352,
"learning_rate": 7.1287901307966716e-06,
"loss": 2.0801,
"step": 461500
},
{
"epoch": 17.16706302021403,
"grad_norm": 8.586702346801758,
"learning_rate": 7.082342449464922e-06,
"loss": 2.0749,
"step": 462000
},
{
"epoch": 17.18564209274673,
"grad_norm": 6.818421363830566,
"learning_rate": 7.035894768133175e-06,
"loss": 2.0916,
"step": 462500
},
{
"epoch": 17.204221165279428,
"grad_norm": 7.275014877319336,
"learning_rate": 6.989447086801427e-06,
"loss": 2.0739,
"step": 463000
},
{
"epoch": 17.222800237812127,
"grad_norm": 6.750241756439209,
"learning_rate": 6.942999405469679e-06,
"loss": 2.0852,
"step": 463500
},
{
"epoch": 17.24137931034483,
"grad_norm": 7.445390701293945,
"learning_rate": 6.896551724137932e-06,
"loss": 2.0748,
"step": 464000
},
{
"epoch": 17.259958382877528,
"grad_norm": 8.087651252746582,
"learning_rate": 6.850104042806184e-06,
"loss": 2.0588,
"step": 464500
},
{
"epoch": 17.278537455410227,
"grad_norm": 7.12742805480957,
"learning_rate": 6.803656361474435e-06,
"loss": 2.074,
"step": 465000
},
{
"epoch": 17.297116527942926,
"grad_norm": 7.231345176696777,
"learning_rate": 6.757208680142687e-06,
"loss": 2.0682,
"step": 465500
},
{
"epoch": 17.315695600475625,
"grad_norm": 7.275602340698242,
"learning_rate": 6.71076099881094e-06,
"loss": 2.0813,
"step": 466000
},
{
"epoch": 17.334274673008323,
"grad_norm": 7.546669006347656,
"learning_rate": 6.664313317479192e-06,
"loss": 2.0839,
"step": 466500
},
{
"epoch": 17.352853745541022,
"grad_norm": 7.166531085968018,
"learning_rate": 6.617865636147444e-06,
"loss": 2.064,
"step": 467000
},
{
"epoch": 17.37143281807372,
"grad_norm": 8.803594589233398,
"learning_rate": 6.571417954815696e-06,
"loss": 2.0847,
"step": 467500
},
{
"epoch": 17.39001189060642,
"grad_norm": 7.301925182342529,
"learning_rate": 6.5249702734839475e-06,
"loss": 2.0938,
"step": 468000
},
{
"epoch": 17.40859096313912,
"grad_norm": 7.235419273376465,
"learning_rate": 6.4785225921522005e-06,
"loss": 2.0824,
"step": 468500
},
{
"epoch": 17.427170035671818,
"grad_norm": 9.021172523498535,
"learning_rate": 6.432074910820453e-06,
"loss": 2.0924,
"step": 469000
},
{
"epoch": 17.445749108204517,
"grad_norm": 7.037625789642334,
"learning_rate": 6.385627229488704e-06,
"loss": 2.0872,
"step": 469500
},
{
"epoch": 17.46432818073722,
"grad_norm": 8.20162296295166,
"learning_rate": 6.339179548156956e-06,
"loss": 2.0755,
"step": 470000
},
{
"epoch": 17.482907253269918,
"grad_norm": 7.615068435668945,
"learning_rate": 6.2927318668252075e-06,
"loss": 2.0826,
"step": 470500
},
{
"epoch": 17.501486325802617,
"grad_norm": 7.641859531402588,
"learning_rate": 6.2462841854934606e-06,
"loss": 2.087,
"step": 471000
},
{
"epoch": 17.520065398335316,
"grad_norm": 6.667306900024414,
"learning_rate": 6.199836504161712e-06,
"loss": 2.0842,
"step": 471500
},
{
"epoch": 17.538644470868014,
"grad_norm": 6.990174770355225,
"learning_rate": 6.153388822829965e-06,
"loss": 2.0861,
"step": 472000
},
{
"epoch": 17.557223543400713,
"grad_norm": 7.540374755859375,
"learning_rate": 6.106941141498216e-06,
"loss": 2.078,
"step": 472500
},
{
"epoch": 17.575802615933412,
"grad_norm": 6.960676670074463,
"learning_rate": 6.0604934601664685e-06,
"loss": 2.0877,
"step": 473000
},
{
"epoch": 17.59438168846611,
"grad_norm": 8.197839736938477,
"learning_rate": 6.0140457788347215e-06,
"loss": 2.0718,
"step": 473500
},
{
"epoch": 17.61296076099881,
"grad_norm": 7.723132610321045,
"learning_rate": 5.967598097502973e-06,
"loss": 2.0793,
"step": 474000
},
{
"epoch": 17.63153983353151,
"grad_norm": 6.541485786437988,
"learning_rate": 5.921150416171225e-06,
"loss": 2.0754,
"step": 474500
},
{
"epoch": 17.650118906064208,
"grad_norm": 7.376631736755371,
"learning_rate": 5.874702734839476e-06,
"loss": 2.0792,
"step": 475000
},
{
"epoch": 17.66869797859691,
"grad_norm": 6.127633094787598,
"learning_rate": 5.828255053507729e-06,
"loss": 2.0492,
"step": 475500
},
{
"epoch": 17.68727705112961,
"grad_norm": 7.734124183654785,
"learning_rate": 5.781807372175982e-06,
"loss": 2.0748,
"step": 476000
},
{
"epoch": 17.705856123662308,
"grad_norm": 6.9572601318359375,
"learning_rate": 5.735359690844233e-06,
"loss": 2.0992,
"step": 476500
},
{
"epoch": 17.724435196195007,
"grad_norm": 6.885385513305664,
"learning_rate": 5.688912009512486e-06,
"loss": 2.0771,
"step": 477000
},
{
"epoch": 17.743014268727705,
"grad_norm": 7.826180458068848,
"learning_rate": 5.642464328180737e-06,
"loss": 2.081,
"step": 477500
},
{
"epoch": 17.761593341260404,
"grad_norm": 7.1644439697265625,
"learning_rate": 5.5960166468489895e-06,
"loss": 2.0847,
"step": 478000
},
{
"epoch": 17.780172413793103,
"grad_norm": 8.081832885742188,
"learning_rate": 5.549568965517242e-06,
"loss": 2.072,
"step": 478500
},
{
"epoch": 17.798751486325802,
"grad_norm": 6.1492600440979,
"learning_rate": 5.503121284185494e-06,
"loss": 2.0919,
"step": 479000
},
{
"epoch": 17.8173305588585,
"grad_norm": 6.837408542633057,
"learning_rate": 5.456673602853746e-06,
"loss": 2.0745,
"step": 479500
},
{
"epoch": 17.8359096313912,
"grad_norm": 6.619295120239258,
"learning_rate": 5.410225921521997e-06,
"loss": 2.0758,
"step": 480000
},
{
"epoch": 17.8544887039239,
"grad_norm": 7.465627193450928,
"learning_rate": 5.3637782401902504e-06,
"loss": 2.0705,
"step": 480500
},
{
"epoch": 17.8730677764566,
"grad_norm": 7.469555854797363,
"learning_rate": 5.317330558858502e-06,
"loss": 2.0675,
"step": 481000
},
{
"epoch": 17.8916468489893,
"grad_norm": 7.39703893661499,
"learning_rate": 5.270882877526754e-06,
"loss": 2.0869,
"step": 481500
},
{
"epoch": 17.910225921522,
"grad_norm": 6.684396743774414,
"learning_rate": 5.224435196195006e-06,
"loss": 2.0782,
"step": 482000
},
{
"epoch": 17.928804994054698,
"grad_norm": 8.273653984069824,
"learning_rate": 5.177987514863258e-06,
"loss": 2.0792,
"step": 482500
},
{
"epoch": 17.947384066587396,
"grad_norm": 7.827981472015381,
"learning_rate": 5.1315398335315105e-06,
"loss": 2.0862,
"step": 483000
},
{
"epoch": 17.965963139120095,
"grad_norm": 7.737405300140381,
"learning_rate": 5.085092152199762e-06,
"loss": 2.0632,
"step": 483500
},
{
"epoch": 17.984542211652794,
"grad_norm": 7.617379665374756,
"learning_rate": 5.038644470868015e-06,
"loss": 2.1037,
"step": 484000
},
{
"epoch": 18.003121284185493,
"grad_norm": 7.147322177886963,
"learning_rate": 4.992196789536266e-06,
"loss": 2.0701,
"step": 484500
},
{
"epoch": 18.021700356718192,
"grad_norm": 6.316223621368408,
"learning_rate": 4.945749108204518e-06,
"loss": 2.0536,
"step": 485000
},
{
"epoch": 18.04027942925089,
"grad_norm": 7.639254093170166,
"learning_rate": 4.8993014268727714e-06,
"loss": 2.0594,
"step": 485500
},
{
"epoch": 18.05885850178359,
"grad_norm": 7.149983882904053,
"learning_rate": 4.852853745541023e-06,
"loss": 2.0546,
"step": 486000
},
{
"epoch": 18.07743757431629,
"grad_norm": 7.123045921325684,
"learning_rate": 4.806406064209275e-06,
"loss": 2.0819,
"step": 486500
},
{
"epoch": 18.09601664684899,
"grad_norm": 7.2495293617248535,
"learning_rate": 4.759958382877526e-06,
"loss": 2.0641,
"step": 487000
},
{
"epoch": 18.11459571938169,
"grad_norm": 7.309257507324219,
"learning_rate": 4.713510701545779e-06,
"loss": 2.0664,
"step": 487500
},
{
"epoch": 18.13317479191439,
"grad_norm": 6.188238620758057,
"learning_rate": 4.6670630202140315e-06,
"loss": 2.0801,
"step": 488000
},
{
"epoch": 18.151753864447087,
"grad_norm": 7.894054889678955,
"learning_rate": 4.620615338882283e-06,
"loss": 2.0576,
"step": 488500
},
{
"epoch": 18.170332936979786,
"grad_norm": 7.271005153656006,
"learning_rate": 4.574167657550536e-06,
"loss": 2.0477,
"step": 489000
},
{
"epoch": 18.188912009512485,
"grad_norm": 7.505859851837158,
"learning_rate": 4.527719976218787e-06,
"loss": 2.0726,
"step": 489500
},
{
"epoch": 18.207491082045184,
"grad_norm": 7.29171085357666,
"learning_rate": 4.4812722948870394e-06,
"loss": 2.0641,
"step": 490000
},
{
"epoch": 18.226070154577883,
"grad_norm": 7.959132671356201,
"learning_rate": 4.434824613555292e-06,
"loss": 2.0704,
"step": 490500
},
{
"epoch": 18.244649227110582,
"grad_norm": 6.843657493591309,
"learning_rate": 4.388376932223544e-06,
"loss": 2.0695,
"step": 491000
},
{
"epoch": 18.26322829964328,
"grad_norm": 6.887396812438965,
"learning_rate": 4.341929250891796e-06,
"loss": 2.0622,
"step": 491500
},
{
"epoch": 18.28180737217598,
"grad_norm": 7.143764019012451,
"learning_rate": 4.295481569560047e-06,
"loss": 2.0729,
"step": 492000
},
{
"epoch": 18.30038644470868,
"grad_norm": 7.79412841796875,
"learning_rate": 4.2490338882283e-06,
"loss": 2.054,
"step": 492500
},
{
"epoch": 18.31896551724138,
"grad_norm": 7.328498363494873,
"learning_rate": 4.202586206896552e-06,
"loss": 2.0582,
"step": 493000
},
{
"epoch": 18.33754458977408,
"grad_norm": 6.897115230560303,
"learning_rate": 4.156138525564804e-06,
"loss": 2.0548,
"step": 493500
},
{
"epoch": 18.35612366230678,
"grad_norm": 7.248096942901611,
"learning_rate": 4.109690844233056e-06,
"loss": 2.0668,
"step": 494000
},
{
"epoch": 18.374702734839477,
"grad_norm": 8.02023983001709,
"learning_rate": 4.063243162901308e-06,
"loss": 2.0527,
"step": 494500
},
{
"epoch": 18.393281807372176,
"grad_norm": 7.436459541320801,
"learning_rate": 4.0167954815695605e-06,
"loss": 2.0685,
"step": 495000
},
{
"epoch": 18.411860879904875,
"grad_norm": 7.74916410446167,
"learning_rate": 3.970347800237812e-06,
"loss": 2.0709,
"step": 495500
},
{
"epoch": 18.430439952437574,
"grad_norm": 8.027193069458008,
"learning_rate": 3.923900118906065e-06,
"loss": 2.0614,
"step": 496000
},
{
"epoch": 18.449019024970273,
"grad_norm": 6.885724067687988,
"learning_rate": 3.877452437574316e-06,
"loss": 2.0519,
"step": 496500
},
{
"epoch": 18.46759809750297,
"grad_norm": 7.010785102844238,
"learning_rate": 3.831004756242568e-06,
"loss": 2.068,
"step": 497000
},
{
"epoch": 18.48617717003567,
"grad_norm": 6.670163631439209,
"learning_rate": 3.78455707491082e-06,
"loss": 2.0695,
"step": 497500
},
{
"epoch": 18.50475624256837,
"grad_norm": 7.944169998168945,
"learning_rate": 3.7381093935790727e-06,
"loss": 2.0723,
"step": 498000
},
{
"epoch": 18.523335315101072,
"grad_norm": 5.955043792724609,
"learning_rate": 3.691661712247325e-06,
"loss": 2.0456,
"step": 498500
},
{
"epoch": 18.54191438763377,
"grad_norm": 7.109121322631836,
"learning_rate": 3.6452140309155767e-06,
"loss": 2.0612,
"step": 499000
},
{
"epoch": 18.56049346016647,
"grad_norm": 6.318941593170166,
"learning_rate": 3.5987663495838293e-06,
"loss": 2.0702,
"step": 499500
},
{
"epoch": 18.57907253269917,
"grad_norm": 8.715611457824707,
"learning_rate": 3.552318668252081e-06,
"loss": 2.08,
"step": 500000
},
{
"epoch": 18.597651605231867,
"grad_norm": 9.135302543640137,
"learning_rate": 3.505870986920333e-06,
"loss": 2.0762,
"step": 500500
},
{
"epoch": 18.616230677764566,
"grad_norm": 7.45161247253418,
"learning_rate": 3.4594233055885854e-06,
"loss": 2.0533,
"step": 501000
},
{
"epoch": 18.634809750297265,
"grad_norm": 8.179544448852539,
"learning_rate": 3.412975624256837e-06,
"loss": 2.0548,
"step": 501500
},
{
"epoch": 18.653388822829964,
"grad_norm": 9.695868492126465,
"learning_rate": 3.3665279429250894e-06,
"loss": 2.0502,
"step": 502000
},
{
"epoch": 18.671967895362663,
"grad_norm": 9.127315521240234,
"learning_rate": 3.320080261593341e-06,
"loss": 2.0559,
"step": 502500
},
{
"epoch": 18.69054696789536,
"grad_norm": 7.58563232421875,
"learning_rate": 3.2736325802615937e-06,
"loss": 2.0618,
"step": 503000
},
{
"epoch": 18.70912604042806,
"grad_norm": 6.781043529510498,
"learning_rate": 3.2271848989298455e-06,
"loss": 2.0654,
"step": 503500
},
{
"epoch": 18.727705112960763,
"grad_norm": 7.929651737213135,
"learning_rate": 3.1807372175980973e-06,
"loss": 2.0493,
"step": 504000
},
{
"epoch": 18.74628418549346,
"grad_norm": 7.395569324493408,
"learning_rate": 3.13428953626635e-06,
"loss": 2.042,
"step": 504500
},
{
"epoch": 18.76486325802616,
"grad_norm": 8.050883293151855,
"learning_rate": 3.087841854934602e-06,
"loss": 2.0625,
"step": 505000
},
{
"epoch": 18.78344233055886,
"grad_norm": 8.531946182250977,
"learning_rate": 3.041394173602854e-06,
"loss": 2.0636,
"step": 505500
},
{
"epoch": 18.80202140309156,
"grad_norm": 7.74788236618042,
"learning_rate": 2.994946492271106e-06,
"loss": 2.0758,
"step": 506000
},
{
"epoch": 18.820600475624257,
"grad_norm": 7.532721996307373,
"learning_rate": 2.9484988109393578e-06,
"loss": 2.0559,
"step": 506500
},
{
"epoch": 18.839179548156956,
"grad_norm": 6.848814487457275,
"learning_rate": 2.90205112960761e-06,
"loss": 2.0552,
"step": 507000
},
{
"epoch": 18.857758620689655,
"grad_norm": 7.606546401977539,
"learning_rate": 2.855603448275862e-06,
"loss": 2.0528,
"step": 507500
},
{
"epoch": 18.876337693222354,
"grad_norm": 7.560408592224121,
"learning_rate": 2.8091557669441143e-06,
"loss": 2.0529,
"step": 508000
},
{
"epoch": 18.894916765755053,
"grad_norm": 8.788424491882324,
"learning_rate": 2.7627080856123665e-06,
"loss": 2.0564,
"step": 508500
},
{
"epoch": 18.91349583828775,
"grad_norm": 6.34813928604126,
"learning_rate": 2.7162604042806183e-06,
"loss": 2.0517,
"step": 509000
},
{
"epoch": 18.93207491082045,
"grad_norm": 7.5938005447387695,
"learning_rate": 2.6698127229488705e-06,
"loss": 2.0615,
"step": 509500
},
{
"epoch": 18.950653983353153,
"grad_norm": 7.773651123046875,
"learning_rate": 2.6233650416171222e-06,
"loss": 2.0572,
"step": 510000
},
{
"epoch": 18.96923305588585,
"grad_norm": 6.474369049072266,
"learning_rate": 2.576917360285375e-06,
"loss": 2.0563,
"step": 510500
},
{
"epoch": 18.98781212841855,
"grad_norm": 7.805785179138184,
"learning_rate": 2.530469678953627e-06,
"loss": 2.0697,
"step": 511000
},
{
"epoch": 19.00639120095125,
"grad_norm": 6.911838054656982,
"learning_rate": 2.484021997621879e-06,
"loss": 2.066,
"step": 511500
},
{
"epoch": 19.024970273483948,
"grad_norm": 7.869637966156006,
"learning_rate": 2.437574316290131e-06,
"loss": 2.0383,
"step": 512000
},
{
"epoch": 19.043549346016647,
"grad_norm": 8.383206367492676,
"learning_rate": 2.3911266349583828e-06,
"loss": 2.0579,
"step": 512500
},
{
"epoch": 19.062128418549346,
"grad_norm": 8.408047676086426,
"learning_rate": 2.344678953626635e-06,
"loss": 2.0651,
"step": 513000
},
{
"epoch": 19.080707491082045,
"grad_norm": 7.509279251098633,
"learning_rate": 2.298231272294887e-06,
"loss": 2.0762,
"step": 513500
},
{
"epoch": 19.099286563614744,
"grad_norm": 7.116653919219971,
"learning_rate": 2.2517835909631393e-06,
"loss": 2.05,
"step": 514000
},
{
"epoch": 19.117865636147442,
"grad_norm": 6.725174427032471,
"learning_rate": 2.2053359096313915e-06,
"loss": 2.0654,
"step": 514500
},
{
"epoch": 19.13644470868014,
"grad_norm": 6.7676544189453125,
"learning_rate": 2.1588882282996433e-06,
"loss": 2.0608,
"step": 515000
},
{
"epoch": 19.15502378121284,
"grad_norm": 7.207517147064209,
"learning_rate": 2.1124405469678954e-06,
"loss": 2.06,
"step": 515500
},
{
"epoch": 19.173602853745543,
"grad_norm": 7.989516735076904,
"learning_rate": 2.065992865636147e-06,
"loss": 2.034,
"step": 516000
},
{
"epoch": 19.19218192627824,
"grad_norm": 7.003942489624023,
"learning_rate": 2.0195451843044e-06,
"loss": 2.0518,
"step": 516500
},
{
"epoch": 19.21076099881094,
"grad_norm": 6.7362236976623535,
"learning_rate": 1.973097502972652e-06,
"loss": 2.0609,
"step": 517000
},
{
"epoch": 19.22934007134364,
"grad_norm": 6.881633758544922,
"learning_rate": 1.9266498216409038e-06,
"loss": 2.0541,
"step": 517500
},
{
"epoch": 19.247919143876338,
"grad_norm": 7.07053279876709,
"learning_rate": 1.880202140309156e-06,
"loss": 2.037,
"step": 518000
},
{
"epoch": 19.266498216409037,
"grad_norm": 7.328449249267578,
"learning_rate": 1.833754458977408e-06,
"loss": 2.0379,
"step": 518500
},
{
"epoch": 19.285077288941736,
"grad_norm": 7.447302341461182,
"learning_rate": 1.7873067776456601e-06,
"loss": 2.0461,
"step": 519000
},
{
"epoch": 19.303656361474435,
"grad_norm": 6.588597774505615,
"learning_rate": 1.7408590963139119e-06,
"loss": 2.0395,
"step": 519500
},
{
"epoch": 19.322235434007133,
"grad_norm": 6.768307685852051,
"learning_rate": 1.694411414982164e-06,
"loss": 2.0747,
"step": 520000
},
{
"epoch": 19.340814506539832,
"grad_norm": 7.688553810119629,
"learning_rate": 1.6479637336504165e-06,
"loss": 2.0446,
"step": 520500
},
{
"epoch": 19.35939357907253,
"grad_norm": 8.146514892578125,
"learning_rate": 1.6015160523186682e-06,
"loss": 2.0389,
"step": 521000
},
{
"epoch": 19.377972651605234,
"grad_norm": 8.384140968322754,
"learning_rate": 1.5550683709869204e-06,
"loss": 2.0367,
"step": 521500
},
{
"epoch": 19.396551724137932,
"grad_norm": 7.63324499130249,
"learning_rate": 1.5086206896551726e-06,
"loss": 2.0461,
"step": 522000
},
{
"epoch": 19.41513079667063,
"grad_norm": 6.784617900848389,
"learning_rate": 1.4621730083234246e-06,
"loss": 2.0453,
"step": 522500
},
{
"epoch": 19.43370986920333,
"grad_norm": 6.640545845031738,
"learning_rate": 1.4157253269916766e-06,
"loss": 2.0388,
"step": 523000
},
{
"epoch": 19.45228894173603,
"grad_norm": 6.723217487335205,
"learning_rate": 1.3692776456599287e-06,
"loss": 2.0414,
"step": 523500
},
{
"epoch": 19.470868014268728,
"grad_norm": 6.989643573760986,
"learning_rate": 1.3228299643281807e-06,
"loss": 2.055,
"step": 524000
},
{
"epoch": 19.489447086801427,
"grad_norm": 6.394150257110596,
"learning_rate": 1.276382282996433e-06,
"loss": 2.0539,
"step": 524500
},
{
"epoch": 19.508026159334126,
"grad_norm": 7.73260498046875,
"learning_rate": 1.229934601664685e-06,
"loss": 2.0549,
"step": 525000
},
{
"epoch": 19.526605231866824,
"grad_norm": 7.458393096923828,
"learning_rate": 1.183486920332937e-06,
"loss": 2.0486,
"step": 525500
},
{
"epoch": 19.545184304399523,
"grad_norm": 7.173522472381592,
"learning_rate": 1.137039239001189e-06,
"loss": 2.0644,
"step": 526000
},
{
"epoch": 19.563763376932222,
"grad_norm": 7.556340217590332,
"learning_rate": 1.0905915576694412e-06,
"loss": 2.0456,
"step": 526500
},
{
"epoch": 19.582342449464925,
"grad_norm": 8.111367225646973,
"learning_rate": 1.0441438763376932e-06,
"loss": 2.0397,
"step": 527000
},
{
"epoch": 19.600921521997623,
"grad_norm": 6.623202323913574,
"learning_rate": 9.976961950059454e-07,
"loss": 2.0488,
"step": 527500
},
{
"epoch": 19.619500594530322,
"grad_norm": 7.327664375305176,
"learning_rate": 9.512485136741974e-07,
"loss": 2.037,
"step": 528000
},
{
"epoch": 19.63807966706302,
"grad_norm": 7.518941879272461,
"learning_rate": 9.048008323424495e-07,
"loss": 2.0694,
"step": 528500
},
{
"epoch": 19.65665873959572,
"grad_norm": 7.00496244430542,
"learning_rate": 8.583531510107016e-07,
"loss": 2.0459,
"step": 529000
},
{
"epoch": 19.67523781212842,
"grad_norm": 7.160311222076416,
"learning_rate": 8.119054696789537e-07,
"loss": 2.029,
"step": 529500
},
{
"epoch": 19.693816884661118,
"grad_norm": 7.951440811157227,
"learning_rate": 7.654577883472057e-07,
"loss": 2.0529,
"step": 530000
},
{
"epoch": 19.712395957193817,
"grad_norm": 7.71318244934082,
"learning_rate": 7.190101070154579e-07,
"loss": 2.057,
"step": 530500
},
{
"epoch": 19.730975029726515,
"grad_norm": 7.5362043380737305,
"learning_rate": 6.7256242568371e-07,
"loss": 2.0493,
"step": 531000
},
{
"epoch": 19.749554102259214,
"grad_norm": 8.362653732299805,
"learning_rate": 6.261147443519619e-07,
"loss": 2.0467,
"step": 531500
},
{
"epoch": 19.768133174791913,
"grad_norm": 7.16049337387085,
"learning_rate": 5.79667063020214e-07,
"loss": 2.0346,
"step": 532000
},
{
"epoch": 19.786712247324612,
"grad_norm": 7.634875297546387,
"learning_rate": 5.332193816884662e-07,
"loss": 2.035,
"step": 532500
},
{
"epoch": 19.805291319857314,
"grad_norm": 7.416409015655518,
"learning_rate": 4.867717003567182e-07,
"loss": 2.0513,
"step": 533000
},
{
"epoch": 19.823870392390013,
"grad_norm": 6.575763702392578,
"learning_rate": 4.4032401902497025e-07,
"loss": 2.0582,
"step": 533500
},
{
"epoch": 19.842449464922712,
"grad_norm": 7.2025909423828125,
"learning_rate": 3.938763376932224e-07,
"loss": 2.0534,
"step": 534000
},
{
"epoch": 19.86102853745541,
"grad_norm": 7.560851573944092,
"learning_rate": 3.4742865636147446e-07,
"loss": 2.0566,
"step": 534500
},
{
"epoch": 19.87960760998811,
"grad_norm": 7.525179386138916,
"learning_rate": 3.0098097502972654e-07,
"loss": 2.0374,
"step": 535000
},
{
"epoch": 19.89818668252081,
"grad_norm": 6.616377830505371,
"learning_rate": 2.5453329369797857e-07,
"loss": 2.0483,
"step": 535500
},
{
"epoch": 19.916765755053508,
"grad_norm": 7.127399444580078,
"learning_rate": 2.0808561236623068e-07,
"loss": 2.0612,
"step": 536000
},
{
"epoch": 19.935344827586206,
"grad_norm": 8.101058959960938,
"learning_rate": 1.6163793103448276e-07,
"loss": 2.046,
"step": 536500
},
{
"epoch": 19.953923900118905,
"grad_norm": 7.135190010070801,
"learning_rate": 1.1519024970273484e-07,
"loss": 2.0619,
"step": 537000
},
{
"epoch": 19.972502972651604,
"grad_norm": 7.481634616851807,
"learning_rate": 6.874256837098692e-08,
"loss": 2.0529,
"step": 537500
},
{
"epoch": 19.991082045184303,
"grad_norm": 7.486691474914551,
"learning_rate": 2.2294887039239002e-08,
"loss": 2.0536,
"step": 538000
}
],
"logging_steps": 500,
"max_steps": 538240,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.258410922006282e+17,
"train_batch_size": 46,
"trial_name": null,
"trial_params": null
}