KoseiUemura's picture
Add checkpoint checkpoint-50000
e1242cc verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.4109148371804279,
"eval_steps": 500,
"global_step": 50000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028218296743608554,
"grad_norm": 175.4037628173828,
"learning_rate": 1.3261851015801355e-07,
"loss": 3.1599,
"step": 100
},
{
"epoch": 0.005643659348721711,
"grad_norm": 24.378341674804688,
"learning_rate": 2.737020316027088e-07,
"loss": 1.848,
"step": 200
},
{
"epoch": 0.008465489023082567,
"grad_norm": 27.120737075805664,
"learning_rate": 4.147855530474041e-07,
"loss": 1.2576,
"step": 300
},
{
"epoch": 0.011287318697443422,
"grad_norm": 17.9366397857666,
"learning_rate": 5.558690744920993e-07,
"loss": 1.0372,
"step": 400
},
{
"epoch": 0.014109148371804279,
"grad_norm": 24.153575897216797,
"learning_rate": 6.969525959367947e-07,
"loss": 0.9444,
"step": 500
},
{
"epoch": 0.016930978046165134,
"grad_norm": 15.082656860351562,
"learning_rate": 8.3803611738149e-07,
"loss": 0.8119,
"step": 600
},
{
"epoch": 0.01975280772052599,
"grad_norm": 23.82472801208496,
"learning_rate": 9.79119638826185e-07,
"loss": 0.7448,
"step": 700
},
{
"epoch": 0.022574637394886844,
"grad_norm": 46.331058502197266,
"learning_rate": 1.1187923250564336e-06,
"loss": 0.7746,
"step": 800
},
{
"epoch": 0.0253964670692477,
"grad_norm": 29.84396743774414,
"learning_rate": 1.2598758465011287e-06,
"loss": 0.6949,
"step": 900
},
{
"epoch": 0.028218296743608557,
"grad_norm": 34.73414993286133,
"learning_rate": 1.4009593679458242e-06,
"loss": 0.6737,
"step": 1000
},
{
"epoch": 0.03104012641796941,
"grad_norm": 33.15461349487305,
"learning_rate": 1.5420428893905193e-06,
"loss": 0.6346,
"step": 1100
},
{
"epoch": 0.03386195609233027,
"grad_norm": 61.0333251953125,
"learning_rate": 1.6831264108352146e-06,
"loss": 0.6682,
"step": 1200
},
{
"epoch": 0.036683785766691124,
"grad_norm": 28.32975196838379,
"learning_rate": 1.8242099322799097e-06,
"loss": 0.6747,
"step": 1300
},
{
"epoch": 0.03950561544105198,
"grad_norm": 26.81439781188965,
"learning_rate": 1.9652934537246053e-06,
"loss": 0.668,
"step": 1400
},
{
"epoch": 0.04232744511541283,
"grad_norm": 12.049301147460938,
"learning_rate": 2.1063769751693e-06,
"loss": 0.624,
"step": 1500
},
{
"epoch": 0.04514927478977369,
"grad_norm": 24.25949478149414,
"learning_rate": 2.2474604966139955e-06,
"loss": 0.5744,
"step": 1600
},
{
"epoch": 0.047971104464134544,
"grad_norm": 28.51569938659668,
"learning_rate": 2.3885440180586912e-06,
"loss": 0.6342,
"step": 1700
},
{
"epoch": 0.0507929341384954,
"grad_norm": 29.432788848876953,
"learning_rate": 2.5296275395033865e-06,
"loss": 0.6092,
"step": 1800
},
{
"epoch": 0.05361476381285626,
"grad_norm": 38.4299430847168,
"learning_rate": 2.6707110609480814e-06,
"loss": 0.5947,
"step": 1900
},
{
"epoch": 0.056436593487217114,
"grad_norm": 76.33415222167969,
"learning_rate": 2.8117945823927768e-06,
"loss": 0.6018,
"step": 2000
},
{
"epoch": 0.059258423161577964,
"grad_norm": 26.28867530822754,
"learning_rate": 2.952878103837472e-06,
"loss": 0.5609,
"step": 2100
},
{
"epoch": 0.06208025283593882,
"grad_norm": 27.49398422241211,
"learning_rate": 3.0939616252821674e-06,
"loss": 0.5916,
"step": 2200
},
{
"epoch": 0.06490208251029968,
"grad_norm": 27.57271385192871,
"learning_rate": 3.2350451467268623e-06,
"loss": 0.5775,
"step": 2300
},
{
"epoch": 0.06772391218466053,
"grad_norm": 40.42005157470703,
"learning_rate": 3.3761286681715576e-06,
"loss": 0.5847,
"step": 2400
},
{
"epoch": 0.07054574185902139,
"grad_norm": 23.6585693359375,
"learning_rate": 3.517212189616253e-06,
"loss": 0.5808,
"step": 2500
},
{
"epoch": 0.07336757153338225,
"grad_norm": 21.844501495361328,
"learning_rate": 3.6582957110609487e-06,
"loss": 0.562,
"step": 2600
},
{
"epoch": 0.0761894012077431,
"grad_norm": 26.929052352905273,
"learning_rate": 3.799379232505644e-06,
"loss": 0.569,
"step": 2700
},
{
"epoch": 0.07901123088210396,
"grad_norm": 44.13679504394531,
"learning_rate": 3.9404627539503385e-06,
"loss": 0.6041,
"step": 2800
},
{
"epoch": 0.08183306055646482,
"grad_norm": 26.586490631103516,
"learning_rate": 4.081546275395034e-06,
"loss": 0.5796,
"step": 2900
},
{
"epoch": 0.08465489023082566,
"grad_norm": 25.3786563873291,
"learning_rate": 4.222629796839729e-06,
"loss": 0.5912,
"step": 3000
},
{
"epoch": 0.08747671990518652,
"grad_norm": 31.699079513549805,
"learning_rate": 4.363713318284425e-06,
"loss": 0.5506,
"step": 3100
},
{
"epoch": 0.09029854957954737,
"grad_norm": 38.314876556396484,
"learning_rate": 4.50479683972912e-06,
"loss": 0.5687,
"step": 3200
},
{
"epoch": 0.09312037925390823,
"grad_norm": 31.68910789489746,
"learning_rate": 4.6458803611738155e-06,
"loss": 0.5463,
"step": 3300
},
{
"epoch": 0.09594220892826909,
"grad_norm": 34.52235412597656,
"learning_rate": 4.78696388261851e-06,
"loss": 0.5681,
"step": 3400
},
{
"epoch": 0.09876403860262994,
"grad_norm": 52.666526794433594,
"learning_rate": 4.928047404063206e-06,
"loss": 0.5819,
"step": 3500
},
{
"epoch": 0.1015858682769908,
"grad_norm": 55.63333511352539,
"learning_rate": 5.069130925507901e-06,
"loss": 0.5496,
"step": 3600
},
{
"epoch": 0.10440769795135166,
"grad_norm": 30.57868003845215,
"learning_rate": 5.210214446952596e-06,
"loss": 0.548,
"step": 3700
},
{
"epoch": 0.10722952762571251,
"grad_norm": 27.27458953857422,
"learning_rate": 5.3512979683972925e-06,
"loss": 0.5183,
"step": 3800
},
{
"epoch": 0.11005135730007337,
"grad_norm": 48.254302978515625,
"learning_rate": 5.4923814898419865e-06,
"loss": 0.5295,
"step": 3900
},
{
"epoch": 0.11287318697443423,
"grad_norm": 66.4515151977539,
"learning_rate": 5.6334650112866814e-06,
"loss": 0.5385,
"step": 4000
},
{
"epoch": 0.11569501664879508,
"grad_norm": 15.82089900970459,
"learning_rate": 5.774548532731378e-06,
"loss": 0.4862,
"step": 4100
},
{
"epoch": 0.11851684632315593,
"grad_norm": 14.725001335144043,
"learning_rate": 5.915632054176073e-06,
"loss": 0.5143,
"step": 4200
},
{
"epoch": 0.12133867599751678,
"grad_norm": 11.792068481445312,
"learning_rate": 6.056715575620769e-06,
"loss": 0.5314,
"step": 4300
},
{
"epoch": 0.12416050567187764,
"grad_norm": 29.84812355041504,
"learning_rate": 6.1977990970654636e-06,
"loss": 0.5633,
"step": 4400
},
{
"epoch": 0.1269823353462385,
"grad_norm": 27.085613250732422,
"learning_rate": 6.3388826185101585e-06,
"loss": 0.5071,
"step": 4500
},
{
"epoch": 0.12980416502059935,
"grad_norm": 31.483890533447266,
"learning_rate": 6.479966139954854e-06,
"loss": 0.5577,
"step": 4600
},
{
"epoch": 0.13262599469496023,
"grad_norm": 17.639286041259766,
"learning_rate": 6.621049661399549e-06,
"loss": 0.5347,
"step": 4700
},
{
"epoch": 0.13544782436932107,
"grad_norm": 27.730653762817383,
"learning_rate": 6.762133182844244e-06,
"loss": 0.516,
"step": 4800
},
{
"epoch": 0.1382696540436819,
"grad_norm": 36.5928840637207,
"learning_rate": 6.90321670428894e-06,
"loss": 0.4875,
"step": 4900
},
{
"epoch": 0.14109148371804278,
"grad_norm": 16.40862274169922,
"learning_rate": 7.044300225733635e-06,
"loss": 0.4683,
"step": 5000
},
{
"epoch": 0.14391331339240362,
"grad_norm": 26.178585052490234,
"learning_rate": 7.18538374717833e-06,
"loss": 0.5131,
"step": 5100
},
{
"epoch": 0.1467351430667645,
"grad_norm": 31.325714111328125,
"learning_rate": 7.326467268623025e-06,
"loss": 0.4667,
"step": 5200
},
{
"epoch": 0.14955697274112534,
"grad_norm": 32.41719055175781,
"learning_rate": 7.46755079006772e-06,
"loss": 0.5131,
"step": 5300
},
{
"epoch": 0.1523788024154862,
"grad_norm": 19.27738380432129,
"learning_rate": 7.608634311512416e-06,
"loss": 0.4934,
"step": 5400
},
{
"epoch": 0.15520063208984705,
"grad_norm": 18.57884407043457,
"learning_rate": 7.74971783295711e-06,
"loss": 0.5104,
"step": 5500
},
{
"epoch": 0.15802246176420792,
"grad_norm": 23.134544372558594,
"learning_rate": 7.890801354401807e-06,
"loss": 0.5355,
"step": 5600
},
{
"epoch": 0.16084429143856876,
"grad_norm": 31.898052215576172,
"learning_rate": 8.031884875846502e-06,
"loss": 0.4796,
"step": 5700
},
{
"epoch": 0.16366612111292964,
"grad_norm": 25.020967483520508,
"learning_rate": 8.172968397291197e-06,
"loss": 0.5195,
"step": 5800
},
{
"epoch": 0.16648795078729048,
"grad_norm": 33.08086395263672,
"learning_rate": 8.314051918735892e-06,
"loss": 0.5263,
"step": 5900
},
{
"epoch": 0.16930978046165132,
"grad_norm": 30.09421730041504,
"learning_rate": 8.455135440180587e-06,
"loss": 0.4806,
"step": 6000
},
{
"epoch": 0.1721316101360122,
"grad_norm": 27.095197677612305,
"learning_rate": 8.596218961625284e-06,
"loss": 0.5154,
"step": 6100
},
{
"epoch": 0.17495343981037303,
"grad_norm": 30.298784255981445,
"learning_rate": 8.737302483069978e-06,
"loss": 0.5072,
"step": 6200
},
{
"epoch": 0.1777752694847339,
"grad_norm": 21.500614166259766,
"learning_rate": 8.878386004514673e-06,
"loss": 0.4962,
"step": 6300
},
{
"epoch": 0.18059709915909475,
"grad_norm": 19.31947898864746,
"learning_rate": 9.019469525959368e-06,
"loss": 0.5303,
"step": 6400
},
{
"epoch": 0.18341892883345562,
"grad_norm": 43.50458526611328,
"learning_rate": 9.160553047404063e-06,
"loss": 0.4585,
"step": 6500
},
{
"epoch": 0.18624075850781646,
"grad_norm": 45.34175491333008,
"learning_rate": 9.30163656884876e-06,
"loss": 0.5084,
"step": 6600
},
{
"epoch": 0.18906258818217733,
"grad_norm": 45.26927185058594,
"learning_rate": 9.442720090293455e-06,
"loss": 0.4849,
"step": 6700
},
{
"epoch": 0.19188441785653818,
"grad_norm": 42.85562515258789,
"learning_rate": 9.58380361173815e-06,
"loss": 0.5519,
"step": 6800
},
{
"epoch": 0.19470624753089905,
"grad_norm": 21.38568687438965,
"learning_rate": 9.724887133182846e-06,
"loss": 0.5143,
"step": 6900
},
{
"epoch": 0.1975280772052599,
"grad_norm": 40.83327865600586,
"learning_rate": 9.865970654627541e-06,
"loss": 0.5234,
"step": 7000
},
{
"epoch": 0.20034990687962076,
"grad_norm": 18.91502571105957,
"learning_rate": 9.999216153508497e-06,
"loss": 0.5068,
"step": 7100
},
{
"epoch": 0.2031717365539816,
"grad_norm": 21.48363494873047,
"learning_rate": 9.983539223678435e-06,
"loss": 0.5049,
"step": 7200
},
{
"epoch": 0.20599356622834245,
"grad_norm": 36.2054443359375,
"learning_rate": 9.967862293848373e-06,
"loss": 0.5111,
"step": 7300
},
{
"epoch": 0.20881539590270332,
"grad_norm": 33.61429214477539,
"learning_rate": 9.952185364018311e-06,
"loss": 0.4963,
"step": 7400
},
{
"epoch": 0.21163722557706416,
"grad_norm": 26.054624557495117,
"learning_rate": 9.93650843418825e-06,
"loss": 0.4979,
"step": 7500
},
{
"epoch": 0.21445905525142503,
"grad_norm": 22.114206314086914,
"learning_rate": 9.920831504358187e-06,
"loss": 0.4938,
"step": 7600
},
{
"epoch": 0.21728088492578587,
"grad_norm": 25.162818908691406,
"learning_rate": 9.905154574528125e-06,
"loss": 0.5034,
"step": 7700
},
{
"epoch": 0.22010271460014674,
"grad_norm": 32.36638259887695,
"learning_rate": 9.889477644698063e-06,
"loss": 0.4968,
"step": 7800
},
{
"epoch": 0.22292454427450759,
"grad_norm": 12.753120422363281,
"learning_rate": 9.873800714868001e-06,
"loss": 0.5068,
"step": 7900
},
{
"epoch": 0.22574637394886846,
"grad_norm": 24.235403060913086,
"learning_rate": 9.85812378503794e-06,
"loss": 0.5157,
"step": 8000
},
{
"epoch": 0.2285682036232293,
"grad_norm": 29.90271759033203,
"learning_rate": 9.842446855207877e-06,
"loss": 0.5183,
"step": 8100
},
{
"epoch": 0.23139003329759017,
"grad_norm": 29.402860641479492,
"learning_rate": 9.826769925377815e-06,
"loss": 0.4724,
"step": 8200
},
{
"epoch": 0.234211862971951,
"grad_norm": 25.902576446533203,
"learning_rate": 9.811092995547753e-06,
"loss": 0.5222,
"step": 8300
},
{
"epoch": 0.23703369264631186,
"grad_norm": 22.64885711669922,
"learning_rate": 9.795416065717691e-06,
"loss": 0.5079,
"step": 8400
},
{
"epoch": 0.23985552232067273,
"grad_norm": 29.80979347229004,
"learning_rate": 9.779739135887628e-06,
"loss": 0.4755,
"step": 8500
},
{
"epoch": 0.24267735199503357,
"grad_norm": 31.037330627441406,
"learning_rate": 9.764062206057566e-06,
"loss": 0.4869,
"step": 8600
},
{
"epoch": 0.24549918166939444,
"grad_norm": 24.076200485229492,
"learning_rate": 9.748385276227504e-06,
"loss": 0.4911,
"step": 8700
},
{
"epoch": 0.24832101134375528,
"grad_norm": 26.947887420654297,
"learning_rate": 9.732708346397442e-06,
"loss": 0.4852,
"step": 8800
},
{
"epoch": 0.2511428410181161,
"grad_norm": 18.701513290405273,
"learning_rate": 9.71703141656738e-06,
"loss": 0.5125,
"step": 8900
},
{
"epoch": 0.253964670692477,
"grad_norm": 20.67565155029297,
"learning_rate": 9.701354486737318e-06,
"loss": 0.5037,
"step": 9000
},
{
"epoch": 0.25678650036683787,
"grad_norm": 17.804229736328125,
"learning_rate": 9.685677556907256e-06,
"loss": 0.5024,
"step": 9100
},
{
"epoch": 0.2596083300411987,
"grad_norm": 25.94413185119629,
"learning_rate": 9.670000627077194e-06,
"loss": 0.5133,
"step": 9200
},
{
"epoch": 0.26243015971555955,
"grad_norm": 20.22077178955078,
"learning_rate": 9.654323697247132e-06,
"loss": 0.4844,
"step": 9300
},
{
"epoch": 0.26525198938992045,
"grad_norm": 32.44327926635742,
"learning_rate": 9.63864676741707e-06,
"loss": 0.4573,
"step": 9400
},
{
"epoch": 0.2680738190642813,
"grad_norm": 10.91318130493164,
"learning_rate": 9.622969837587008e-06,
"loss": 0.4434,
"step": 9500
},
{
"epoch": 0.27089564873864214,
"grad_norm": 13.019550323486328,
"learning_rate": 9.607292907756946e-06,
"loss": 0.4943,
"step": 9600
},
{
"epoch": 0.273717478413003,
"grad_norm": 15.419180870056152,
"learning_rate": 9.591615977926884e-06,
"loss": 0.492,
"step": 9700
},
{
"epoch": 0.2765393080873638,
"grad_norm": 24.357208251953125,
"learning_rate": 9.575939048096822e-06,
"loss": 0.4474,
"step": 9800
},
{
"epoch": 0.2793611377617247,
"grad_norm": 33.39434051513672,
"learning_rate": 9.56026211826676e-06,
"loss": 0.4623,
"step": 9900
},
{
"epoch": 0.28218296743608556,
"grad_norm": 32.260337829589844,
"learning_rate": 9.544585188436696e-06,
"loss": 0.501,
"step": 10000
},
{
"epoch": 0.2850047971104464,
"grad_norm": 29.994497299194336,
"learning_rate": 9.528908258606634e-06,
"loss": 0.4795,
"step": 10100
},
{
"epoch": 0.28782662678480725,
"grad_norm": 51.09225845336914,
"learning_rate": 9.513388098074874e-06,
"loss": 0.4829,
"step": 10200
},
{
"epoch": 0.29064845645916815,
"grad_norm": 17.05582618713379,
"learning_rate": 9.497711168244812e-06,
"loss": 0.5194,
"step": 10300
},
{
"epoch": 0.293470286133529,
"grad_norm": 47.8975830078125,
"learning_rate": 9.48203423841475e-06,
"loss": 0.4935,
"step": 10400
},
{
"epoch": 0.29629211580788983,
"grad_norm": 20.064878463745117,
"learning_rate": 9.466514077882987e-06,
"loss": 0.4558,
"step": 10500
},
{
"epoch": 0.2991139454822507,
"grad_norm": 30.60138511657715,
"learning_rate": 9.450837148052927e-06,
"loss": 0.4908,
"step": 10600
},
{
"epoch": 0.3019357751566116,
"grad_norm": 12.470020294189453,
"learning_rate": 9.435160218222865e-06,
"loss": 0.4521,
"step": 10700
},
{
"epoch": 0.3047576048309724,
"grad_norm": 26.887653350830078,
"learning_rate": 9.419483288392803e-06,
"loss": 0.3894,
"step": 10800
},
{
"epoch": 0.30757943450533326,
"grad_norm": 26.63880157470703,
"learning_rate": 9.40380635856274e-06,
"loss": 0.4649,
"step": 10900
},
{
"epoch": 0.3104012641796941,
"grad_norm": 29.928979873657227,
"learning_rate": 9.388129428732677e-06,
"loss": 0.435,
"step": 11000
},
{
"epoch": 0.31322309385405495,
"grad_norm": 30.5889949798584,
"learning_rate": 9.372452498902615e-06,
"loss": 0.445,
"step": 11100
},
{
"epoch": 0.31604492352841584,
"grad_norm": 31.825000762939453,
"learning_rate": 9.356775569072553e-06,
"loss": 0.4913,
"step": 11200
},
{
"epoch": 0.3188667532027767,
"grad_norm": 40.66577911376953,
"learning_rate": 9.341098639242491e-06,
"loss": 0.4763,
"step": 11300
},
{
"epoch": 0.32168858287713753,
"grad_norm": 17.6318359375,
"learning_rate": 9.32542170941243e-06,
"loss": 0.4989,
"step": 11400
},
{
"epoch": 0.3245104125514984,
"grad_norm": 26.51487922668457,
"learning_rate": 9.309744779582367e-06,
"loss": 0.4486,
"step": 11500
},
{
"epoch": 0.32733224222585927,
"grad_norm": 3.3657779693603516,
"learning_rate": 9.294067849752305e-06,
"loss": 0.4233,
"step": 11600
},
{
"epoch": 0.3301540719002201,
"grad_norm": 34.170921325683594,
"learning_rate": 9.278390919922243e-06,
"loss": 0.4867,
"step": 11700
},
{
"epoch": 0.33297590157458096,
"grad_norm": 23.013031005859375,
"learning_rate": 9.262713990092181e-06,
"loss": 0.468,
"step": 11800
},
{
"epoch": 0.3357977312489418,
"grad_norm": 16.811792373657227,
"learning_rate": 9.24703706026212e-06,
"loss": 0.4735,
"step": 11900
},
{
"epoch": 0.33861956092330264,
"grad_norm": 29.813079833984375,
"learning_rate": 9.231360130432057e-06,
"loss": 0.4386,
"step": 12000
},
{
"epoch": 0.34144139059766354,
"grad_norm": 29.00061798095703,
"learning_rate": 9.215683200601995e-06,
"loss": 0.4741,
"step": 12100
},
{
"epoch": 0.3442632202720244,
"grad_norm": 17.047637939453125,
"learning_rate": 9.200006270771933e-06,
"loss": 0.4525,
"step": 12200
},
{
"epoch": 0.3470850499463852,
"grad_norm": 16.266225814819336,
"learning_rate": 9.184329340941871e-06,
"loss": 0.4348,
"step": 12300
},
{
"epoch": 0.34990687962074607,
"grad_norm": 31.315366744995117,
"learning_rate": 9.16865241111181e-06,
"loss": 0.4497,
"step": 12400
},
{
"epoch": 0.35272870929510697,
"grad_norm": 14.484158515930176,
"learning_rate": 9.152975481281746e-06,
"loss": 0.4492,
"step": 12500
},
{
"epoch": 0.3555505389694678,
"grad_norm": 14.117356300354004,
"learning_rate": 9.137298551451684e-06,
"loss": 0.4731,
"step": 12600
},
{
"epoch": 0.35837236864382865,
"grad_norm": 18.32455825805664,
"learning_rate": 9.121621621621622e-06,
"loss": 0.4952,
"step": 12700
},
{
"epoch": 0.3611941983181895,
"grad_norm": 22.009477615356445,
"learning_rate": 9.10594469179156e-06,
"loss": 0.4374,
"step": 12800
},
{
"epoch": 0.3640160279925504,
"grad_norm": 44.56202697753906,
"learning_rate": 9.0902677619615e-06,
"loss": 0.4576,
"step": 12900
},
{
"epoch": 0.36683785766691124,
"grad_norm": 32.11207962036133,
"learning_rate": 9.074590832131436e-06,
"loss": 0.4477,
"step": 13000
},
{
"epoch": 0.3696596873412721,
"grad_norm": 18.823020935058594,
"learning_rate": 9.058913902301374e-06,
"loss": 0.4628,
"step": 13100
},
{
"epoch": 0.3724815170156329,
"grad_norm": 6.9682793617248535,
"learning_rate": 9.043236972471312e-06,
"loss": 0.4405,
"step": 13200
},
{
"epoch": 0.37530334668999377,
"grad_norm": 56.02937698364258,
"learning_rate": 9.027873581237852e-06,
"loss": 0.4235,
"step": 13300
},
{
"epoch": 0.37812517636435466,
"grad_norm": 17.02280044555664,
"learning_rate": 9.012196651407788e-06,
"loss": 0.4659,
"step": 13400
},
{
"epoch": 0.3809470060387155,
"grad_norm": 127.55671691894531,
"learning_rate": 8.996519721577726e-06,
"loss": 0.4209,
"step": 13500
},
{
"epoch": 0.38376883571307635,
"grad_norm": 23.909799575805664,
"learning_rate": 8.980842791747665e-06,
"loss": 0.4412,
"step": 13600
},
{
"epoch": 0.3865906653874372,
"grad_norm": 42.155277252197266,
"learning_rate": 8.965165861917603e-06,
"loss": 0.4995,
"step": 13700
},
{
"epoch": 0.3894124950617981,
"grad_norm": 21.608457565307617,
"learning_rate": 8.94948893208754e-06,
"loss": 0.4857,
"step": 13800
},
{
"epoch": 0.39223432473615893,
"grad_norm": 25.70061492919922,
"learning_rate": 8.933812002257479e-06,
"loss": 0.4607,
"step": 13900
},
{
"epoch": 0.3950561544105198,
"grad_norm": 19.82216453552246,
"learning_rate": 8.918135072427417e-06,
"loss": 0.4585,
"step": 14000
},
{
"epoch": 0.3978779840848806,
"grad_norm": 17.299297332763672,
"learning_rate": 8.902458142597355e-06,
"loss": 0.4895,
"step": 14100
},
{
"epoch": 0.4006998137592415,
"grad_norm": 20.835285186767578,
"learning_rate": 8.886781212767293e-06,
"loss": 0.4418,
"step": 14200
},
{
"epoch": 0.40352164343360236,
"grad_norm": 14.876681327819824,
"learning_rate": 8.87110428293723e-06,
"loss": 0.4235,
"step": 14300
},
{
"epoch": 0.4063434731079632,
"grad_norm": 44.82238006591797,
"learning_rate": 8.855427353107167e-06,
"loss": 0.4812,
"step": 14400
},
{
"epoch": 0.40916530278232405,
"grad_norm": 30.035110473632812,
"learning_rate": 8.839750423277105e-06,
"loss": 0.4643,
"step": 14500
},
{
"epoch": 0.4119871324566849,
"grad_norm": 24.600914001464844,
"learning_rate": 8.824073493447045e-06,
"loss": 0.4672,
"step": 14600
},
{
"epoch": 0.4148089621310458,
"grad_norm": 7.020761966705322,
"learning_rate": 8.808396563616983e-06,
"loss": 0.4084,
"step": 14700
},
{
"epoch": 0.41763079180540663,
"grad_norm": 31.321611404418945,
"learning_rate": 8.79271963378692e-06,
"loss": 0.4673,
"step": 14800
},
{
"epoch": 0.4204526214797675,
"grad_norm": 26.225324630737305,
"learning_rate": 8.777042703956857e-06,
"loss": 0.4656,
"step": 14900
},
{
"epoch": 0.4232744511541283,
"grad_norm": 21.322677612304688,
"learning_rate": 8.761365774126795e-06,
"loss": 0.4201,
"step": 15000
},
{
"epoch": 0.4260962808284892,
"grad_norm": 9.067322731018066,
"learning_rate": 8.745688844296733e-06,
"loss": 0.4521,
"step": 15100
},
{
"epoch": 0.42891811050285006,
"grad_norm": 29.99122428894043,
"learning_rate": 8.730011914466671e-06,
"loss": 0.4767,
"step": 15200
},
{
"epoch": 0.4317399401772109,
"grad_norm": 19.088232040405273,
"learning_rate": 8.71433498463661e-06,
"loss": 0.4604,
"step": 15300
},
{
"epoch": 0.43456176985157174,
"grad_norm": 43.28144836425781,
"learning_rate": 8.698658054806547e-06,
"loss": 0.4886,
"step": 15400
},
{
"epoch": 0.4373835995259326,
"grad_norm": 23.3623046875,
"learning_rate": 8.682981124976485e-06,
"loss": 0.461,
"step": 15500
},
{
"epoch": 0.4402054292002935,
"grad_norm": 10.750836372375488,
"learning_rate": 8.667304195146423e-06,
"loss": 0.4882,
"step": 15600
},
{
"epoch": 0.44302725887465433,
"grad_norm": 20.971227645874023,
"learning_rate": 8.651627265316361e-06,
"loss": 0.4247,
"step": 15700
},
{
"epoch": 0.44584908854901517,
"grad_norm": 91.97208404541016,
"learning_rate": 8.6359503354863e-06,
"loss": 0.4089,
"step": 15800
},
{
"epoch": 0.448670918223376,
"grad_norm": 32.81388854980469,
"learning_rate": 8.620273405656236e-06,
"loss": 0.4393,
"step": 15900
},
{
"epoch": 0.4514927478977369,
"grad_norm": 47.73301315307617,
"learning_rate": 8.604596475826175e-06,
"loss": 0.4213,
"step": 16000
},
{
"epoch": 0.45431457757209776,
"grad_norm": 30.3223934173584,
"learning_rate": 8.588919545996113e-06,
"loss": 0.4443,
"step": 16100
},
{
"epoch": 0.4571364072464586,
"grad_norm": 31.105688095092773,
"learning_rate": 8.573242616166051e-06,
"loss": 0.456,
"step": 16200
},
{
"epoch": 0.45995823692081944,
"grad_norm": 31.305255889892578,
"learning_rate": 8.55756568633599e-06,
"loss": 0.4203,
"step": 16300
},
{
"epoch": 0.46278006659518034,
"grad_norm": 16.12314224243164,
"learning_rate": 8.541888756505926e-06,
"loss": 0.467,
"step": 16400
},
{
"epoch": 0.4656018962695412,
"grad_norm": 12.041780471801758,
"learning_rate": 8.526211826675864e-06,
"loss": 0.4549,
"step": 16500
},
{
"epoch": 0.468423725943902,
"grad_norm": 70.41606903076172,
"learning_rate": 8.510534896845802e-06,
"loss": 0.453,
"step": 16600
},
{
"epoch": 0.47124555561826287,
"grad_norm": 25.043682098388672,
"learning_rate": 8.49485796701574e-06,
"loss": 0.4863,
"step": 16700
},
{
"epoch": 0.4740673852926237,
"grad_norm": 35.47560501098633,
"learning_rate": 8.479181037185678e-06,
"loss": 0.3768,
"step": 16800
},
{
"epoch": 0.4768892149669846,
"grad_norm": 95.46676635742188,
"learning_rate": 8.463504107355616e-06,
"loss": 0.4667,
"step": 16900
},
{
"epoch": 0.47971104464134545,
"grad_norm": 11.174333572387695,
"learning_rate": 8.447827177525554e-06,
"loss": 0.4867,
"step": 17000
},
{
"epoch": 0.4825328743157063,
"grad_norm": 26.256561279296875,
"learning_rate": 8.432150247695492e-06,
"loss": 0.4473,
"step": 17100
},
{
"epoch": 0.48535470399006714,
"grad_norm": 37.75515365600586,
"learning_rate": 8.41647331786543e-06,
"loss": 0.4615,
"step": 17200
},
{
"epoch": 0.48817653366442804,
"grad_norm": 11.617927551269531,
"learning_rate": 8.400796388035368e-06,
"loss": 0.434,
"step": 17300
},
{
"epoch": 0.4909983633387889,
"grad_norm": 36.8308219909668,
"learning_rate": 8.385119458205306e-06,
"loss": 0.4153,
"step": 17400
},
{
"epoch": 0.4938201930131497,
"grad_norm": 21.96497344970703,
"learning_rate": 8.369442528375244e-06,
"loss": 0.4356,
"step": 17500
},
{
"epoch": 0.49664202268751056,
"grad_norm": 24.90114974975586,
"learning_rate": 8.353765598545182e-06,
"loss": 0.4432,
"step": 17600
},
{
"epoch": 0.49946385236187146,
"grad_norm": 35.501869201660156,
"learning_rate": 8.33808866871512e-06,
"loss": 0.4359,
"step": 17700
},
{
"epoch": 0.5022856820362323,
"grad_norm": 19.375484466552734,
"learning_rate": 8.322411738885058e-06,
"loss": 0.4736,
"step": 17800
},
{
"epoch": 0.5051075117105931,
"grad_norm": 33.64216995239258,
"learning_rate": 8.306734809054994e-06,
"loss": 0.4209,
"step": 17900
},
{
"epoch": 0.507929341384954,
"grad_norm": 26.897354125976562,
"learning_rate": 8.291057879224932e-06,
"loss": 0.4558,
"step": 18000
},
{
"epoch": 0.5107511710593149,
"grad_norm": 28.531604766845703,
"learning_rate": 8.27538094939487e-06,
"loss": 0.4091,
"step": 18100
},
{
"epoch": 0.5135730007336757,
"grad_norm": 17.724021911621094,
"learning_rate": 8.259704019564809e-06,
"loss": 0.4936,
"step": 18200
},
{
"epoch": 0.5163948304080366,
"grad_norm": 30.871612548828125,
"learning_rate": 8.244027089734748e-06,
"loss": 0.4301,
"step": 18300
},
{
"epoch": 0.5192166600823974,
"grad_norm": 10.603446006774902,
"learning_rate": 8.228350159904685e-06,
"loss": 0.4271,
"step": 18400
},
{
"epoch": 0.5220384897567583,
"grad_norm": 21.891576766967773,
"learning_rate": 8.212673230074623e-06,
"loss": 0.4397,
"step": 18500
},
{
"epoch": 0.5248603194311191,
"grad_norm": 18.257530212402344,
"learning_rate": 8.19699630024456e-06,
"loss": 0.4438,
"step": 18600
},
{
"epoch": 0.52768214910548,
"grad_norm": 38.37961959838867,
"learning_rate": 8.181319370414499e-06,
"loss": 0.457,
"step": 18700
},
{
"epoch": 0.5305039787798409,
"grad_norm": 43.02951431274414,
"learning_rate": 8.165642440584437e-06,
"loss": 0.4775,
"step": 18800
},
{
"epoch": 0.5333258084542017,
"grad_norm": 19.02762794494629,
"learning_rate": 8.149965510754375e-06,
"loss": 0.4016,
"step": 18900
},
{
"epoch": 0.5361476381285626,
"grad_norm": 15.892502784729004,
"learning_rate": 8.134288580924313e-06,
"loss": 0.4091,
"step": 19000
},
{
"epoch": 0.5389694678029234,
"grad_norm": 23.187881469726562,
"learning_rate": 8.11861165109425e-06,
"loss": 0.4637,
"step": 19100
},
{
"epoch": 0.5417912974772843,
"grad_norm": 22.69843101501465,
"learning_rate": 8.102934721264189e-06,
"loss": 0.4567,
"step": 19200
},
{
"epoch": 0.5446131271516451,
"grad_norm": 25.485212326049805,
"learning_rate": 8.087257791434127e-06,
"loss": 0.4329,
"step": 19300
},
{
"epoch": 0.547434956826006,
"grad_norm": 13.632718086242676,
"learning_rate": 8.071580861604063e-06,
"loss": 0.3943,
"step": 19400
},
{
"epoch": 0.5502567865003668,
"grad_norm": 28.55874252319336,
"learning_rate": 8.055903931774001e-06,
"loss": 0.4587,
"step": 19500
},
{
"epoch": 0.5530786161747276,
"grad_norm": 14.307991027832031,
"learning_rate": 8.040227001943939e-06,
"loss": 0.4297,
"step": 19600
},
{
"epoch": 0.5559004458490886,
"grad_norm": 29.6854305267334,
"learning_rate": 8.024550072113879e-06,
"loss": 0.4665,
"step": 19700
},
{
"epoch": 0.5587222755234494,
"grad_norm": 12.39411735534668,
"learning_rate": 8.008873142283817e-06,
"loss": 0.44,
"step": 19800
},
{
"epoch": 0.5615441051978103,
"grad_norm": 19.937421798706055,
"learning_rate": 7.993352981752054e-06,
"loss": 0.4209,
"step": 19900
},
{
"epoch": 0.5643659348721711,
"grad_norm": 42.9919548034668,
"learning_rate": 7.977676051921992e-06,
"loss": 0.4165,
"step": 20000
},
{
"epoch": 0.567187764546532,
"grad_norm": 36.624691009521484,
"learning_rate": 7.96199912209193e-06,
"loss": 0.4602,
"step": 20100
},
{
"epoch": 0.5700095942208928,
"grad_norm": 96.30526733398438,
"learning_rate": 7.946322192261868e-06,
"loss": 0.4202,
"step": 20200
},
{
"epoch": 0.5728314238952537,
"grad_norm": 23.55826187133789,
"learning_rate": 7.930802031730107e-06,
"loss": 0.4349,
"step": 20300
},
{
"epoch": 0.5756532535696145,
"grad_norm": 17.5443172454834,
"learning_rate": 7.915125101900044e-06,
"loss": 0.4351,
"step": 20400
},
{
"epoch": 0.5784750832439753,
"grad_norm": 33.78348922729492,
"learning_rate": 7.899448172069982e-06,
"loss": 0.4825,
"step": 20500
},
{
"epoch": 0.5812969129183363,
"grad_norm": 72.64608001708984,
"learning_rate": 7.88377124223992e-06,
"loss": 0.4596,
"step": 20600
},
{
"epoch": 0.5841187425926971,
"grad_norm": 29.05013656616211,
"learning_rate": 7.868094312409858e-06,
"loss": 0.436,
"step": 20700
},
{
"epoch": 0.586940572267058,
"grad_norm": 11.080700874328613,
"learning_rate": 7.852417382579796e-06,
"loss": 0.4302,
"step": 20800
},
{
"epoch": 0.5897624019414188,
"grad_norm": 12.57494068145752,
"learning_rate": 7.836740452749734e-06,
"loss": 0.4222,
"step": 20900
},
{
"epoch": 0.5925842316157797,
"grad_norm": 18.02518081665039,
"learning_rate": 7.821063522919672e-06,
"loss": 0.3936,
"step": 21000
},
{
"epoch": 0.5954060612901405,
"grad_norm": 23.98390007019043,
"learning_rate": 7.80538659308961e-06,
"loss": 0.429,
"step": 21100
},
{
"epoch": 0.5982278909645014,
"grad_norm": 30.125253677368164,
"learning_rate": 7.789709663259548e-06,
"loss": 0.4255,
"step": 21200
},
{
"epoch": 0.6010497206388622,
"grad_norm": 47.189022064208984,
"learning_rate": 7.774032733429486e-06,
"loss": 0.4276,
"step": 21300
},
{
"epoch": 0.6038715503132231,
"grad_norm": 23.556894302368164,
"learning_rate": 7.758355803599424e-06,
"loss": 0.4735,
"step": 21400
},
{
"epoch": 0.606693379987584,
"grad_norm": 18.217546463012695,
"learning_rate": 7.742678873769362e-06,
"loss": 0.4202,
"step": 21500
},
{
"epoch": 0.6095152096619448,
"grad_norm": 15.294795036315918,
"learning_rate": 7.7270019439393e-06,
"loss": 0.3781,
"step": 21600
},
{
"epoch": 0.6123370393363057,
"grad_norm": 27.4034366607666,
"learning_rate": 7.711325014109238e-06,
"loss": 0.4353,
"step": 21700
},
{
"epoch": 0.6151588690106665,
"grad_norm": 6.8590240478515625,
"learning_rate": 7.695648084279176e-06,
"loss": 0.4145,
"step": 21800
},
{
"epoch": 0.6179806986850274,
"grad_norm": 17.38852310180664,
"learning_rate": 7.679971154449112e-06,
"loss": 0.4146,
"step": 21900
},
{
"epoch": 0.6208025283593882,
"grad_norm": 25.68893051147461,
"learning_rate": 7.66429422461905e-06,
"loss": 0.4162,
"step": 22000
},
{
"epoch": 0.623624358033749,
"grad_norm": 40.78746795654297,
"learning_rate": 7.648617294788989e-06,
"loss": 0.4339,
"step": 22100
},
{
"epoch": 0.6264461877081099,
"grad_norm": 37.15106201171875,
"learning_rate": 7.632940364958927e-06,
"loss": 0.4507,
"step": 22200
},
{
"epoch": 0.6292680173824708,
"grad_norm": 16.895755767822266,
"learning_rate": 7.617263435128865e-06,
"loss": 0.4273,
"step": 22300
},
{
"epoch": 0.6320898470568317,
"grad_norm": 20.340709686279297,
"learning_rate": 7.6015865052988026e-06,
"loss": 0.4298,
"step": 22400
},
{
"epoch": 0.6349116767311925,
"grad_norm": 10.428862571716309,
"learning_rate": 7.585909575468741e-06,
"loss": 0.4425,
"step": 22500
},
{
"epoch": 0.6377335064055534,
"grad_norm": 18.21014404296875,
"learning_rate": 7.570232645638679e-06,
"loss": 0.4118,
"step": 22600
},
{
"epoch": 0.6405553360799142,
"grad_norm": 64.62041473388672,
"learning_rate": 7.554555715808617e-06,
"loss": 0.4506,
"step": 22700
},
{
"epoch": 0.6433771657542751,
"grad_norm": 18.57569694519043,
"learning_rate": 7.538878785978555e-06,
"loss": 0.4119,
"step": 22800
},
{
"epoch": 0.6461989954286359,
"grad_norm": 36.90799331665039,
"learning_rate": 7.523201856148492e-06,
"loss": 0.4595,
"step": 22900
},
{
"epoch": 0.6490208251029967,
"grad_norm": 18.904600143432617,
"learning_rate": 7.50752492631843e-06,
"loss": 0.4712,
"step": 23000
},
{
"epoch": 0.6518426547773576,
"grad_norm": 29.996702194213867,
"learning_rate": 7.491847996488369e-06,
"loss": 0.4554,
"step": 23100
},
{
"epoch": 0.6546644844517185,
"grad_norm": 32.28736877441406,
"learning_rate": 7.476171066658307e-06,
"loss": 0.4066,
"step": 23200
},
{
"epoch": 0.6574863141260794,
"grad_norm": 21.389598846435547,
"learning_rate": 7.460494136828245e-06,
"loss": 0.4264,
"step": 23300
},
{
"epoch": 0.6603081438004402,
"grad_norm": 20.693862915039062,
"learning_rate": 7.444817206998182e-06,
"loss": 0.4345,
"step": 23400
},
{
"epoch": 0.6631299734748011,
"grad_norm": 25.96352195739746,
"learning_rate": 7.42914027716812e-06,
"loss": 0.4207,
"step": 23500
},
{
"epoch": 0.6659518031491619,
"grad_norm": 7.279160022735596,
"learning_rate": 7.413463347338058e-06,
"loss": 0.4606,
"step": 23600
},
{
"epoch": 0.6687736328235228,
"grad_norm": 21.541379928588867,
"learning_rate": 7.397786417507996e-06,
"loss": 0.4424,
"step": 23700
},
{
"epoch": 0.6715954624978836,
"grad_norm": 20.808313369750977,
"learning_rate": 7.382109487677934e-06,
"loss": 0.4207,
"step": 23800
},
{
"epoch": 0.6744172921722444,
"grad_norm": 33.443336486816406,
"learning_rate": 7.366432557847871e-06,
"loss": 0.4192,
"step": 23900
},
{
"epoch": 0.6772391218466053,
"grad_norm": 4.05402946472168,
"learning_rate": 7.350755628017809e-06,
"loss": 0.3986,
"step": 24000
},
{
"epoch": 0.6800609515209662,
"grad_norm": 15.301130294799805,
"learning_rate": 7.335078698187747e-06,
"loss": 0.3992,
"step": 24100
},
{
"epoch": 0.6828827811953271,
"grad_norm": 17.400495529174805,
"learning_rate": 7.319401768357685e-06,
"loss": 0.3992,
"step": 24200
},
{
"epoch": 0.6857046108696879,
"grad_norm": 20.655588150024414,
"learning_rate": 7.303724838527624e-06,
"loss": 0.417,
"step": 24300
},
{
"epoch": 0.6885264405440488,
"grad_norm": 17.406002044677734,
"learning_rate": 7.2880479086975605e-06,
"loss": 0.4224,
"step": 24400
},
{
"epoch": 0.6913482702184096,
"grad_norm": 16.471834182739258,
"learning_rate": 7.272370978867499e-06,
"loss": 0.4193,
"step": 24500
},
{
"epoch": 0.6941700998927705,
"grad_norm": 30.130985260009766,
"learning_rate": 7.256694049037437e-06,
"loss": 0.3966,
"step": 24600
},
{
"epoch": 0.6969919295671313,
"grad_norm": 25.141477584838867,
"learning_rate": 7.2410171192073754e-06,
"loss": 0.4037,
"step": 24700
},
{
"epoch": 0.6998137592414921,
"grad_norm": 13.053258895874023,
"learning_rate": 7.2253401893773134e-06,
"loss": 0.4396,
"step": 24800
},
{
"epoch": 0.7026355889158531,
"grad_norm": 15.657031059265137,
"learning_rate": 7.209663259547251e-06,
"loss": 0.3962,
"step": 24900
},
{
"epoch": 0.7054574185902139,
"grad_norm": 28.782482147216797,
"learning_rate": 7.194143099015489e-06,
"loss": 0.4074,
"step": 25000
},
{
"epoch": 0.7082792482645748,
"grad_norm": 28.72026252746582,
"learning_rate": 7.178466169185428e-06,
"loss": 0.4422,
"step": 25100
},
{
"epoch": 0.7111010779389356,
"grad_norm": 14.960714340209961,
"learning_rate": 7.162789239355366e-06,
"loss": 0.4374,
"step": 25200
},
{
"epoch": 0.7139229076132965,
"grad_norm": 26.17475700378418,
"learning_rate": 7.147112309525304e-06,
"loss": 0.4295,
"step": 25300
},
{
"epoch": 0.7167447372876573,
"grad_norm": 17.277433395385742,
"learning_rate": 7.131435379695241e-06,
"loss": 0.4229,
"step": 25400
},
{
"epoch": 0.7195665669620182,
"grad_norm": 36.890193939208984,
"learning_rate": 7.115758449865179e-06,
"loss": 0.4005,
"step": 25500
},
{
"epoch": 0.722388396636379,
"grad_norm": 18.203617095947266,
"learning_rate": 7.100081520035117e-06,
"loss": 0.4241,
"step": 25600
},
{
"epoch": 0.7252102263107398,
"grad_norm": 26.79283905029297,
"learning_rate": 7.084404590205055e-06,
"loss": 0.447,
"step": 25700
},
{
"epoch": 0.7280320559851008,
"grad_norm": 24.888479232788086,
"learning_rate": 7.068727660374993e-06,
"loss": 0.4332,
"step": 25800
},
{
"epoch": 0.7308538856594616,
"grad_norm": 20.294689178466797,
"learning_rate": 7.05305073054493e-06,
"loss": 0.3704,
"step": 25900
},
{
"epoch": 0.7336757153338225,
"grad_norm": 20.747148513793945,
"learning_rate": 7.037373800714868e-06,
"loss": 0.4064,
"step": 26000
},
{
"epoch": 0.7364975450081833,
"grad_norm": 23.540077209472656,
"learning_rate": 7.021696870884806e-06,
"loss": 0.4507,
"step": 26100
},
{
"epoch": 0.7393193746825442,
"grad_norm": 19.756235122680664,
"learning_rate": 7.006019941054744e-06,
"loss": 0.4319,
"step": 26200
},
{
"epoch": 0.742141204356905,
"grad_norm": 23.604589462280273,
"learning_rate": 6.990343011224683e-06,
"loss": 0.4004,
"step": 26300
},
{
"epoch": 0.7449630340312658,
"grad_norm": 16.374685287475586,
"learning_rate": 6.9746660813946195e-06,
"loss": 0.3955,
"step": 26400
},
{
"epoch": 0.7477848637056267,
"grad_norm": 14.722450256347656,
"learning_rate": 6.958989151564558e-06,
"loss": 0.3847,
"step": 26500
},
{
"epoch": 0.7506066933799875,
"grad_norm": 25.750301361083984,
"learning_rate": 6.943312221734496e-06,
"loss": 0.3973,
"step": 26600
},
{
"epoch": 0.7534285230543485,
"grad_norm": 29.76510238647461,
"learning_rate": 6.9276352919044344e-06,
"loss": 0.4074,
"step": 26700
},
{
"epoch": 0.7562503527287093,
"grad_norm": 15.254782676696777,
"learning_rate": 6.9119583620743725e-06,
"loss": 0.4551,
"step": 26800
},
{
"epoch": 0.7590721824030702,
"grad_norm": 58.55656433105469,
"learning_rate": 6.89628143224431e-06,
"loss": 0.4214,
"step": 26900
},
{
"epoch": 0.761894012077431,
"grad_norm": 17.498140335083008,
"learning_rate": 6.880604502414248e-06,
"loss": 0.4295,
"step": 27000
},
{
"epoch": 0.7647158417517919,
"grad_norm": 17.728435516357422,
"learning_rate": 6.865084341882487e-06,
"loss": 0.4005,
"step": 27100
},
{
"epoch": 0.7675376714261527,
"grad_norm": 2.0892069339752197,
"learning_rate": 6.849407412052425e-06,
"loss": 0.3864,
"step": 27200
},
{
"epoch": 0.7703595011005135,
"grad_norm": 23.627687454223633,
"learning_rate": 6.833730482222363e-06,
"loss": 0.4168,
"step": 27300
},
{
"epoch": 0.7731813307748744,
"grad_norm": 3.664445161819458,
"learning_rate": 6.8180535523923e-06,
"loss": 0.4031,
"step": 27400
},
{
"epoch": 0.7760031604492352,
"grad_norm": 21.93037986755371,
"learning_rate": 6.802376622562238e-06,
"loss": 0.4043,
"step": 27500
},
{
"epoch": 0.7788249901235962,
"grad_norm": 22.63160514831543,
"learning_rate": 6.786699692732176e-06,
"loss": 0.3895,
"step": 27600
},
{
"epoch": 0.781646819797957,
"grad_norm": 27.74697494506836,
"learning_rate": 6.771022762902114e-06,
"loss": 0.444,
"step": 27700
},
{
"epoch": 0.7844686494723179,
"grad_norm": 60.84165573120117,
"learning_rate": 6.755345833072052e-06,
"loss": 0.4018,
"step": 27800
},
{
"epoch": 0.7872904791466787,
"grad_norm": 41.33998107910156,
"learning_rate": 6.739668903241989e-06,
"loss": 0.3751,
"step": 27900
},
{
"epoch": 0.7901123088210396,
"grad_norm": 7.033244609832764,
"learning_rate": 6.723991973411927e-06,
"loss": 0.4115,
"step": 28000
},
{
"epoch": 0.7929341384954004,
"grad_norm": 14.345335960388184,
"learning_rate": 6.708315043581865e-06,
"loss": 0.3875,
"step": 28100
},
{
"epoch": 0.7957559681697612,
"grad_norm": 18.797395706176758,
"learning_rate": 6.692638113751803e-06,
"loss": 0.385,
"step": 28200
},
{
"epoch": 0.7985777978441221,
"grad_norm": 16.46762466430664,
"learning_rate": 6.676961183921742e-06,
"loss": 0.4209,
"step": 28300
},
{
"epoch": 0.801399627518483,
"grad_norm": 42.29709243774414,
"learning_rate": 6.6612842540916785e-06,
"loss": 0.3697,
"step": 28400
},
{
"epoch": 0.8042214571928439,
"grad_norm": 9.446586608886719,
"learning_rate": 6.645607324261617e-06,
"loss": 0.3943,
"step": 28500
},
{
"epoch": 0.8070432868672047,
"grad_norm": 9.191097259521484,
"learning_rate": 6.629930394431555e-06,
"loss": 0.4095,
"step": 28600
},
{
"epoch": 0.8098651165415656,
"grad_norm": 27.068052291870117,
"learning_rate": 6.6142534646014934e-06,
"loss": 0.4112,
"step": 28700
},
{
"epoch": 0.8126869462159264,
"grad_norm": 7.080472946166992,
"learning_rate": 6.5985765347714315e-06,
"loss": 0.4225,
"step": 28800
},
{
"epoch": 0.8155087758902873,
"grad_norm": 14.27031421661377,
"learning_rate": 6.582899604941369e-06,
"loss": 0.3919,
"step": 28900
},
{
"epoch": 0.8183306055646481,
"grad_norm": 18.635713577270508,
"learning_rate": 6.567222675111307e-06,
"loss": 0.4208,
"step": 29000
},
{
"epoch": 0.8211524352390089,
"grad_norm": 15.811898231506348,
"learning_rate": 6.551545745281245e-06,
"loss": 0.3505,
"step": 29100
},
{
"epoch": 0.8239742649133698,
"grad_norm": 9.68923282623291,
"learning_rate": 6.535868815451183e-06,
"loss": 0.3879,
"step": 29200
},
{
"epoch": 0.8267960945877307,
"grad_norm": 20.192699432373047,
"learning_rate": 6.520191885621121e-06,
"loss": 0.3847,
"step": 29300
},
{
"epoch": 0.8296179242620916,
"grad_norm": 12.542318344116211,
"learning_rate": 6.504514955791058e-06,
"loss": 0.3754,
"step": 29400
},
{
"epoch": 0.8324397539364524,
"grad_norm": 38.85354232788086,
"learning_rate": 6.488838025960996e-06,
"loss": 0.4194,
"step": 29500
},
{
"epoch": 0.8352615836108133,
"grad_norm": 15.035304069519043,
"learning_rate": 6.473161096130934e-06,
"loss": 0.4417,
"step": 29600
},
{
"epoch": 0.8380834132851741,
"grad_norm": 32.99053955078125,
"learning_rate": 6.457484166300873e-06,
"loss": 0.404,
"step": 29700
},
{
"epoch": 0.840905242959535,
"grad_norm": 14.194585800170898,
"learning_rate": 6.441807236470811e-06,
"loss": 0.3853,
"step": 29800
},
{
"epoch": 0.8437270726338958,
"grad_norm": 12.659259796142578,
"learning_rate": 6.426130306640748e-06,
"loss": 0.4223,
"step": 29900
},
{
"epoch": 0.8465489023082566,
"grad_norm": 20.427833557128906,
"learning_rate": 6.410453376810686e-06,
"loss": 0.4356,
"step": 30000
},
{
"epoch": 0.8493707319826175,
"grad_norm": 16.472497940063477,
"learning_rate": 6.394776446980624e-06,
"loss": 0.3848,
"step": 30100
},
{
"epoch": 0.8521925616569784,
"grad_norm": 35.50379180908203,
"learning_rate": 6.379099517150562e-06,
"loss": 0.4256,
"step": 30200
},
{
"epoch": 0.8550143913313393,
"grad_norm": 21.51117706298828,
"learning_rate": 6.3634225873205e-06,
"loss": 0.3987,
"step": 30300
},
{
"epoch": 0.8578362210057001,
"grad_norm": 22.2384033203125,
"learning_rate": 6.347745657490437e-06,
"loss": 0.4016,
"step": 30400
},
{
"epoch": 0.860658050680061,
"grad_norm": 39.41447448730469,
"learning_rate": 6.332068727660375e-06,
"loss": 0.3865,
"step": 30500
},
{
"epoch": 0.8634798803544218,
"grad_norm": 7.72763729095459,
"learning_rate": 6.316391797830313e-06,
"loss": 0.4245,
"step": 30600
},
{
"epoch": 0.8663017100287826,
"grad_norm": 22.295452117919922,
"learning_rate": 6.3008716372985525e-06,
"loss": 0.4315,
"step": 30700
},
{
"epoch": 0.8691235397031435,
"grad_norm": 11.646142959594727,
"learning_rate": 6.28519470746849e-06,
"loss": 0.4399,
"step": 30800
},
{
"epoch": 0.8719453693775043,
"grad_norm": 26.82891082763672,
"learning_rate": 6.269517777638428e-06,
"loss": 0.3699,
"step": 30900
},
{
"epoch": 0.8747671990518652,
"grad_norm": 20.642230987548828,
"learning_rate": 6.253840847808366e-06,
"loss": 0.4167,
"step": 31000
},
{
"epoch": 0.8775890287262261,
"grad_norm": 14.905511856079102,
"learning_rate": 6.238163917978304e-06,
"loss": 0.4052,
"step": 31100
},
{
"epoch": 0.880410858400587,
"grad_norm": 69.13500213623047,
"learning_rate": 6.222486988148242e-06,
"loss": 0.3816,
"step": 31200
},
{
"epoch": 0.8832326880749478,
"grad_norm": 20.331384658813477,
"learning_rate": 6.206810058318179e-06,
"loss": 0.3954,
"step": 31300
},
{
"epoch": 0.8860545177493087,
"grad_norm": 18.5074405670166,
"learning_rate": 6.191133128488117e-06,
"loss": 0.4321,
"step": 31400
},
{
"epoch": 0.8888763474236695,
"grad_norm": 34.6412467956543,
"learning_rate": 6.175456198658055e-06,
"loss": 0.3988,
"step": 31500
},
{
"epoch": 0.8916981770980303,
"grad_norm": 15.479742050170898,
"learning_rate": 6.159779268827993e-06,
"loss": 0.4052,
"step": 31600
},
{
"epoch": 0.8945200067723912,
"grad_norm": 31.75602149963379,
"learning_rate": 6.144102338997932e-06,
"loss": 0.3739,
"step": 31700
},
{
"epoch": 0.897341836446752,
"grad_norm": 12.538125038146973,
"learning_rate": 6.128425409167868e-06,
"loss": 0.381,
"step": 31800
},
{
"epoch": 0.900163666121113,
"grad_norm": 18.574064254760742,
"learning_rate": 6.112748479337807e-06,
"loss": 0.4223,
"step": 31900
},
{
"epoch": 0.9029854957954738,
"grad_norm": 20.351797103881836,
"learning_rate": 6.097071549507745e-06,
"loss": 0.4108,
"step": 32000
},
{
"epoch": 0.9058073254698347,
"grad_norm": 16.650991439819336,
"learning_rate": 6.081394619677683e-06,
"loss": 0.3868,
"step": 32100
},
{
"epoch": 0.9086291551441955,
"grad_norm": 24.825759887695312,
"learning_rate": 6.065717689847621e-06,
"loss": 0.3987,
"step": 32200
},
{
"epoch": 0.9114509848185564,
"grad_norm": 7.137796401977539,
"learning_rate": 6.050040760017558e-06,
"loss": 0.3624,
"step": 32300
},
{
"epoch": 0.9142728144929172,
"grad_norm": 28.790571212768555,
"learning_rate": 6.034363830187496e-06,
"loss": 0.408,
"step": 32400
},
{
"epoch": 0.917094644167278,
"grad_norm": 24.61075782775879,
"learning_rate": 6.018686900357434e-06,
"loss": 0.4137,
"step": 32500
},
{
"epoch": 0.9199164738416389,
"grad_norm": 26.280406951904297,
"learning_rate": 6.003009970527372e-06,
"loss": 0.3809,
"step": 32600
},
{
"epoch": 0.9227383035159997,
"grad_norm": 13.658437728881836,
"learning_rate": 5.98733304069731e-06,
"loss": 0.3811,
"step": 32700
},
{
"epoch": 0.9255601331903607,
"grad_norm": 5.478005886077881,
"learning_rate": 5.9716561108672476e-06,
"loss": 0.3579,
"step": 32800
},
{
"epoch": 0.9283819628647215,
"grad_norm": 17.17485237121582,
"learning_rate": 5.955979181037186e-06,
"loss": 0.392,
"step": 32900
},
{
"epoch": 0.9312037925390824,
"grad_norm": 19.072818756103516,
"learning_rate": 5.940302251207124e-06,
"loss": 0.381,
"step": 33000
},
{
"epoch": 0.9340256222134432,
"grad_norm": 5.044217586517334,
"learning_rate": 5.9246253213770625e-06,
"loss": 0.4043,
"step": 33100
},
{
"epoch": 0.936847451887804,
"grad_norm": 20.710311889648438,
"learning_rate": 5.9089483915470005e-06,
"loss": 0.4074,
"step": 33200
},
{
"epoch": 0.9396692815621649,
"grad_norm": 16.337045669555664,
"learning_rate": 5.893271461716938e-06,
"loss": 0.3658,
"step": 33300
},
{
"epoch": 0.9424911112365257,
"grad_norm": 25.688541412353516,
"learning_rate": 5.877594531886876e-06,
"loss": 0.377,
"step": 33400
},
{
"epoch": 0.9453129409108866,
"grad_norm": 15.326305389404297,
"learning_rate": 5.862074371355114e-06,
"loss": 0.4225,
"step": 33500
},
{
"epoch": 0.9481347705852474,
"grad_norm": 23.59290313720703,
"learning_rate": 5.846397441525052e-06,
"loss": 0.387,
"step": 33600
},
{
"epoch": 0.9509566002596084,
"grad_norm": 50.01143264770508,
"learning_rate": 5.830720511694991e-06,
"loss": 0.3959,
"step": 33700
},
{
"epoch": 0.9537784299339692,
"grad_norm": 21.439271926879883,
"learning_rate": 5.815043581864927e-06,
"loss": 0.3403,
"step": 33800
},
{
"epoch": 0.9566002596083301,
"grad_norm": 26.8652286529541,
"learning_rate": 5.799366652034866e-06,
"loss": 0.3849,
"step": 33900
},
{
"epoch": 0.9594220892826909,
"grad_norm": 19.363805770874023,
"learning_rate": 5.783689722204804e-06,
"loss": 0.3969,
"step": 34000
},
{
"epoch": 0.9622439189570517,
"grad_norm": 11.499284744262695,
"learning_rate": 5.768012792374742e-06,
"loss": 0.4231,
"step": 34100
},
{
"epoch": 0.9650657486314126,
"grad_norm": 34.81698989868164,
"learning_rate": 5.75233586254468e-06,
"loss": 0.346,
"step": 34200
},
{
"epoch": 0.9678875783057734,
"grad_norm": 31.242355346679688,
"learning_rate": 5.736658932714617e-06,
"loss": 0.3799,
"step": 34300
},
{
"epoch": 0.9707094079801343,
"grad_norm": 27.321941375732422,
"learning_rate": 5.720982002884555e-06,
"loss": 0.3418,
"step": 34400
},
{
"epoch": 0.9735312376544952,
"grad_norm": 22.507356643676758,
"learning_rate": 5.705305073054493e-06,
"loss": 0.3828,
"step": 34500
},
{
"epoch": 0.9763530673288561,
"grad_norm": 19.005266189575195,
"learning_rate": 5.689628143224431e-06,
"loss": 0.4241,
"step": 34600
},
{
"epoch": 0.9791748970032169,
"grad_norm": 17.969890594482422,
"learning_rate": 5.673951213394369e-06,
"loss": 0.4084,
"step": 34700
},
{
"epoch": 0.9819967266775778,
"grad_norm": 16.145771026611328,
"learning_rate": 5.6582742835643066e-06,
"loss": 0.3818,
"step": 34800
},
{
"epoch": 0.9848185563519386,
"grad_norm": 27.250308990478516,
"learning_rate": 5.642597353734245e-06,
"loss": 0.3544,
"step": 34900
},
{
"epoch": 0.9876403860262994,
"grad_norm": 57.412166595458984,
"learning_rate": 5.626920423904183e-06,
"loss": 0.3829,
"step": 35000
},
{
"epoch": 0.9904622157006603,
"grad_norm": 33.557403564453125,
"learning_rate": 5.6112434940741215e-06,
"loss": 0.4252,
"step": 35100
},
{
"epoch": 0.9932840453750211,
"grad_norm": 21.193218231201172,
"learning_rate": 5.5955665642440595e-06,
"loss": 0.4006,
"step": 35200
},
{
"epoch": 0.996105875049382,
"grad_norm": 26.300689697265625,
"learning_rate": 5.579889634413997e-06,
"loss": 0.3857,
"step": 35300
},
{
"epoch": 0.9989277047237429,
"grad_norm": 13.547060012817383,
"learning_rate": 5.564212704583935e-06,
"loss": 0.3635,
"step": 35400
},
{
"epoch": 1.0017495343981038,
"grad_norm": 19.988895416259766,
"learning_rate": 5.548535774753873e-06,
"loss": 0.3543,
"step": 35500
},
{
"epoch": 1.0045713640724645,
"grad_norm": 12.582673072814941,
"learning_rate": 5.532858844923811e-06,
"loss": 0.3324,
"step": 35600
},
{
"epoch": 1.0073931937468255,
"grad_norm": 17.54091453552246,
"learning_rate": 5.517181915093749e-06,
"loss": 0.3733,
"step": 35700
},
{
"epoch": 1.0102150234211864,
"grad_norm": 12.848298072814941,
"learning_rate": 5.501661754561986e-06,
"loss": 0.3594,
"step": 35800
},
{
"epoch": 1.0130368530955471,
"grad_norm": 19.090791702270508,
"learning_rate": 5.485984824731925e-06,
"loss": 0.3468,
"step": 35900
},
{
"epoch": 1.015858682769908,
"grad_norm": 11.574841499328613,
"learning_rate": 5.470307894901863e-06,
"loss": 0.3442,
"step": 36000
},
{
"epoch": 1.0186805124442688,
"grad_norm": 27.498323440551758,
"learning_rate": 5.454630965071801e-06,
"loss": 0.3296,
"step": 36100
},
{
"epoch": 1.0215023421186298,
"grad_norm": 18.88401222229004,
"learning_rate": 5.438954035241739e-06,
"loss": 0.3814,
"step": 36200
},
{
"epoch": 1.0243241717929905,
"grad_norm": 32.382423400878906,
"learning_rate": 5.423277105411676e-06,
"loss": 0.3652,
"step": 36300
},
{
"epoch": 1.0271460014673515,
"grad_norm": 23.787736892700195,
"learning_rate": 5.407600175581614e-06,
"loss": 0.3308,
"step": 36400
},
{
"epoch": 1.0299678311417122,
"grad_norm": 16.6649112701416,
"learning_rate": 5.391923245751552e-06,
"loss": 0.3386,
"step": 36500
},
{
"epoch": 1.0327896608160732,
"grad_norm": 24.679080963134766,
"learning_rate": 5.37624631592149e-06,
"loss": 0.3248,
"step": 36600
},
{
"epoch": 1.0356114904904339,
"grad_norm": 30.37528419494629,
"learning_rate": 5.360569386091428e-06,
"loss": 0.3572,
"step": 36700
},
{
"epoch": 1.0384333201647948,
"grad_norm": 17.88707733154297,
"learning_rate": 5.344892456261366e-06,
"loss": 0.3473,
"step": 36800
},
{
"epoch": 1.0412551498391558,
"grad_norm": 46.716697692871094,
"learning_rate": 5.329215526431304e-06,
"loss": 0.3372,
"step": 36900
},
{
"epoch": 1.0440769795135165,
"grad_norm": 40.04937744140625,
"learning_rate": 5.313538596601242e-06,
"loss": 0.3848,
"step": 37000
},
{
"epoch": 1.0468988091878775,
"grad_norm": 24.893983840942383,
"learning_rate": 5.298018436069481e-06,
"loss": 0.3518,
"step": 37100
},
{
"epoch": 1.0497206388622382,
"grad_norm": 8.319790840148926,
"learning_rate": 5.282341506239419e-06,
"loss": 0.3164,
"step": 37200
},
{
"epoch": 1.0525424685365992,
"grad_norm": 26.47063636779785,
"learning_rate": 5.266664576409356e-06,
"loss": 0.3034,
"step": 37300
},
{
"epoch": 1.05536429821096,
"grad_norm": 41.414642333984375,
"learning_rate": 5.250987646579294e-06,
"loss": 0.3404,
"step": 37400
},
{
"epoch": 1.0581861278853208,
"grad_norm": 29.034229278564453,
"learning_rate": 5.235310716749232e-06,
"loss": 0.3271,
"step": 37500
},
{
"epoch": 1.0610079575596818,
"grad_norm": 12.065890312194824,
"learning_rate": 5.219633786919171e-06,
"loss": 0.3462,
"step": 37600
},
{
"epoch": 1.0638297872340425,
"grad_norm": 21.52030372619629,
"learning_rate": 5.203956857089109e-06,
"loss": 0.3499,
"step": 37700
},
{
"epoch": 1.0666516169084035,
"grad_norm": 14.449933052062988,
"learning_rate": 5.188279927259045e-06,
"loss": 0.2902,
"step": 37800
},
{
"epoch": 1.0694734465827642,
"grad_norm": 26.385765075683594,
"learning_rate": 5.172602997428984e-06,
"loss": 0.3312,
"step": 37900
},
{
"epoch": 1.0722952762571252,
"grad_norm": 30.85702896118164,
"learning_rate": 5.156926067598922e-06,
"loss": 0.3487,
"step": 38000
},
{
"epoch": 1.075117105931486,
"grad_norm": 23.276355743408203,
"learning_rate": 5.14124913776886e-06,
"loss": 0.311,
"step": 38100
},
{
"epoch": 1.0779389356058469,
"grad_norm": 15.212447166442871,
"learning_rate": 5.125572207938798e-06,
"loss": 0.3562,
"step": 38200
},
{
"epoch": 1.0807607652802076,
"grad_norm": 11.746698379516602,
"learning_rate": 5.1100520474070356e-06,
"loss": 0.3741,
"step": 38300
},
{
"epoch": 1.0835825949545685,
"grad_norm": 27.38930892944336,
"learning_rate": 5.094375117576974e-06,
"loss": 0.3321,
"step": 38400
},
{
"epoch": 1.0864044246289295,
"grad_norm": 8.432238578796387,
"learning_rate": 5.0786981877469124e-06,
"loss": 0.3253,
"step": 38500
},
{
"epoch": 1.0892262543032902,
"grad_norm": 20.7791690826416,
"learning_rate": 5.0630212579168505e-06,
"loss": 0.328,
"step": 38600
},
{
"epoch": 1.0920480839776512,
"grad_norm": 25.740007400512695,
"learning_rate": 5.0473443280867885e-06,
"loss": 0.3106,
"step": 38700
},
{
"epoch": 1.094869913652012,
"grad_norm": 16.10772132873535,
"learning_rate": 5.031667398256726e-06,
"loss": 0.3118,
"step": 38800
},
{
"epoch": 1.0976917433263729,
"grad_norm": 0.25727030634880066,
"learning_rate": 5.015990468426664e-06,
"loss": 0.3339,
"step": 38900
},
{
"epoch": 1.1005135730007336,
"grad_norm": 39.43502426147461,
"learning_rate": 5.000313538596602e-06,
"loss": 0.3497,
"step": 39000
},
{
"epoch": 1.1033354026750946,
"grad_norm": 14.343636512756348,
"learning_rate": 4.98463660876654e-06,
"loss": 0.3212,
"step": 39100
},
{
"epoch": 1.1061572323494553,
"grad_norm": 30.527856826782227,
"learning_rate": 4.968959678936478e-06,
"loss": 0.3732,
"step": 39200
},
{
"epoch": 1.1089790620238162,
"grad_norm": 30.169086456298828,
"learning_rate": 4.953282749106416e-06,
"loss": 0.3391,
"step": 39300
},
{
"epoch": 1.1118008916981772,
"grad_norm": 18.833539962768555,
"learning_rate": 4.937605819276353e-06,
"loss": 0.3477,
"step": 39400
},
{
"epoch": 1.114622721372538,
"grad_norm": 17.68643569946289,
"learning_rate": 4.921928889446291e-06,
"loss": 0.3545,
"step": 39500
},
{
"epoch": 1.1174445510468989,
"grad_norm": 24.589397430419922,
"learning_rate": 4.906251959616229e-06,
"loss": 0.3153,
"step": 39600
},
{
"epoch": 1.1202663807212596,
"grad_norm": 9.25218677520752,
"learning_rate": 4.890575029786167e-06,
"loss": 0.3535,
"step": 39700
},
{
"epoch": 1.1230882103956206,
"grad_norm": 21.546316146850586,
"learning_rate": 4.874898099956105e-06,
"loss": 0.3531,
"step": 39800
},
{
"epoch": 1.1259100400699813,
"grad_norm": 20.65337562561035,
"learning_rate": 4.859221170126043e-06,
"loss": 0.3675,
"step": 39900
},
{
"epoch": 1.1287318697443423,
"grad_norm": 11.405496597290039,
"learning_rate": 4.843544240295981e-06,
"loss": 0.3298,
"step": 40000
},
{
"epoch": 1.1315536994187032,
"grad_norm": 9.069683074951172,
"learning_rate": 4.827867310465918e-06,
"loss": 0.3224,
"step": 40100
},
{
"epoch": 1.134375529093064,
"grad_norm": 26.082910537719727,
"learning_rate": 4.812190380635856e-06,
"loss": 0.3633,
"step": 40200
},
{
"epoch": 1.137197358767425,
"grad_norm": 27.91574478149414,
"learning_rate": 4.796513450805795e-06,
"loss": 0.3403,
"step": 40300
},
{
"epoch": 1.1400191884417856,
"grad_norm": 14.8870267868042,
"learning_rate": 4.780836520975732e-06,
"loss": 0.3535,
"step": 40400
},
{
"epoch": 1.1428410181161466,
"grad_norm": 49.3880729675293,
"learning_rate": 4.76515959114567e-06,
"loss": 0.3833,
"step": 40500
},
{
"epoch": 1.1456628477905073,
"grad_norm": 26.83273696899414,
"learning_rate": 4.749482661315608e-06,
"loss": 0.3367,
"step": 40600
},
{
"epoch": 1.1484846774648683,
"grad_norm": 15.598971366882324,
"learning_rate": 4.7338057314855464e-06,
"loss": 0.308,
"step": 40700
},
{
"epoch": 1.151306507139229,
"grad_norm": 16.14190673828125,
"learning_rate": 4.7181288016554844e-06,
"loss": 0.3277,
"step": 40800
},
{
"epoch": 1.15412833681359,
"grad_norm": 23.744840621948242,
"learning_rate": 4.702451871825422e-06,
"loss": 0.3407,
"step": 40900
},
{
"epoch": 1.1569501664879507,
"grad_norm": 10.290372848510742,
"learning_rate": 4.6867749419953605e-06,
"loss": 0.3537,
"step": 41000
},
{
"epoch": 1.1597719961623116,
"grad_norm": 19.450990676879883,
"learning_rate": 4.671098012165298e-06,
"loss": 0.3517,
"step": 41100
},
{
"epoch": 1.1625938258366726,
"grad_norm": 7.8305463790893555,
"learning_rate": 4.655421082335236e-06,
"loss": 0.3366,
"step": 41200
},
{
"epoch": 1.1654156555110333,
"grad_norm": 21.166301727294922,
"learning_rate": 4.639744152505174e-06,
"loss": 0.329,
"step": 41300
},
{
"epoch": 1.1682374851853943,
"grad_norm": 33.572425842285156,
"learning_rate": 4.624067222675112e-06,
"loss": 0.3557,
"step": 41400
},
{
"epoch": 1.171059314859755,
"grad_norm": 30.565309524536133,
"learning_rate": 4.60839029284505e-06,
"loss": 0.3466,
"step": 41500
},
{
"epoch": 1.173881144534116,
"grad_norm": 32.635765075683594,
"learning_rate": 4.592713363014987e-06,
"loss": 0.3132,
"step": 41600
},
{
"epoch": 1.1767029742084767,
"grad_norm": 2.45202898979187,
"learning_rate": 4.577036433184926e-06,
"loss": 0.3707,
"step": 41700
},
{
"epoch": 1.1795248038828376,
"grad_norm": 4.587299823760986,
"learning_rate": 4.561359503354864e-06,
"loss": 0.3301,
"step": 41800
},
{
"epoch": 1.1823466335571986,
"grad_norm": 25.243562698364258,
"learning_rate": 4.545682573524801e-06,
"loss": 0.3245,
"step": 41900
},
{
"epoch": 1.1851684632315593,
"grad_norm": 22.650365829467773,
"learning_rate": 4.530005643694739e-06,
"loss": 0.3518,
"step": 42000
},
{
"epoch": 1.1879902929059203,
"grad_norm": 18.754680633544922,
"learning_rate": 4.514328713864677e-06,
"loss": 0.3297,
"step": 42100
},
{
"epoch": 1.190812122580281,
"grad_norm": 19.45864486694336,
"learning_rate": 4.498651784034615e-06,
"loss": 0.3446,
"step": 42200
},
{
"epoch": 1.193633952254642,
"grad_norm": 23.143203735351562,
"learning_rate": 4.482974854204553e-06,
"loss": 0.3494,
"step": 42300
},
{
"epoch": 1.1964557819290027,
"grad_norm": 33.07261276245117,
"learning_rate": 4.467297924374491e-06,
"loss": 0.3619,
"step": 42400
},
{
"epoch": 1.1992776116033637,
"grad_norm": 8.783498764038086,
"learning_rate": 4.451620994544429e-06,
"loss": 0.3596,
"step": 42500
},
{
"epoch": 1.2020994412777244,
"grad_norm": 19.049306869506836,
"learning_rate": 4.435944064714366e-06,
"loss": 0.3379,
"step": 42600
},
{
"epoch": 1.2049212709520853,
"grad_norm": 22.958511352539062,
"learning_rate": 4.420267134884304e-06,
"loss": 0.3661,
"step": 42700
},
{
"epoch": 1.207743100626446,
"grad_norm": 26.9248104095459,
"learning_rate": 4.404590205054242e-06,
"loss": 0.3483,
"step": 42800
},
{
"epoch": 1.210564930300807,
"grad_norm": 39.222469329833984,
"learning_rate": 4.389070044522481e-06,
"loss": 0.3221,
"step": 42900
},
{
"epoch": 1.213386759975168,
"grad_norm": 15.85262393951416,
"learning_rate": 4.3733931146924195e-06,
"loss": 0.3067,
"step": 43000
},
{
"epoch": 1.2162085896495287,
"grad_norm": 11.266934394836426,
"learning_rate": 4.357716184862357e-06,
"loss": 0.3341,
"step": 43100
},
{
"epoch": 1.2190304193238897,
"grad_norm": 30.2665958404541,
"learning_rate": 4.342039255032295e-06,
"loss": 0.3616,
"step": 43200
},
{
"epoch": 1.2218522489982504,
"grad_norm": 20.351150512695312,
"learning_rate": 4.326362325202233e-06,
"loss": 0.3349,
"step": 43300
},
{
"epoch": 1.2246740786726114,
"grad_norm": 14.63962173461914,
"learning_rate": 4.310685395372171e-06,
"loss": 0.3217,
"step": 43400
},
{
"epoch": 1.227495908346972,
"grad_norm": 7.7118048667907715,
"learning_rate": 4.295008465542109e-06,
"loss": 0.3708,
"step": 43500
},
{
"epoch": 1.230317738021333,
"grad_norm": 24.568256378173828,
"learning_rate": 4.279331535712046e-06,
"loss": 0.3349,
"step": 43600
},
{
"epoch": 1.233139567695694,
"grad_norm": 64.85833740234375,
"learning_rate": 4.263654605881985e-06,
"loss": 0.3571,
"step": 43700
},
{
"epoch": 1.2359613973700547,
"grad_norm": 6.7669596672058105,
"learning_rate": 4.247977676051923e-06,
"loss": 0.3257,
"step": 43800
},
{
"epoch": 1.2387832270444157,
"grad_norm": 23.950790405273438,
"learning_rate": 4.23230074622186e-06,
"loss": 0.3309,
"step": 43900
},
{
"epoch": 1.2416050567187764,
"grad_norm": 11.690123558044434,
"learning_rate": 4.216623816391798e-06,
"loss": 0.3109,
"step": 44000
},
{
"epoch": 1.2444268863931374,
"grad_norm": 22.820682525634766,
"learning_rate": 4.200946886561736e-06,
"loss": 0.3684,
"step": 44100
},
{
"epoch": 1.247248716067498,
"grad_norm": 7.326537609100342,
"learning_rate": 4.185269956731674e-06,
"loss": 0.3165,
"step": 44200
},
{
"epoch": 1.250070545741859,
"grad_norm": 28.421340942382812,
"learning_rate": 4.169593026901612e-06,
"loss": 0.3667,
"step": 44300
},
{
"epoch": 1.25289237541622,
"grad_norm": 16.26470947265625,
"learning_rate": 4.15391609707155e-06,
"loss": 0.3026,
"step": 44400
},
{
"epoch": 1.2557142050905807,
"grad_norm": 22.38811683654785,
"learning_rate": 4.138239167241488e-06,
"loss": 0.3474,
"step": 44500
},
{
"epoch": 1.2585360347649415,
"grad_norm": 10.734014511108398,
"learning_rate": 4.122562237411425e-06,
"loss": 0.3543,
"step": 44600
},
{
"epoch": 1.2613578644393024,
"grad_norm": 18.710065841674805,
"learning_rate": 4.106885307581363e-06,
"loss": 0.3496,
"step": 44700
},
{
"epoch": 1.2641796941136634,
"grad_norm": 2.5336763858795166,
"learning_rate": 4.091208377751301e-06,
"loss": 0.364,
"step": 44800
},
{
"epoch": 1.267001523788024,
"grad_norm": 34.18901062011719,
"learning_rate": 4.075531447921239e-06,
"loss": 0.3058,
"step": 44900
},
{
"epoch": 1.269823353462385,
"grad_norm": 22.022573471069336,
"learning_rate": 4.0598545180911774e-06,
"loss": 0.3144,
"step": 45000
},
{
"epoch": 1.2726451831367458,
"grad_norm": 8.996906280517578,
"learning_rate": 4.0441775882611155e-06,
"loss": 0.3609,
"step": 45100
},
{
"epoch": 1.2754670128111067,
"grad_norm": 25.605409622192383,
"learning_rate": 4.0285006584310535e-06,
"loss": 0.3217,
"step": 45200
},
{
"epoch": 1.2782888424854675,
"grad_norm": 22.709383010864258,
"learning_rate": 4.0128237286009915e-06,
"loss": 0.3398,
"step": 45300
},
{
"epoch": 1.2811106721598284,
"grad_norm": 17.49720001220703,
"learning_rate": 3.997146798770929e-06,
"loss": 0.3178,
"step": 45400
},
{
"epoch": 1.2839325018341894,
"grad_norm": 19.673856735229492,
"learning_rate": 3.981469868940867e-06,
"loss": 0.3138,
"step": 45500
},
{
"epoch": 1.2867543315085501,
"grad_norm": 21.33672332763672,
"learning_rate": 3.965792939110805e-06,
"loss": 0.343,
"step": 45600
},
{
"epoch": 1.289576161182911,
"grad_norm": 33.10939025878906,
"learning_rate": 3.950116009280743e-06,
"loss": 0.3301,
"step": 45700
},
{
"epoch": 1.2923979908572718,
"grad_norm": 26.29018211364746,
"learning_rate": 3.934439079450681e-06,
"loss": 0.3163,
"step": 45800
},
{
"epoch": 1.2952198205316328,
"grad_norm": 10.499427795410156,
"learning_rate": 3.918762149620619e-06,
"loss": 0.3164,
"step": 45900
},
{
"epoch": 1.2980416502059935,
"grad_norm": 35.09203338623047,
"learning_rate": 3.903085219790557e-06,
"loss": 0.3155,
"step": 46000
},
{
"epoch": 1.3008634798803544,
"grad_norm": 18.63652992248535,
"learning_rate": 3.887408289960494e-06,
"loss": 0.3354,
"step": 46100
},
{
"epoch": 1.3036853095547154,
"grad_norm": 17.310117721557617,
"learning_rate": 3.871731360130432e-06,
"loss": 0.3096,
"step": 46200
},
{
"epoch": 1.3065071392290761,
"grad_norm": 30.03758430480957,
"learning_rate": 3.856054430300371e-06,
"loss": 0.3484,
"step": 46300
},
{
"epoch": 1.3093289689034369,
"grad_norm": 15.059159278869629,
"learning_rate": 3.840377500470308e-06,
"loss": 0.3436,
"step": 46400
},
{
"epoch": 1.3121507985777978,
"grad_norm": 19.42188262939453,
"learning_rate": 3.824700570640246e-06,
"loss": 0.3254,
"step": 46500
},
{
"epoch": 1.3149726282521588,
"grad_norm": 21.08489990234375,
"learning_rate": 3.8090236408101837e-06,
"loss": 0.3292,
"step": 46600
},
{
"epoch": 1.3177944579265195,
"grad_norm": 11.324195861816406,
"learning_rate": 3.793346710980122e-06,
"loss": 0.3301,
"step": 46700
},
{
"epoch": 1.3206162876008805,
"grad_norm": 9.959671020507812,
"learning_rate": 3.77766978115006e-06,
"loss": 0.3029,
"step": 46800
},
{
"epoch": 1.3234381172752412,
"grad_norm": 22.626794815063477,
"learning_rate": 3.7619928513199978e-06,
"loss": 0.3139,
"step": 46900
},
{
"epoch": 1.3262599469496021,
"grad_norm": 21.049654006958008,
"learning_rate": 3.746315921489936e-06,
"loss": 0.3101,
"step": 47000
},
{
"epoch": 1.3290817766239629,
"grad_norm": 8.8051118850708,
"learning_rate": 3.730795760958174e-06,
"loss": 0.3302,
"step": 47100
},
{
"epoch": 1.3319036062983238,
"grad_norm": 20.859481811523438,
"learning_rate": 3.715118831128112e-06,
"loss": 0.3125,
"step": 47200
},
{
"epoch": 1.3347254359726848,
"grad_norm": 6.377940654754639,
"learning_rate": 3.6994419012980505e-06,
"loss": 0.324,
"step": 47300
},
{
"epoch": 1.3375472656470455,
"grad_norm": 20.301328659057617,
"learning_rate": 3.683764971467988e-06,
"loss": 0.3548,
"step": 47400
},
{
"epoch": 1.3403690953214065,
"grad_norm": 9.764044761657715,
"learning_rate": 3.668088041637926e-06,
"loss": 0.354,
"step": 47500
},
{
"epoch": 1.3431909249957672,
"grad_norm": 19.413543701171875,
"learning_rate": 3.6524111118078637e-06,
"loss": 0.3266,
"step": 47600
},
{
"epoch": 1.3460127546701282,
"grad_norm": 16.790817260742188,
"learning_rate": 3.6367341819778018e-06,
"loss": 0.3377,
"step": 47700
},
{
"epoch": 1.3488345843444889,
"grad_norm": 20.554304122924805,
"learning_rate": 3.62105725214774e-06,
"loss": 0.3209,
"step": 47800
},
{
"epoch": 1.3516564140188498,
"grad_norm": 18.711938858032227,
"learning_rate": 3.6053803223176774e-06,
"loss": 0.3324,
"step": 47900
},
{
"epoch": 1.3544782436932108,
"grad_norm": 23.41309356689453,
"learning_rate": 3.589703392487616e-06,
"loss": 0.3169,
"step": 48000
},
{
"epoch": 1.3573000733675715,
"grad_norm": 42.12315368652344,
"learning_rate": 3.5740264626575534e-06,
"loss": 0.3001,
"step": 48100
},
{
"epoch": 1.3601219030419323,
"grad_norm": 22.255311965942383,
"learning_rate": 3.5583495328274915e-06,
"loss": 0.3635,
"step": 48200
},
{
"epoch": 1.3629437327162932,
"grad_norm": 18.722684860229492,
"learning_rate": 3.5426726029974295e-06,
"loss": 0.3579,
"step": 48300
},
{
"epoch": 1.3657655623906542,
"grad_norm": 108.33175659179688,
"learning_rate": 3.526995673167367e-06,
"loss": 0.3021,
"step": 48400
},
{
"epoch": 1.368587392065015,
"grad_norm": 17.446874618530273,
"learning_rate": 3.511318743337305e-06,
"loss": 0.3227,
"step": 48500
},
{
"epoch": 1.3714092217393758,
"grad_norm": 15.62735366821289,
"learning_rate": 3.4956418135072427e-06,
"loss": 0.3042,
"step": 48600
},
{
"epoch": 1.3742310514137368,
"grad_norm": 35.5767936706543,
"learning_rate": 3.479964883677181e-06,
"loss": 0.3575,
"step": 48700
},
{
"epoch": 1.3770528810880975,
"grad_norm": 30.960519790649414,
"learning_rate": 3.464287953847119e-06,
"loss": 0.3681,
"step": 48800
},
{
"epoch": 1.3798747107624583,
"grad_norm": 21.970129013061523,
"learning_rate": 3.4486110240170568e-06,
"loss": 0.3517,
"step": 48900
},
{
"epoch": 1.3826965404368192,
"grad_norm": 10.104969024658203,
"learning_rate": 3.432934094186995e-06,
"loss": 0.3385,
"step": 49000
},
{
"epoch": 1.3855183701111802,
"grad_norm": 10.913897514343262,
"learning_rate": 3.4172571643569324e-06,
"loss": 0.3126,
"step": 49100
},
{
"epoch": 1.388340199785541,
"grad_norm": 12.78243350982666,
"learning_rate": 3.4015802345268704e-06,
"loss": 0.3202,
"step": 49200
},
{
"epoch": 1.3911620294599019,
"grad_norm": 47.281341552734375,
"learning_rate": 3.385903304696809e-06,
"loss": 0.3453,
"step": 49300
},
{
"epoch": 1.3939838591342626,
"grad_norm": 25.738739013671875,
"learning_rate": 3.3702263748667465e-06,
"loss": 0.3163,
"step": 49400
},
{
"epoch": 1.3968056888086235,
"grad_norm": 27.61959457397461,
"learning_rate": 3.354706214334985e-06,
"loss": 0.3169,
"step": 49500
},
{
"epoch": 1.3996275184829843,
"grad_norm": 21.792509078979492,
"learning_rate": 3.3390292845049228e-06,
"loss": 0.3353,
"step": 49600
},
{
"epoch": 1.4024493481573452,
"grad_norm": 27.541452407836914,
"learning_rate": 3.3233523546748608e-06,
"loss": 0.3456,
"step": 49700
},
{
"epoch": 1.4052711778317062,
"grad_norm": 15.29205322265625,
"learning_rate": 3.3076754248447984e-06,
"loss": 0.3542,
"step": 49800
},
{
"epoch": 1.408093007506067,
"grad_norm": 19.681842803955078,
"learning_rate": 3.2919984950147364e-06,
"loss": 0.3325,
"step": 49900
},
{
"epoch": 1.4109148371804279,
"grad_norm": 27.43589210510254,
"learning_rate": 3.276321565184675e-06,
"loss": 0.3252,
"step": 50000
}
],
"logging_steps": 100,
"max_steps": 70876,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}