{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0007067137809187,
"eval_steps": 332,
"global_step": 1327,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007538280329799764,
"grad_norm": 16.65915870666504,
"learning_rate": 2e-05,
"loss": 0.9643,
"step": 1
},
{
"epoch": 0.0015076560659599528,
"grad_norm": 18.07321548461914,
"learning_rate": 4e-05,
"loss": 0.99,
"step": 2
},
{
"epoch": 0.0022614840989399294,
"grad_norm": 20.50757598876953,
"learning_rate": 6e-05,
"loss": 0.9625,
"step": 3
},
{
"epoch": 0.0030153121319199056,
"grad_norm": 16.542922973632812,
"learning_rate": 8e-05,
"loss": 0.9352,
"step": 4
},
{
"epoch": 0.003769140164899882,
"grad_norm": 16.435678482055664,
"learning_rate": 0.0001,
"loss": 0.9104,
"step": 5
},
{
"epoch": 0.004522968197879859,
"grad_norm": 17.327836990356445,
"learning_rate": 0.00012,
"loss": 0.8723,
"step": 6
},
{
"epoch": 0.005276796230859835,
"grad_norm": 15.497602462768555,
"learning_rate": 0.00014,
"loss": 0.7953,
"step": 7
},
{
"epoch": 0.006030624263839811,
"grad_norm": 16.634872436523438,
"learning_rate": 0.00016,
"loss": 0.7444,
"step": 8
},
{
"epoch": 0.006784452296819788,
"grad_norm": 18.223051071166992,
"learning_rate": 0.00018,
"loss": 0.7296,
"step": 9
},
{
"epoch": 0.007538280329799764,
"grad_norm": 12.145986557006836,
"learning_rate": 0.0002,
"loss": 0.6559,
"step": 10
},
{
"epoch": 0.008292108362779741,
"grad_norm": 12.613237380981445,
"learning_rate": 0.00019999971548969982,
"loss": 0.6663,
"step": 11
},
{
"epoch": 0.009045936395759718,
"grad_norm": 12.572402954101562,
"learning_rate": 0.0001999988619604182,
"loss": 0.6575,
"step": 12
},
{
"epoch": 0.009799764428739694,
"grad_norm": 10.410853385925293,
"learning_rate": 0.00019999743941701188,
"loss": 0.6172,
"step": 13
},
{
"epoch": 0.01055359246171967,
"grad_norm": 9.993011474609375,
"learning_rate": 0.00019999544786757545,
"loss": 0.6205,
"step": 14
},
{
"epoch": 0.011307420494699646,
"grad_norm": 10.803098678588867,
"learning_rate": 0.00019999288732344122,
"loss": 0.5932,
"step": 15
},
{
"epoch": 0.012061248527679622,
"grad_norm": 10.612732887268066,
"learning_rate": 0.0001999897577991792,
"loss": 0.5989,
"step": 16
},
{
"epoch": 0.0128150765606596,
"grad_norm": 10.924768447875977,
"learning_rate": 0.0001999860593125971,
"loss": 0.5946,
"step": 17
},
{
"epoch": 0.013568904593639576,
"grad_norm": 9.804558753967285,
"learning_rate": 0.00019998179188473997,
"loss": 0.5321,
"step": 18
},
{
"epoch": 0.014322732626619553,
"grad_norm": 10.648846626281738,
"learning_rate": 0.00019997695553989042,
"loss": 0.5584,
"step": 19
},
{
"epoch": 0.015076560659599529,
"grad_norm": 10.692992210388184,
"learning_rate": 0.00019997155030556822,
"loss": 0.5603,
"step": 20
},
{
"epoch": 0.015830388692579505,
"grad_norm": 10.715287208557129,
"learning_rate": 0.00019996557621253027,
"loss": 0.584,
"step": 21
},
{
"epoch": 0.016584216725559483,
"grad_norm": 9.866957664489746,
"learning_rate": 0.0001999590332947704,
"loss": 0.5571,
"step": 22
},
{
"epoch": 0.017338044758539457,
"grad_norm": 9.68693733215332,
"learning_rate": 0.00019995192158951919,
"loss": 0.5415,
"step": 23
},
{
"epoch": 0.018091872791519435,
"grad_norm": 10.831818580627441,
"learning_rate": 0.00019994424113724363,
"loss": 0.5458,
"step": 24
},
{
"epoch": 0.01884570082449941,
"grad_norm": 9.90411376953125,
"learning_rate": 0.00019993599198164715,
"loss": 0.5368,
"step": 25
},
{
"epoch": 0.019599528857479388,
"grad_norm": 8.305344581604004,
"learning_rate": 0.0001999271741696691,
"loss": 0.5271,
"step": 26
},
{
"epoch": 0.020353356890459365,
"grad_norm": 9.10693645477295,
"learning_rate": 0.00019991778775148465,
"loss": 0.5038,
"step": 27
},
{
"epoch": 0.02110718492343934,
"grad_norm": 8.622981071472168,
"learning_rate": 0.00019990783278050448,
"loss": 0.5039,
"step": 28
},
{
"epoch": 0.021861012956419318,
"grad_norm": 10.21834945678711,
"learning_rate": 0.0001998973093133744,
"loss": 0.5499,
"step": 29
},
{
"epoch": 0.022614840989399292,
"grad_norm": 10.313283920288086,
"learning_rate": 0.00019988621740997512,
"loss": 0.5042,
"step": 30
},
{
"epoch": 0.02336866902237927,
"grad_norm": 8.889609336853027,
"learning_rate": 0.00019987455713342187,
"loss": 0.467,
"step": 31
},
{
"epoch": 0.024122497055359245,
"grad_norm": 8.749794006347656,
"learning_rate": 0.000199862328550064,
"loss": 0.5357,
"step": 32
},
{
"epoch": 0.024876325088339223,
"grad_norm": 8.97386360168457,
"learning_rate": 0.00019984953172948465,
"loss": 0.495,
"step": 33
},
{
"epoch": 0.0256301531213192,
"grad_norm": 9.057605743408203,
"learning_rate": 0.0001998361667445004,
"loss": 0.487,
"step": 34
},
{
"epoch": 0.026383981154299175,
"grad_norm": 9.346535682678223,
"learning_rate": 0.00019982223367116076,
"loss": 0.5348,
"step": 35
},
{
"epoch": 0.027137809187279153,
"grad_norm": 10.248679161071777,
"learning_rate": 0.00019980773258874778,
"loss": 0.5234,
"step": 36
},
{
"epoch": 0.027891637220259127,
"grad_norm": 9.637868881225586,
"learning_rate": 0.00019979266357977564,
"loss": 0.5,
"step": 37
},
{
"epoch": 0.028645465253239105,
"grad_norm": 10.087867736816406,
"learning_rate": 0.00019977702672999007,
"loss": 0.4772,
"step": 38
},
{
"epoch": 0.02939929328621908,
"grad_norm": 11.312880516052246,
"learning_rate": 0.00019976082212836793,
"loss": 0.4689,
"step": 39
},
{
"epoch": 0.030153121319199058,
"grad_norm": 10.71940803527832,
"learning_rate": 0.0001997440498671168,
"loss": 0.4961,
"step": 40
},
{
"epoch": 0.030906949352179035,
"grad_norm": 10.881596565246582,
"learning_rate": 0.00019972671004167433,
"loss": 0.5628,
"step": 41
},
{
"epoch": 0.03166077738515901,
"grad_norm": 11.740187644958496,
"learning_rate": 0.00019970880275070762,
"loss": 0.5096,
"step": 42
},
{
"epoch": 0.03241460541813899,
"grad_norm": 12.402807235717773,
"learning_rate": 0.00019969032809611287,
"loss": 0.6114,
"step": 43
},
{
"epoch": 0.033168433451118966,
"grad_norm": 13.945646286010742,
"learning_rate": 0.0001996712861830147,
"loss": 0.6009,
"step": 44
},
{
"epoch": 0.03392226148409894,
"grad_norm": 14.720582962036133,
"learning_rate": 0.00019965167711976552,
"loss": 0.5767,
"step": 45
},
{
"epoch": 0.034676089517078915,
"grad_norm": 15.46834945678711,
"learning_rate": 0.0001996315010179449,
"loss": 0.5566,
"step": 46
},
{
"epoch": 0.03542991755005889,
"grad_norm": 16.198299407958984,
"learning_rate": 0.00019961075799235903,
"loss": 0.6248,
"step": 47
},
{
"epoch": 0.03618374558303887,
"grad_norm": 36.07707214355469,
"learning_rate": 0.00019958944816104,
"loss": 0.5949,
"step": 48
},
{
"epoch": 0.03693757361601885,
"grad_norm": 25.19093894958496,
"learning_rate": 0.00019956757164524516,
"loss": 0.5619,
"step": 49
},
{
"epoch": 0.03769140164899882,
"grad_norm": 20.562816619873047,
"learning_rate": 0.00019954512856945632,
"loss": 0.662,
"step": 50
},
{
"epoch": 0.0384452296819788,
"grad_norm": 40.9443359375,
"learning_rate": 0.00019952211906137932,
"loss": 0.8705,
"step": 51
},
{
"epoch": 0.039199057714958775,
"grad_norm": 29.929576873779297,
"learning_rate": 0.00019949854325194294,
"loss": 0.7504,
"step": 52
},
{
"epoch": 0.03995288574793875,
"grad_norm": 18.468313217163086,
"learning_rate": 0.00019947440127529836,
"loss": 0.7158,
"step": 53
},
{
"epoch": 0.04070671378091873,
"grad_norm": 9.704629898071289,
"learning_rate": 0.00019944969326881845,
"loss": 0.5938,
"step": 54
},
{
"epoch": 0.0414605418138987,
"grad_norm": 9.891565322875977,
"learning_rate": 0.00019942441937309684,
"loss": 0.5693,
"step": 55
},
{
"epoch": 0.04221436984687868,
"grad_norm": 11.08341121673584,
"learning_rate": 0.00019939857973194717,
"loss": 0.5726,
"step": 56
},
{
"epoch": 0.04296819787985866,
"grad_norm": 10.182625770568848,
"learning_rate": 0.0001993721744924024,
"loss": 0.5854,
"step": 57
},
{
"epoch": 0.043722025912838636,
"grad_norm": 10.336113929748535,
"learning_rate": 0.00019934520380471372,
"loss": 0.5341,
"step": 58
},
{
"epoch": 0.04447585394581861,
"grad_norm": 7.881448745727539,
"learning_rate": 0.0001993176678223499,
"loss": 0.5013,
"step": 59
},
{
"epoch": 0.045229681978798585,
"grad_norm": 10.487141609191895,
"learning_rate": 0.0001992895667019964,
"loss": 0.5395,
"step": 60
},
{
"epoch": 0.04598351001177856,
"grad_norm": 10.38466739654541,
"learning_rate": 0.0001992609006035543,
"loss": 0.5214,
"step": 61
},
{
"epoch": 0.04673733804475854,
"grad_norm": 9.553030014038086,
"learning_rate": 0.0001992316696901397,
"loss": 0.4825,
"step": 62
},
{
"epoch": 0.04749116607773852,
"grad_norm": 8.298136711120605,
"learning_rate": 0.00019920187412808248,
"loss": 0.4797,
"step": 63
},
{
"epoch": 0.04824499411071849,
"grad_norm": 7.880730628967285,
"learning_rate": 0.0001991715140869255,
"loss": 0.4722,
"step": 64
},
{
"epoch": 0.04899882214369847,
"grad_norm": 8.825959205627441,
"learning_rate": 0.00019914058973942368,
"loss": 0.5022,
"step": 65
},
{
"epoch": 0.049752650176678445,
"grad_norm": 8.143362998962402,
"learning_rate": 0.00019910910126154293,
"loss": 0.4975,
"step": 66
},
{
"epoch": 0.05050647820965842,
"grad_norm": 7.704590797424316,
"learning_rate": 0.00019907704883245916,
"loss": 0.4973,
"step": 67
},
{
"epoch": 0.0512603062426384,
"grad_norm": 7.914122104644775,
"learning_rate": 0.00019904443263455728,
"loss": 0.5046,
"step": 68
},
{
"epoch": 0.05201413427561837,
"grad_norm": 8.946449279785156,
"learning_rate": 0.00019901125285343022,
"loss": 0.5124,
"step": 69
},
{
"epoch": 0.05276796230859835,
"grad_norm": 7.793578147888184,
"learning_rate": 0.0001989775096778777,
"loss": 0.5141,
"step": 70
},
{
"epoch": 0.05352179034157833,
"grad_norm": 7.8742756843566895,
"learning_rate": 0.0001989432032999054,
"loss": 0.5071,
"step": 71
},
{
"epoch": 0.054275618374558306,
"grad_norm": 8.052972793579102,
"learning_rate": 0.0001989083339147237,
"loss": 0.4938,
"step": 72
},
{
"epoch": 0.055029446407538284,
"grad_norm": 7.386316776275635,
"learning_rate": 0.0001988729017207465,
"loss": 0.4767,
"step": 73
},
{
"epoch": 0.055783274440518255,
"grad_norm": 7.526272773742676,
"learning_rate": 0.00019883690691959035,
"loss": 0.4642,
"step": 74
},
{
"epoch": 0.05653710247349823,
"grad_norm": 8.339061737060547,
"learning_rate": 0.00019880034971607308,
"loss": 0.4888,
"step": 75
},
{
"epoch": 0.05729093050647821,
"grad_norm": 8.045515060424805,
"learning_rate": 0.00019876323031821266,
"loss": 0.4478,
"step": 76
},
{
"epoch": 0.05804475853945819,
"grad_norm": 8.333029747009277,
"learning_rate": 0.00019872554893722618,
"loss": 0.4695,
"step": 77
},
{
"epoch": 0.05879858657243816,
"grad_norm": 8.050617218017578,
"learning_rate": 0.0001986873057875284,
"loss": 0.4532,
"step": 78
},
{
"epoch": 0.05955241460541814,
"grad_norm": 8.27062702178955,
"learning_rate": 0.00019864850108673073,
"loss": 0.4654,
"step": 79
},
{
"epoch": 0.060306242638398115,
"grad_norm": 8.429513931274414,
"learning_rate": 0.0001986091350556399,
"loss": 0.4829,
"step": 80
},
{
"epoch": 0.06106007067137809,
"grad_norm": 8.401616096496582,
"learning_rate": 0.00019856920791825683,
"loss": 0.5086,
"step": 81
},
{
"epoch": 0.06181389870435807,
"grad_norm": 8.308648109436035,
"learning_rate": 0.00019852871990177503,
"loss": 0.4758,
"step": 82
},
{
"epoch": 0.06256772673733804,
"grad_norm": 8.516093254089355,
"learning_rate": 0.00019848767123657976,
"loss": 0.4423,
"step": 83
},
{
"epoch": 0.06332155477031802,
"grad_norm": 8.437211990356445,
"learning_rate": 0.0001984460621562463,
"loss": 0.4429,
"step": 84
},
{
"epoch": 0.064075382803298,
"grad_norm": 8.637296676635742,
"learning_rate": 0.00019840389289753896,
"loss": 0.4956,
"step": 85
},
{
"epoch": 0.06482921083627798,
"grad_norm": 8.39278507232666,
"learning_rate": 0.00019836116370040944,
"loss": 0.4483,
"step": 86
},
{
"epoch": 0.06558303886925795,
"grad_norm": 9.617965698242188,
"learning_rate": 0.00019831787480799568,
"loss": 0.4714,
"step": 87
},
{
"epoch": 0.06633686690223793,
"grad_norm": 8.52342700958252,
"learning_rate": 0.00019827402646662047,
"loss": 0.4375,
"step": 88
},
{
"epoch": 0.06709069493521791,
"grad_norm": 9.882357597351074,
"learning_rate": 0.0001982296189257898,
"loss": 0.4796,
"step": 89
},
{
"epoch": 0.06784452296819787,
"grad_norm": 9.361654281616211,
"learning_rate": 0.00019818465243819184,
"loss": 0.4871,
"step": 90
},
{
"epoch": 0.06859835100117785,
"grad_norm": 9.959556579589844,
"learning_rate": 0.00019813912725969509,
"loss": 0.472,
"step": 91
},
{
"epoch": 0.06935217903415783,
"grad_norm": 9.579131126403809,
"learning_rate": 0.0001980930436493472,
"loss": 0.4906,
"step": 92
},
{
"epoch": 0.07010600706713781,
"grad_norm": 10.082910537719727,
"learning_rate": 0.00019804640186937343,
"loss": 0.537,
"step": 93
},
{
"epoch": 0.07085983510011779,
"grad_norm": 10.720930099487305,
"learning_rate": 0.0001979992021851751,
"loss": 0.5277,
"step": 94
},
{
"epoch": 0.07161366313309776,
"grad_norm": 10.86539363861084,
"learning_rate": 0.00019795144486532814,
"loss": 0.5511,
"step": 95
},
{
"epoch": 0.07236749116607774,
"grad_norm": 13.410208702087402,
"learning_rate": 0.00019790313018158156,
"loss": 0.5658,
"step": 96
},
{
"epoch": 0.07312131919905772,
"grad_norm": 14.898797988891602,
"learning_rate": 0.0001978542584088558,
"loss": 0.5529,
"step": 97
},
{
"epoch": 0.0738751472320377,
"grad_norm": 14.036768913269043,
"learning_rate": 0.00019780482982524142,
"loss": 0.5396,
"step": 98
},
{
"epoch": 0.07462897526501767,
"grad_norm": 15.9882173538208,
"learning_rate": 0.00019775484471199715,
"loss": 0.514,
"step": 99
},
{
"epoch": 0.07538280329799764,
"grad_norm": 17.01093864440918,
"learning_rate": 0.0001977043033535486,
"loss": 0.5262,
"step": 100
},
{
"epoch": 0.07613663133097762,
"grad_norm": 41.135196685791016,
"learning_rate": 0.00019765320603748655,
"loss": 0.7909,
"step": 101
},
{
"epoch": 0.0768904593639576,
"grad_norm": 25.291397094726562,
"learning_rate": 0.0001976015530545652,
"loss": 0.714,
"step": 102
},
{
"epoch": 0.07764428739693757,
"grad_norm": 12.169105529785156,
"learning_rate": 0.0001975493446987007,
"loss": 0.5999,
"step": 103
},
{
"epoch": 0.07839811542991755,
"grad_norm": 8.400662422180176,
"learning_rate": 0.00019749658126696934,
"loss": 0.5707,
"step": 104
},
{
"epoch": 0.07915194346289753,
"grad_norm": 10.622336387634277,
"learning_rate": 0.00019744326305960595,
"loss": 0.5798,
"step": 105
},
{
"epoch": 0.0799057714958775,
"grad_norm": 10.29685115814209,
"learning_rate": 0.00019738939038000205,
"loss": 0.5752,
"step": 106
},
{
"epoch": 0.08065959952885748,
"grad_norm": 7.853797435760498,
"learning_rate": 0.00019733496353470433,
"loss": 0.543,
"step": 107
},
{
"epoch": 0.08141342756183746,
"grad_norm": 7.910231113433838,
"learning_rate": 0.00019727998283341274,
"loss": 0.5155,
"step": 108
},
{
"epoch": 0.08216725559481743,
"grad_norm": 8.53306770324707,
"learning_rate": 0.00019722444858897878,
"loss": 0.5029,
"step": 109
},
{
"epoch": 0.0829210836277974,
"grad_norm": 8.579912185668945,
"learning_rate": 0.00019716836111740378,
"loss": 0.487,
"step": 110
},
{
"epoch": 0.08367491166077738,
"grad_norm": 8.553475379943848,
"learning_rate": 0.00019711172073783696,
"loss": 0.4853,
"step": 111
},
{
"epoch": 0.08442873969375736,
"grad_norm": 9.386043548583984,
"learning_rate": 0.00019705452777257377,
"loss": 0.4941,
"step": 112
},
{
"epoch": 0.08518256772673734,
"grad_norm": 8.391158103942871,
"learning_rate": 0.000196996782547054,
"loss": 0.4657,
"step": 113
},
{
"epoch": 0.08593639575971732,
"grad_norm": 8.52602481842041,
"learning_rate": 0.00019693848538985983,
"loss": 0.4744,
"step": 114
},
{
"epoch": 0.0866902237926973,
"grad_norm": 7.8026885986328125,
"learning_rate": 0.00019687963663271409,
"loss": 0.4742,
"step": 115
},
{
"epoch": 0.08744405182567727,
"grad_norm": 8.957297325134277,
"learning_rate": 0.00019682023661047836,
"loss": 0.4846,
"step": 116
},
{
"epoch": 0.08819787985865725,
"grad_norm": 8.33506965637207,
"learning_rate": 0.00019676028566115102,
"loss": 0.47,
"step": 117
},
{
"epoch": 0.08895170789163721,
"grad_norm": 7.720737934112549,
"learning_rate": 0.00019669978412586528,
"loss": 0.4512,
"step": 118
},
{
"epoch": 0.08970553592461719,
"grad_norm": 7.069596290588379,
"learning_rate": 0.00019663873234888733,
"loss": 0.4685,
"step": 119
},
{
"epoch": 0.09045936395759717,
"grad_norm": 7.589311599731445,
"learning_rate": 0.0001965771306776144,
"loss": 0.4702,
"step": 120
},
{
"epoch": 0.09121319199057715,
"grad_norm": 7.950814723968506,
"learning_rate": 0.00019651497946257266,
"loss": 0.4797,
"step": 121
},
{
"epoch": 0.09196702002355713,
"grad_norm": 7.834803581237793,
"learning_rate": 0.00019645227905741534,
"loss": 0.4512,
"step": 122
},
{
"epoch": 0.0927208480565371,
"grad_norm": 7.925727844238281,
"learning_rate": 0.00019638902981892068,
"loss": 0.4702,
"step": 123
},
{
"epoch": 0.09347467608951708,
"grad_norm": 7.2047038078308105,
"learning_rate": 0.00019632523210698987,
"loss": 0.4586,
"step": 124
},
{
"epoch": 0.09422850412249706,
"grad_norm": 8.701865196228027,
"learning_rate": 0.00019626088628464498,
"loss": 0.4629,
"step": 125
},
{
"epoch": 0.09498233215547704,
"grad_norm": 7.792990684509277,
"learning_rate": 0.00019619599271802706,
"loss": 0.4578,
"step": 126
},
{
"epoch": 0.09573616018845701,
"grad_norm": 7.0652642250061035,
"learning_rate": 0.00019613055177639384,
"loss": 0.4439,
"step": 127
},
{
"epoch": 0.09648998822143698,
"grad_norm": 7.519805431365967,
"learning_rate": 0.00019606456383211777,
"loss": 0.4371,
"step": 128
},
{
"epoch": 0.09724381625441696,
"grad_norm": 7.7905659675598145,
"learning_rate": 0.00019599802926068384,
"loss": 0.4631,
"step": 129
},
{
"epoch": 0.09799764428739693,
"grad_norm": 7.713627338409424,
"learning_rate": 0.00019593094844068748,
"loss": 0.4415,
"step": 130
},
{
"epoch": 0.09875147232037691,
"grad_norm": 7.864312171936035,
"learning_rate": 0.00019586332175383238,
"loss": 0.493,
"step": 131
},
{
"epoch": 0.09950530035335689,
"grad_norm": 7.424186706542969,
"learning_rate": 0.00019579514958492826,
"loss": 0.4105,
"step": 132
},
{
"epoch": 0.10025912838633687,
"grad_norm": 7.774516582489014,
"learning_rate": 0.0001957264323218889,
"loss": 0.4406,
"step": 133
},
{
"epoch": 0.10101295641931685,
"grad_norm": 8.56273365020752,
"learning_rate": 0.0001956571703557296,
"loss": 0.4743,
"step": 134
},
{
"epoch": 0.10176678445229682,
"grad_norm": 7.981069087982178,
"learning_rate": 0.00019558736408056525,
"loss": 0.4167,
"step": 135
},
{
"epoch": 0.1025206124852768,
"grad_norm": 7.851569175720215,
"learning_rate": 0.00019551701389360795,
"loss": 0.4582,
"step": 136
},
{
"epoch": 0.10327444051825677,
"grad_norm": 7.7381510734558105,
"learning_rate": 0.00019544612019516472,
"loss": 0.4336,
"step": 137
},
{
"epoch": 0.10402826855123674,
"grad_norm": 8.127756118774414,
"learning_rate": 0.00019537468338863537,
"loss": 0.4588,
"step": 138
},
{
"epoch": 0.10478209658421672,
"grad_norm": 7.989606857299805,
"learning_rate": 0.00019530270388050998,
"loss": 0.4269,
"step": 139
},
{
"epoch": 0.1055359246171967,
"grad_norm": 8.431105613708496,
"learning_rate": 0.00019523018208036677,
"loss": 0.4645,
"step": 140
},
{
"epoch": 0.10628975265017668,
"grad_norm": 8.575553894042969,
"learning_rate": 0.0001951571184008698,
"loss": 0.4587,
"step": 141
},
{
"epoch": 0.10704358068315666,
"grad_norm": 9.703766822814941,
"learning_rate": 0.00019508351325776642,
"loss": 0.4826,
"step": 142
},
{
"epoch": 0.10779740871613663,
"grad_norm": 10.319994926452637,
"learning_rate": 0.00019500936706988502,
"loss": 0.5255,
"step": 143
},
{
"epoch": 0.10855123674911661,
"grad_norm": 11.801458358764648,
"learning_rate": 0.00019493468025913276,
"loss": 0.5143,
"step": 144
},
{
"epoch": 0.10930506478209659,
"grad_norm": 11.02754020690918,
"learning_rate": 0.00019485945325049288,
"loss": 0.4947,
"step": 145
},
{
"epoch": 0.11005889281507657,
"grad_norm": 11.526784896850586,
"learning_rate": 0.00019478368647202264,
"loss": 0.5627,
"step": 146
},
{
"epoch": 0.11081272084805653,
"grad_norm": 11.704715728759766,
"learning_rate": 0.00019470738035485058,
"loss": 0.5015,
"step": 147
},
{
"epoch": 0.11156654888103651,
"grad_norm": 14.198360443115234,
"learning_rate": 0.00019463053533317425,
"loss": 0.5488,
"step": 148
},
{
"epoch": 0.11232037691401649,
"grad_norm": 14.75071907043457,
"learning_rate": 0.0001945531518442576,
"loss": 0.5327,
"step": 149
},
{
"epoch": 0.11307420494699646,
"grad_norm": 17.345752716064453,
"learning_rate": 0.0001944752303284287,
"loss": 0.4909,
"step": 150
},
{
"epoch": 0.11382803297997644,
"grad_norm": 25.253982543945312,
"learning_rate": 0.00019439677122907697,
"loss": 0.7106,
"step": 151
},
{
"epoch": 0.11458186101295642,
"grad_norm": 22.05714988708496,
"learning_rate": 0.00019431777499265087,
"loss": 0.6719,
"step": 152
},
{
"epoch": 0.1153356890459364,
"grad_norm": 14.386154174804688,
"learning_rate": 0.00019423824206865527,
"loss": 0.663,
"step": 153
},
{
"epoch": 0.11608951707891638,
"grad_norm": 8.701356887817383,
"learning_rate": 0.00019415817290964883,
"loss": 0.5581,
"step": 154
},
{
"epoch": 0.11684334511189635,
"grad_norm": 8.447550773620605,
"learning_rate": 0.00019407756797124164,
"loss": 0.5545,
"step": 155
},
{
"epoch": 0.11759717314487632,
"grad_norm": 9.116722106933594,
"learning_rate": 0.00019399642771209238,
"loss": 0.5284,
"step": 156
},
{
"epoch": 0.1183510011778563,
"grad_norm": 9.142845153808594,
"learning_rate": 0.00019391475259390584,
"loss": 0.5052,
"step": 157
},
{
"epoch": 0.11910482921083627,
"grad_norm": 9.175527572631836,
"learning_rate": 0.0001938325430814302,
"loss": 0.524,
"step": 158
},
{
"epoch": 0.11985865724381625,
"grad_norm": 8.684857368469238,
"learning_rate": 0.00019374979964245463,
"loss": 0.5387,
"step": 159
},
{
"epoch": 0.12061248527679623,
"grad_norm": 9.40937328338623,
"learning_rate": 0.00019366652274780628,
"loss": 0.5081,
"step": 160
},
{
"epoch": 0.12136631330977621,
"grad_norm": 9.983878135681152,
"learning_rate": 0.00019358271287134784,
"loss": 0.5234,
"step": 161
},
{
"epoch": 0.12212014134275619,
"grad_norm": 8.468266487121582,
"learning_rate": 0.00019349837048997478,
"loss": 0.5008,
"step": 162
},
{
"epoch": 0.12287396937573616,
"grad_norm": 7.315543174743652,
"learning_rate": 0.00019341349608361267,
"loss": 0.4778,
"step": 163
},
{
"epoch": 0.12362779740871614,
"grad_norm": 8.254434585571289,
"learning_rate": 0.00019332809013521428,
"loss": 0.4949,
"step": 164
},
{
"epoch": 0.12438162544169612,
"grad_norm": 9.409392356872559,
"learning_rate": 0.00019324215313075706,
"loss": 0.4842,
"step": 165
},
{
"epoch": 0.12513545347467608,
"grad_norm": 7.584166526794434,
"learning_rate": 0.00019315568555924035,
"loss": 0.4859,
"step": 166
},
{
"epoch": 0.12588928150765608,
"grad_norm": 7.280964374542236,
"learning_rate": 0.0001930686879126824,
"loss": 0.4436,
"step": 167
},
{
"epoch": 0.12664310954063604,
"grad_norm": 7.54876708984375,
"learning_rate": 0.0001929811606861177,
"loss": 0.4636,
"step": 168
},
{
"epoch": 0.12739693757361603,
"grad_norm": 8.36787223815918,
"learning_rate": 0.00019289310437759427,
"loss": 0.4862,
"step": 169
},
{
"epoch": 0.128150765606596,
"grad_norm": 8.098321914672852,
"learning_rate": 0.00019280451948817059,
"loss": 0.4558,
"step": 170
},
{
"epoch": 0.12890459363957596,
"grad_norm": 8.111252784729004,
"learning_rate": 0.00019271540652191296,
"loss": 0.461,
"step": 171
},
{
"epoch": 0.12965842167255595,
"grad_norm": 7.394045829772949,
"learning_rate": 0.0001926257659858925,
"loss": 0.4397,
"step": 172
},
{
"epoch": 0.13041224970553592,
"grad_norm": 7.361767768859863,
"learning_rate": 0.00019253559839018235,
"loss": 0.4811,
"step": 173
},
{
"epoch": 0.1311660777385159,
"grad_norm": 7.598999500274658,
"learning_rate": 0.00019244490424785468,
"loss": 0.4353,
"step": 174
},
{
"epoch": 0.13191990577149587,
"grad_norm": 7.871952056884766,
"learning_rate": 0.00019235368407497788,
"loss": 0.4847,
"step": 175
},
{
"epoch": 0.13267373380447586,
"grad_norm": 7.250602722167969,
"learning_rate": 0.00019226193839061347,
"loss": 0.4482,
"step": 176
},
{
"epoch": 0.13342756183745583,
"grad_norm": 7.890292644500732,
"learning_rate": 0.0001921696677168133,
"loss": 0.4475,
"step": 177
},
{
"epoch": 0.13418138987043582,
"grad_norm": 7.192571640014648,
"learning_rate": 0.00019207687257861655,
"loss": 0.4093,
"step": 178
},
{
"epoch": 0.13493521790341578,
"grad_norm": 8.001566886901855,
"learning_rate": 0.00019198355350404667,
"loss": 0.4729,
"step": 179
},
{
"epoch": 0.13568904593639575,
"grad_norm": 7.559464454650879,
"learning_rate": 0.00019188971102410837,
"loss": 0.4455,
"step": 180
},
{
"epoch": 0.13644287396937574,
"grad_norm": 7.921515941619873,
"learning_rate": 0.00019179534567278475,
"loss": 0.4421,
"step": 181
},
{
"epoch": 0.1371967020023557,
"grad_norm": 7.778410911560059,
"learning_rate": 0.00019170045798703406,
"loss": 0.4485,
"step": 182
},
{
"epoch": 0.1379505300353357,
"grad_norm": 7.606152534484863,
"learning_rate": 0.0001916050485067868,
"loss": 0.4235,
"step": 183
},
{
"epoch": 0.13870435806831566,
"grad_norm": 7.29620361328125,
"learning_rate": 0.00019150911777494258,
"loss": 0.4433,
"step": 184
},
{
"epoch": 0.13945818610129565,
"grad_norm": 7.7016072273254395,
"learning_rate": 0.00019141266633736697,
"loss": 0.444,
"step": 185
},
{
"epoch": 0.14021201413427561,
"grad_norm": 7.524323463439941,
"learning_rate": 0.0001913156947428886,
"loss": 0.4481,
"step": 186
},
{
"epoch": 0.1409658421672556,
"grad_norm": 7.7455525398254395,
"learning_rate": 0.00019121820354329577,
"loss": 0.4152,
"step": 187
},
{
"epoch": 0.14171967020023557,
"grad_norm": 8.12897777557373,
"learning_rate": 0.00019112019329333346,
"loss": 0.4443,
"step": 188
},
{
"epoch": 0.14247349823321553,
"grad_norm": 7.774250507354736,
"learning_rate": 0.00019102166455070024,
"loss": 0.4442,
"step": 189
},
{
"epoch": 0.14322732626619553,
"grad_norm": 8.02647876739502,
"learning_rate": 0.00019092261787604492,
"loss": 0.4489,
"step": 190
},
{
"epoch": 0.1439811542991755,
"grad_norm": 7.7611799240112305,
"learning_rate": 0.00019082305383296352,
"loss": 0.4122,
"step": 191
},
{
"epoch": 0.14473498233215548,
"grad_norm": 9.484501838684082,
"learning_rate": 0.00019072297298799589,
"loss": 0.4725,
"step": 192
},
{
"epoch": 0.14548881036513545,
"grad_norm": 9.696186065673828,
"learning_rate": 0.00019062237591062272,
"loss": 0.4913,
"step": 193
},
{
"epoch": 0.14624263839811544,
"grad_norm": 11.048422813415527,
"learning_rate": 0.00019052126317326207,
"loss": 0.5425,
"step": 194
},
{
"epoch": 0.1469964664310954,
"grad_norm": 10.327349662780762,
"learning_rate": 0.00019041963535126625,
"loss": 0.5171,
"step": 195
},
{
"epoch": 0.1477502944640754,
"grad_norm": 11.808932304382324,
"learning_rate": 0.0001903174930229185,
"loss": 0.504,
"step": 196
},
{
"epoch": 0.14850412249705536,
"grad_norm": 11.13871955871582,
"learning_rate": 0.00019021483676942973,
"loss": 0.5261,
"step": 197
},
{
"epoch": 0.14925795053003535,
"grad_norm": 11.771498680114746,
"learning_rate": 0.00019011166717493517,
"loss": 0.5062,
"step": 198
},
{
"epoch": 0.1500117785630153,
"grad_norm": 13.0664644241333,
"learning_rate": 0.000190007984826491,
"loss": 0.5488,
"step": 199
},
{
"epoch": 0.15076560659599528,
"grad_norm": 15.87386417388916,
"learning_rate": 0.00018990379031407124,
"loss": 0.547,
"step": 200
},
{
"epoch": 0.15151943462897527,
"grad_norm": 20.688980102539062,
"learning_rate": 0.00018979908423056408,
"loss": 0.7222,
"step": 201
},
{
"epoch": 0.15227326266195523,
"grad_norm": 16.90519905090332,
"learning_rate": 0.0001896938671717687,
"loss": 0.6582,
"step": 202
},
{
"epoch": 0.15302709069493522,
"grad_norm": 11.236451148986816,
"learning_rate": 0.00018958813973639184,
"loss": 0.6151,
"step": 203
},
{
"epoch": 0.1537809187279152,
"grad_norm": 8.368070602416992,
"learning_rate": 0.0001894819025260444,
"loss": 0.5729,
"step": 204
},
{
"epoch": 0.15453474676089518,
"grad_norm": 7.891096115112305,
"learning_rate": 0.00018937515614523797,
"loss": 0.5132,
"step": 205
},
{
"epoch": 0.15528857479387514,
"grad_norm": 8.290247917175293,
"learning_rate": 0.0001892679012013815,
"loss": 0.5311,
"step": 206
},
{
"epoch": 0.15604240282685514,
"grad_norm": 8.068524360656738,
"learning_rate": 0.00018916013830477766,
"loss": 0.5038,
"step": 207
},
{
"epoch": 0.1567962308598351,
"grad_norm": 7.199114799499512,
"learning_rate": 0.00018905186806861957,
"loss": 0.4933,
"step": 208
},
{
"epoch": 0.15755005889281506,
"grad_norm": 6.769901275634766,
"learning_rate": 0.00018894309110898712,
"loss": 0.4743,
"step": 209
},
{
"epoch": 0.15830388692579506,
"grad_norm": 7.485007286071777,
"learning_rate": 0.00018883380804484367,
"loss": 0.4832,
"step": 210
},
{
"epoch": 0.15905771495877502,
"grad_norm": 7.059638500213623,
"learning_rate": 0.00018872401949803237,
"loss": 0.4544,
"step": 211
},
{
"epoch": 0.159811542991755,
"grad_norm": 7.6718549728393555,
"learning_rate": 0.00018861372609327263,
"loss": 0.4727,
"step": 212
},
{
"epoch": 0.16056537102473498,
"grad_norm": 7.764082431793213,
"learning_rate": 0.00018850292845815672,
"loss": 0.4645,
"step": 213
},
{
"epoch": 0.16131919905771497,
"grad_norm": 8.037138938903809,
"learning_rate": 0.0001883916272231459,
"loss": 0.4712,
"step": 214
},
{
"epoch": 0.16207302709069493,
"grad_norm": 7.26751184463501,
"learning_rate": 0.0001882798230215672,
"loss": 0.4477,
"step": 215
},
{
"epoch": 0.16282685512367492,
"grad_norm": 7.747137069702148,
"learning_rate": 0.00018816751648960956,
"loss": 0.4544,
"step": 216
},
{
"epoch": 0.1635806831566549,
"grad_norm": 7.478286266326904,
"learning_rate": 0.00018805470826632024,
"loss": 0.4539,
"step": 217
},
{
"epoch": 0.16433451118963485,
"grad_norm": 7.051617622375488,
"learning_rate": 0.0001879413989936013,
"loss": 0.4688,
"step": 218
},
{
"epoch": 0.16508833922261484,
"grad_norm": 7.303111553192139,
"learning_rate": 0.00018782758931620584,
"loss": 0.4551,
"step": 219
},
{
"epoch": 0.1658421672555948,
"grad_norm": 7.094053745269775,
"learning_rate": 0.00018771327988173435,
"loss": 0.4398,
"step": 220
},
{
"epoch": 0.1665959952885748,
"grad_norm": 7.781626224517822,
"learning_rate": 0.00018759847134063108,
"loss": 0.4719,
"step": 221
},
{
"epoch": 0.16734982332155476,
"grad_norm": 7.860665321350098,
"learning_rate": 0.0001874831643461803,
"loss": 0.4573,
"step": 222
},
{
"epoch": 0.16810365135453476,
"grad_norm": 7.380893707275391,
"learning_rate": 0.00018736735955450251,
"loss": 0.4341,
"step": 223
},
{
"epoch": 0.16885747938751472,
"grad_norm": 7.672417163848877,
"learning_rate": 0.0001872510576245509,
"loss": 0.4511,
"step": 224
},
{
"epoch": 0.1696113074204947,
"grad_norm": 7.173273086547852,
"learning_rate": 0.00018713425921810733,
"loss": 0.4374,
"step": 225
},
{
"epoch": 0.17036513545347468,
"grad_norm": 7.41825532913208,
"learning_rate": 0.00018701696499977884,
"loss": 0.4464,
"step": 226
},
{
"epoch": 0.17111896348645464,
"grad_norm": 8.151430130004883,
"learning_rate": 0.0001868991756369937,
"loss": 0.4535,
"step": 227
},
{
"epoch": 0.17187279151943463,
"grad_norm": 7.760961532592773,
"learning_rate": 0.00018678089179999762,
"loss": 0.4731,
"step": 228
},
{
"epoch": 0.1726266195524146,
"grad_norm": 8.02840518951416,
"learning_rate": 0.00018666211416184999,
"loss": 0.4745,
"step": 229
},
{
"epoch": 0.1733804475853946,
"grad_norm": 7.38688850402832,
"learning_rate": 0.00018654284339842013,
"loss": 0.4341,
"step": 230
},
{
"epoch": 0.17413427561837455,
"grad_norm": 7.492348670959473,
"learning_rate": 0.00018642308018838316,
"loss": 0.4147,
"step": 231
},
{
"epoch": 0.17488810365135454,
"grad_norm": 7.687479019165039,
"learning_rate": 0.00018630282521321645,
"loss": 0.4404,
"step": 232
},
{
"epoch": 0.1756419316843345,
"grad_norm": 7.790548324584961,
"learning_rate": 0.0001861820791571956,
"loss": 0.4389,
"step": 233
},
{
"epoch": 0.1763957597173145,
"grad_norm": 7.557417392730713,
"learning_rate": 0.00018606084270739049,
"loss": 0.4467,
"step": 234
},
{
"epoch": 0.17714958775029446,
"grad_norm": 7.971850872039795,
"learning_rate": 0.0001859391165536615,
"loss": 0.415,
"step": 235
},
{
"epoch": 0.17790341578327443,
"grad_norm": 8.08571720123291,
"learning_rate": 0.0001858169013886556,
"loss": 0.4488,
"step": 236
},
{
"epoch": 0.17865724381625442,
"grad_norm": 7.706898212432861,
"learning_rate": 0.00018569419790780218,
"loss": 0.4296,
"step": 237
},
{
"epoch": 0.17941107184923438,
"grad_norm": 7.6118245124816895,
"learning_rate": 0.00018557100680930937,
"loss": 0.4223,
"step": 238
},
{
"epoch": 0.18016489988221437,
"grad_norm": 8.255146980285645,
"learning_rate": 0.00018544732879415986,
"loss": 0.4802,
"step": 239
},
{
"epoch": 0.18091872791519434,
"grad_norm": 9.077119827270508,
"learning_rate": 0.00018532316456610704,
"loss": 0.4376,
"step": 240
},
{
"epoch": 0.18167255594817433,
"grad_norm": 8.465483665466309,
"learning_rate": 0.00018519851483167097,
"loss": 0.4339,
"step": 241
},
{
"epoch": 0.1824263839811543,
"grad_norm": 9.302364349365234,
"learning_rate": 0.00018507338030013427,
"loss": 0.4429,
"step": 242
},
{
"epoch": 0.18318021201413429,
"grad_norm": 10.150344848632812,
"learning_rate": 0.00018494776168353827,
"loss": 0.4768,
"step": 243
},
{
"epoch": 0.18393404004711425,
"grad_norm": 10.960404396057129,
"learning_rate": 0.00018482165969667874,
"loss": 0.5072,
"step": 244
},
{
"epoch": 0.18468786808009424,
"grad_norm": 10.028700828552246,
"learning_rate": 0.00018469507505710194,
"loss": 0.5194,
"step": 245
},
{
"epoch": 0.1854416961130742,
"grad_norm": 10.371344566345215,
"learning_rate": 0.00018456800848510056,
"loss": 0.4974,
"step": 246
},
{
"epoch": 0.18619552414605417,
"grad_norm": 11.256722450256348,
"learning_rate": 0.00018444046070370963,
"loss": 0.4655,
"step": 247
},
{
"epoch": 0.18694935217903416,
"grad_norm": 11.339438438415527,
"learning_rate": 0.00018431243243870223,
"loss": 0.5004,
"step": 248
},
{
"epoch": 0.18770318021201413,
"grad_norm": 12.51115894317627,
"learning_rate": 0.00018418392441858555,
"loss": 0.5498,
"step": 249
},
{
"epoch": 0.18845700824499412,
"grad_norm": 12.920282363891602,
"learning_rate": 0.0001840549373745968,
"loss": 0.4545,
"step": 250
},
{
"epoch": 0.18921083627797408,
"grad_norm": 17.809480667114258,
"learning_rate": 0.0001839254720406987,
"loss": 0.6779,
"step": 251
},
{
"epoch": 0.18996466431095407,
"grad_norm": 14.654753684997559,
"learning_rate": 0.00018379552915357575,
"loss": 0.639,
"step": 252
},
{
"epoch": 0.19071849234393404,
"grad_norm": 10.703547477722168,
"learning_rate": 0.00018366510945262972,
"loss": 0.6024,
"step": 253
},
{
"epoch": 0.19147232037691403,
"grad_norm": 9.329981803894043,
"learning_rate": 0.00018353421367997563,
"loss": 0.5221,
"step": 254
},
{
"epoch": 0.192226148409894,
"grad_norm": 7.0998663902282715,
"learning_rate": 0.00018340284258043732,
"loss": 0.5203,
"step": 255
},
{
"epoch": 0.19297997644287396,
"grad_norm": 8.919529914855957,
"learning_rate": 0.00018327099690154344,
"loss": 0.5286,
"step": 256
},
{
"epoch": 0.19373380447585395,
"grad_norm": 8.378999710083008,
"learning_rate": 0.00018313867739352304,
"loss": 0.4929,
"step": 257
},
{
"epoch": 0.1944876325088339,
"grad_norm": 7.437035083770752,
"learning_rate": 0.00018300588480930143,
"loss": 0.4622,
"step": 258
},
{
"epoch": 0.1952414605418139,
"grad_norm": 7.368019104003906,
"learning_rate": 0.0001828726199044957,
"loss": 0.4824,
"step": 259
},
{
"epoch": 0.19599528857479387,
"grad_norm": 7.174773693084717,
"learning_rate": 0.0001827388834374107,
"loss": 0.4601,
"step": 260
},
{
"epoch": 0.19674911660777386,
"grad_norm": 7.612614154815674,
"learning_rate": 0.0001826046761690344,
"loss": 0.474,
"step": 261
},
{
"epoch": 0.19750294464075382,
"grad_norm": 8.047442436218262,
"learning_rate": 0.00018246999886303383,
"loss": 0.4594,
"step": 262
},
{
"epoch": 0.19825677267373382,
"grad_norm": 7.06972599029541,
"learning_rate": 0.00018233485228575063,
"loss": 0.4944,
"step": 263
},
{
"epoch": 0.19901060070671378,
"grad_norm": 7.2451324462890625,
"learning_rate": 0.00018219923720619663,
"loss": 0.4748,
"step": 264
},
{
"epoch": 0.19976442873969374,
"grad_norm": 8.119038581848145,
"learning_rate": 0.0001820631543960496,
"loss": 0.4286,
"step": 265
},
{
"epoch": 0.20051825677267374,
"grad_norm": 8.046279907226562,
"learning_rate": 0.0001819266046296487,
"loss": 0.4566,
"step": 266
},
{
"epoch": 0.2012720848056537,
"grad_norm": 6.79647970199585,
"learning_rate": 0.00018178958868399033,
"loss": 0.4214,
"step": 267
},
{
"epoch": 0.2020259128386337,
"grad_norm": 6.761276721954346,
"learning_rate": 0.00018165210733872336,
"loss": 0.4272,
"step": 268
},
{
"epoch": 0.20277974087161366,
"grad_norm": 7.771080493927002,
"learning_rate": 0.000181514161376145,
"loss": 0.4602,
"step": 269
},
{
"epoch": 0.20353356890459365,
"grad_norm": 7.610669136047363,
"learning_rate": 0.0001813757515811962,
"loss": 0.4413,
"step": 270
},
{
"epoch": 0.2042873969375736,
"grad_norm": 7.277632236480713,
"learning_rate": 0.00018123687874145721,
"loss": 0.417,
"step": 271
},
{
"epoch": 0.2050412249705536,
"grad_norm": 7.344987869262695,
"learning_rate": 0.00018109754364714305,
"loss": 0.4326,
"step": 272
},
{
"epoch": 0.20579505300353357,
"grad_norm": 7.373658180236816,
"learning_rate": 0.0001809577470910992,
"loss": 0.4107,
"step": 273
},
{
"epoch": 0.20654888103651353,
"grad_norm": 8.498446464538574,
"learning_rate": 0.00018081748986879679,
"loss": 0.4463,
"step": 274
},
{
"epoch": 0.20730270906949352,
"grad_norm": 7.138429164886475,
"learning_rate": 0.00018067677277832834,
"loss": 0.4354,
"step": 275
},
{
"epoch": 0.2080565371024735,
"grad_norm": 7.916346073150635,
"learning_rate": 0.00018053559662040302,
"loss": 0.448,
"step": 276
},
{
"epoch": 0.20881036513545348,
"grad_norm": 6.8389201164245605,
"learning_rate": 0.00018039396219834237,
"loss": 0.4095,
"step": 277
},
{
"epoch": 0.20956419316843344,
"grad_norm": 7.184628009796143,
"learning_rate": 0.00018025187031807532,
"loss": 0.421,
"step": 278
},
{
"epoch": 0.21031802120141344,
"grad_norm": 6.9601569175720215,
"learning_rate": 0.00018010932178813397,
"loss": 0.4329,
"step": 279
},
{
"epoch": 0.2110718492343934,
"grad_norm": 7.579134464263916,
"learning_rate": 0.00017996631741964888,
"loss": 0.439,
"step": 280
},
{
"epoch": 0.2118256772673734,
"grad_norm": 7.37368106842041,
"learning_rate": 0.00017982285802634426,
"loss": 0.4225,
"step": 281
},
{
"epoch": 0.21257950530035336,
"grad_norm": 7.1782145500183105,
"learning_rate": 0.0001796789444245337,
"loss": 0.4094,
"step": 282
},
{
"epoch": 0.21333333333333335,
"grad_norm": 7.470993995666504,
"learning_rate": 0.00017953457743311523,
"loss": 0.4267,
"step": 283
},
{
"epoch": 0.2140871613663133,
"grad_norm": 7.285700798034668,
"learning_rate": 0.00017938975787356673,
"loss": 0.4113,
"step": 284
},
{
"epoch": 0.21484098939929328,
"grad_norm": 7.5254130363464355,
"learning_rate": 0.00017924448656994133,
"loss": 0.4362,
"step": 285
},
{
"epoch": 0.21559481743227327,
"grad_norm": 7.6265411376953125,
"learning_rate": 0.00017909876434886273,
"loss": 0.443,
"step": 286
},
{
"epoch": 0.21634864546525323,
"grad_norm": 7.822786808013916,
"learning_rate": 0.00017895259203952032,
"loss": 0.4385,
"step": 287
},
{
"epoch": 0.21710247349823322,
"grad_norm": 7.836915969848633,
"learning_rate": 0.0001788059704736647,
"loss": 0.4509,
"step": 288
},
{
"epoch": 0.2178563015312132,
"grad_norm": 8.352907180786133,
"learning_rate": 0.00017865890048560277,
"loss": 0.4747,
"step": 289
},
{
"epoch": 0.21861012956419318,
"grad_norm": 8.010136604309082,
"learning_rate": 0.00017851138291219301,
"loss": 0.4662,
"step": 290
},
{
"epoch": 0.21936395759717314,
"grad_norm": 8.264348983764648,
"learning_rate": 0.00017836341859284093,
"loss": 0.4473,
"step": 291
},
{
"epoch": 0.22011778563015313,
"grad_norm": 8.917752265930176,
"learning_rate": 0.00017821500836949386,
"loss": 0.4909,
"step": 292
},
{
"epoch": 0.2208716136631331,
"grad_norm": 9.103057861328125,
"learning_rate": 0.0001780661530866366,
"loss": 0.4885,
"step": 293
},
{
"epoch": 0.22162544169611306,
"grad_norm": 10.667252540588379,
"learning_rate": 0.00017791685359128633,
"loss": 0.5175,
"step": 294
},
{
"epoch": 0.22237926972909305,
"grad_norm": 9.840495109558105,
"learning_rate": 0.000177767110732988,
"loss": 0.5175,
"step": 295
},
{
"epoch": 0.22313309776207302,
"grad_norm": 10.290101051330566,
"learning_rate": 0.00017761692536380928,
"loss": 0.4749,
"step": 296
},
{
"epoch": 0.223886925795053,
"grad_norm": 10.657001495361328,
"learning_rate": 0.00017746629833833585,
"loss": 0.534,
"step": 297
},
{
"epoch": 0.22464075382803297,
"grad_norm": 10.042377471923828,
"learning_rate": 0.00017731523051366658,
"loss": 0.454,
"step": 298
},
{
"epoch": 0.22539458186101297,
"grad_norm": 12.303505897521973,
"learning_rate": 0.00017716372274940843,
"loss": 0.5157,
"step": 299
},
{
"epoch": 0.22614840989399293,
"grad_norm": 16.197650909423828,
"learning_rate": 0.00017701177590767183,
"loss": 0.5521,
"step": 300
},
{
"epoch": 0.22690223792697292,
"grad_norm": 15.125090599060059,
"learning_rate": 0.00017685939085306562,
"loss": 0.6868,
"step": 301
},
{
"epoch": 0.22765606595995289,
"grad_norm": 13.107701301574707,
"learning_rate": 0.00017670656845269214,
"loss": 0.6326,
"step": 302
},
{
"epoch": 0.22840989399293285,
"grad_norm": 9.953380584716797,
"learning_rate": 0.00017655330957614234,
"loss": 0.596,
"step": 303
},
{
"epoch": 0.22916372202591284,
"grad_norm": 7.864305019378662,
"learning_rate": 0.00017639961509549078,
"loss": 0.5477,
"step": 304
},
{
"epoch": 0.2299175500588928,
"grad_norm": 6.731385707855225,
"learning_rate": 0.00017624548588529072,
"loss": 0.4891,
"step": 305
},
{
"epoch": 0.2306713780918728,
"grad_norm": 6.979381084442139,
"learning_rate": 0.00017609092282256912,
"loss": 0.4611,
"step": 306
},
{
"epoch": 0.23142520612485276,
"grad_norm": 8.147210121154785,
"learning_rate": 0.00017593592678682166,
"loss": 0.5077,
"step": 307
},
{
"epoch": 0.23217903415783275,
"grad_norm": 7.303165435791016,
"learning_rate": 0.0001757804986600077,
"loss": 0.4771,
"step": 308
},
{
"epoch": 0.23293286219081272,
"grad_norm": 7.042153358459473,
"learning_rate": 0.0001756246393265453,
"loss": 0.4718,
"step": 309
},
{
"epoch": 0.2336866902237927,
"grad_norm": 7.572822570800781,
"learning_rate": 0.00017546834967330617,
"loss": 0.4719,
"step": 310
},
{
"epoch": 0.23444051825677267,
"grad_norm": 7.078078269958496,
"learning_rate": 0.00017531163058961066,
"loss": 0.4345,
"step": 311
},
{
"epoch": 0.23519434628975264,
"grad_norm": 7.183956623077393,
"learning_rate": 0.00017515448296722262,
"loss": 0.4631,
"step": 312
},
{
"epoch": 0.23594817432273263,
"grad_norm": 7.140283584594727,
"learning_rate": 0.00017499690770034443,
"loss": 0.4554,
"step": 313
},
{
"epoch": 0.2367020023557126,
"grad_norm": 7.176611423492432,
"learning_rate": 0.00017483890568561173,
"loss": 0.4603,
"step": 314
},
{
"epoch": 0.23745583038869258,
"grad_norm": 6.916821002960205,
"learning_rate": 0.00017468047782208865,
"loss": 0.4406,
"step": 315
},
{
"epoch": 0.23820965842167255,
"grad_norm": 7.564478874206543,
"learning_rate": 0.00017452162501126227,
"loss": 0.4608,
"step": 316
},
{
"epoch": 0.23896348645465254,
"grad_norm": 7.078012466430664,
"learning_rate": 0.00017436234815703788,
"loss": 0.4254,
"step": 317
},
{
"epoch": 0.2397173144876325,
"grad_norm": 7.39133358001709,
"learning_rate": 0.0001742026481657335,
"loss": 0.4412,
"step": 318
},
{
"epoch": 0.2404711425206125,
"grad_norm": 7.540102005004883,
"learning_rate": 0.0001740425259460751,
"loss": 0.4444,
"step": 319
},
{
"epoch": 0.24122497055359246,
"grad_norm": 7.027541160583496,
"learning_rate": 0.00017388198240919102,
"loss": 0.439,
"step": 320
},
{
"epoch": 0.24197879858657242,
"grad_norm": 7.218184947967529,
"learning_rate": 0.00017372101846860707,
"loss": 0.4239,
"step": 321
},
{
"epoch": 0.24273262661955242,
"grad_norm": 7.92561674118042,
"learning_rate": 0.00017355963504024123,
"loss": 0.4278,
"step": 322
},
{
"epoch": 0.24348645465253238,
"grad_norm": 7.72558069229126,
"learning_rate": 0.00017339783304239843,
"loss": 0.4498,
"step": 323
},
{
"epoch": 0.24424028268551237,
"grad_norm": 7.2504096031188965,
"learning_rate": 0.00017323561339576543,
"loss": 0.4355,
"step": 324
},
{
"epoch": 0.24499411071849234,
"grad_norm": 7.207572937011719,
"learning_rate": 0.0001730729770234054,
"loss": 0.4192,
"step": 325
},
{
"epoch": 0.24574793875147233,
"grad_norm": 7.010448455810547,
"learning_rate": 0.00017290992485075282,
"loss": 0.3983,
"step": 326
},
{
"epoch": 0.2465017667844523,
"grad_norm": 7.16871452331543,
"learning_rate": 0.0001727464578056081,
"loss": 0.4454,
"step": 327
},
{
"epoch": 0.24725559481743228,
"grad_norm": 7.185717582702637,
"learning_rate": 0.00017258257681813244,
"loss": 0.426,
"step": 328
},
{
"epoch": 0.24800942285041225,
"grad_norm": 7.441746234893799,
"learning_rate": 0.0001724182828208424,
"loss": 0.4394,
"step": 329
},
{
"epoch": 0.24876325088339224,
"grad_norm": 7.429843902587891,
"learning_rate": 0.0001722535767486047,
"loss": 0.4377,
"step": 330
},
{
"epoch": 0.2495170789163722,
"grad_norm": 7.528452396392822,
"learning_rate": 0.00017208845953863076,
"loss": 0.4256,
"step": 331
},
{
"epoch": 0.25027090694935217,
"grad_norm": 6.993783473968506,
"learning_rate": 0.0001719229321304716,
"loss": 0.4337,
"step": 332
},
{
"epoch": 0.25027090694935217,
"eval_loss": 0.47317659854888916,
"eval_runtime": 126.4401,
"eval_samples_per_second": 17.676,
"eval_steps_per_second": 8.842,
"step": 332
},
{
"epoch": 0.25102473498233213,
"grad_norm": 7.080078601837158,
"learning_rate": 0.00017175699546601223,
"loss": 0.443,
"step": 333
},
{
"epoch": 0.25177856301531215,
"grad_norm": 7.021576404571533,
"learning_rate": 0.00017159065048946644,
"loss": 0.4211,
"step": 334
},
{
"epoch": 0.2525323910482921,
"grad_norm": 7.684916019439697,
"learning_rate": 0.00017142389814737142,
"loss": 0.4115,
"step": 335
},
{
"epoch": 0.2532862190812721,
"grad_norm": 7.011744976043701,
"learning_rate": 0.00017125673938858237,
"loss": 0.4057,
"step": 336
},
{
"epoch": 0.25404004711425204,
"grad_norm": 7.142672538757324,
"learning_rate": 0.00017108917516426704,
"loss": 0.4485,
"step": 337
},
{
"epoch": 0.25479387514723206,
"grad_norm": 7.860468864440918,
"learning_rate": 0.00017092120642790042,
"loss": 0.4134,
"step": 338
},
{
"epoch": 0.255547703180212,
"grad_norm": 8.12804889678955,
"learning_rate": 0.00017075283413525916,
"loss": 0.4449,
"step": 339
},
{
"epoch": 0.256301531213192,
"grad_norm": 7.87144136428833,
"learning_rate": 0.00017058405924441636,
"loss": 0.3987,
"step": 340
},
{
"epoch": 0.25705535924617195,
"grad_norm": 7.7459588050842285,
"learning_rate": 0.00017041488271573587,
"loss": 0.4271,
"step": 341
},
{
"epoch": 0.2578091872791519,
"grad_norm": 8.934653282165527,
"learning_rate": 0.00017024530551186702,
"loss": 0.4722,
"step": 342
},
{
"epoch": 0.25856301531213194,
"grad_norm": 8.811241149902344,
"learning_rate": 0.000170075328597739,
"loss": 0.4719,
"step": 343
},
{
"epoch": 0.2593168433451119,
"grad_norm": 9.294290542602539,
"learning_rate": 0.00016990495294055548,
"loss": 0.4963,
"step": 344
},
{
"epoch": 0.26007067137809187,
"grad_norm": 11.440875053405762,
"learning_rate": 0.00016973417950978906,
"loss": 0.5236,
"step": 345
},
{
"epoch": 0.26082449941107183,
"grad_norm": 10.008340835571289,
"learning_rate": 0.00016956300927717575,
"loss": 0.5081,
"step": 346
},
{
"epoch": 0.26157832744405185,
"grad_norm": 10.798213958740234,
"learning_rate": 0.0001693914432167094,
"loss": 0.5252,
"step": 347
},
{
"epoch": 0.2623321554770318,
"grad_norm": 12.772528648376465,
"learning_rate": 0.00016921948230463625,
"loss": 0.5073,
"step": 348
},
{
"epoch": 0.2630859835100118,
"grad_norm": 12.81511402130127,
"learning_rate": 0.00016904712751944931,
"loss": 0.4699,
"step": 349
},
{
"epoch": 0.26383981154299174,
"grad_norm": 13.554988861083984,
"learning_rate": 0.00016887437984188286,
"loss": 0.4963,
"step": 350
},
{
"epoch": 0.2645936395759717,
"grad_norm": 17.339111328125,
"learning_rate": 0.00016870124025490673,
"loss": 0.6331,
"step": 351
},
{
"epoch": 0.2653474676089517,
"grad_norm": 14.55565357208252,
"learning_rate": 0.0001685277097437208,
"loss": 0.6053,
"step": 352
},
{
"epoch": 0.2661012956419317,
"grad_norm": 11.207347869873047,
"learning_rate": 0.0001683537892957495,
"loss": 0.5787,
"step": 353
},
{
"epoch": 0.26685512367491165,
"grad_norm": 8.820387840270996,
"learning_rate": 0.00016817947990063598,
"loss": 0.5605,
"step": 354
},
{
"epoch": 0.2676089517078916,
"grad_norm": 7.382798194885254,
"learning_rate": 0.0001680047825502366,
"loss": 0.4917,
"step": 355
},
{
"epoch": 0.26836277974087164,
"grad_norm": 7.330126762390137,
"learning_rate": 0.00016782969823861526,
"loss": 0.4976,
"step": 356
},
{
"epoch": 0.2691166077738516,
"grad_norm": 8.046545028686523,
"learning_rate": 0.0001676542279620378,
"loss": 0.4864,
"step": 357
},
{
"epoch": 0.26987043580683157,
"grad_norm": 7.838155746459961,
"learning_rate": 0.00016747837271896622,
"loss": 0.4797,
"step": 358
},
{
"epoch": 0.27062426383981153,
"grad_norm": 7.075133323669434,
"learning_rate": 0.00016730213351005303,
"loss": 0.4655,
"step": 359
},
{
"epoch": 0.2713780918727915,
"grad_norm": 6.840551853179932,
"learning_rate": 0.00016712551133813572,
"loss": 0.4453,
"step": 360
},
{
"epoch": 0.2721319199057715,
"grad_norm": 7.175273418426514,
"learning_rate": 0.0001669485072082308,
"loss": 0.447,
"step": 361
},
{
"epoch": 0.2728857479387515,
"grad_norm": 8.195796012878418,
"learning_rate": 0.00016677112212752824,
"loss": 0.4869,
"step": 362
},
{
"epoch": 0.27363957597173144,
"grad_norm": 7.310915946960449,
"learning_rate": 0.00016659335710538564,
"loss": 0.4447,
"step": 363
},
{
"epoch": 0.2743934040047114,
"grad_norm": 7.676048755645752,
"learning_rate": 0.00016641521315332265,
"loss": 0.4507,
"step": 364
},
{
"epoch": 0.2751472320376914,
"grad_norm": 7.88531494140625,
"learning_rate": 0.00016623669128501504,
"loss": 0.4411,
"step": 365
},
{
"epoch": 0.2759010600706714,
"grad_norm": 7.499680995941162,
"learning_rate": 0.00016605779251628903,
"loss": 0.4629,
"step": 366
},
{
"epoch": 0.27665488810365135,
"grad_norm": 6.773830890655518,
"learning_rate": 0.00016587851786511543,
"loss": 0.4571,
"step": 367
},
{
"epoch": 0.2774087161366313,
"grad_norm": 7.170431613922119,
"learning_rate": 0.00016569886835160399,
"loss": 0.4313,
"step": 368
},
{
"epoch": 0.2781625441696113,
"grad_norm": 6.66681432723999,
"learning_rate": 0.0001655188449979974,
"loss": 0.425,
"step": 369
},
{
"epoch": 0.2789163722025913,
"grad_norm": 6.042294025421143,
"learning_rate": 0.00016533844882866568,
"loss": 0.4236,
"step": 370
},
{
"epoch": 0.27967020023557126,
"grad_norm": 6.5642924308776855,
"learning_rate": 0.00016515768087010013,
"loss": 0.4404,
"step": 371
},
{
"epoch": 0.28042402826855123,
"grad_norm": 7.063207626342773,
"learning_rate": 0.00016497654215090772,
"loss": 0.428,
"step": 372
},
{
"epoch": 0.2811778563015312,
"grad_norm": 6.705799579620361,
"learning_rate": 0.00016479503370180507,
"loss": 0.431,
"step": 373
},
{
"epoch": 0.2819316843345112,
"grad_norm": 6.578817367553711,
"learning_rate": 0.00016461315655561263,
"loss": 0.4126,
"step": 374
},
{
"epoch": 0.2826855123674912,
"grad_norm": 6.545943260192871,
"learning_rate": 0.00016443091174724885,
"loss": 0.4198,
"step": 375
},
{
"epoch": 0.28343934040047114,
"grad_norm": 6.834047794342041,
"learning_rate": 0.00016424830031372425,
"loss": 0.4378,
"step": 376
},
{
"epoch": 0.2841931684334511,
"grad_norm": 7.931153774261475,
"learning_rate": 0.00016406532329413546,
"loss": 0.4529,
"step": 377
},
{
"epoch": 0.28494699646643107,
"grad_norm": 7.077485084533691,
"learning_rate": 0.00016388198172965942,
"loss": 0.4281,
"step": 378
},
{
"epoch": 0.2857008244994111,
"grad_norm": 7.532230854034424,
"learning_rate": 0.00016369827666354745,
"loss": 0.4064,
"step": 379
},
{
"epoch": 0.28645465253239105,
"grad_norm": 7.111504554748535,
"learning_rate": 0.00016351420914111916,
"loss": 0.4392,
"step": 380
},
{
"epoch": 0.287208480565371,
"grad_norm": 7.107287883758545,
"learning_rate": 0.0001633297802097567,
"loss": 0.3896,
"step": 381
},
{
"epoch": 0.287962308598351,
"grad_norm": 6.906205654144287,
"learning_rate": 0.0001631449909188987,
"loss": 0.4263,
"step": 382
},
{
"epoch": 0.288716136631331,
"grad_norm": 7.226500034332275,
"learning_rate": 0.00016295984232003426,
"loss": 0.4482,
"step": 383
},
{
"epoch": 0.28946996466431096,
"grad_norm": 6.622352123260498,
"learning_rate": 0.00016277433546669703,
"loss": 0.4044,
"step": 384
},
{
"epoch": 0.2902237926972909,
"grad_norm": 7.164252281188965,
"learning_rate": 0.00016258847141445928,
"loss": 0.4253,
"step": 385
},
{
"epoch": 0.2909776207302709,
"grad_norm": 7.356839656829834,
"learning_rate": 0.00016240225122092573,
"loss": 0.427,
"step": 386
},
{
"epoch": 0.29173144876325086,
"grad_norm": 8.345090866088867,
"learning_rate": 0.00016221567594572762,
"loss": 0.4204,
"step": 387
},
{
"epoch": 0.2924852767962309,
"grad_norm": 7.662243366241455,
"learning_rate": 0.00016202874665051674,
"loss": 0.393,
"step": 388
},
{
"epoch": 0.29323910482921084,
"grad_norm": 7.708904266357422,
"learning_rate": 0.00016184146439895928,
"loss": 0.411,
"step": 389
},
{
"epoch": 0.2939929328621908,
"grad_norm": 7.000946044921875,
"learning_rate": 0.00016165383025672981,
"loss": 0.3893,
"step": 390
},
{
"epoch": 0.29474676089517077,
"grad_norm": 7.401767253875732,
"learning_rate": 0.00016146584529150526,
"loss": 0.3869,
"step": 391
},
{
"epoch": 0.2955005889281508,
"grad_norm": 7.715709209442139,
"learning_rate": 0.0001612775105729588,
"loss": 0.402,
"step": 392
},
{
"epoch": 0.29625441696113075,
"grad_norm": 8.78487491607666,
"learning_rate": 0.00016108882717275384,
"loss": 0.4899,
"step": 393
},
{
"epoch": 0.2970082449941107,
"grad_norm": 9.631272315979004,
"learning_rate": 0.0001608997961645377,
"loss": 0.4919,
"step": 394
},
{
"epoch": 0.2977620730270907,
"grad_norm": 9.458671569824219,
"learning_rate": 0.00016071041862393578,
"loss": 0.4955,
"step": 395
},
{
"epoch": 0.2985159010600707,
"grad_norm": 10.232501029968262,
"learning_rate": 0.0001605206956285454,
"loss": 0.4977,
"step": 396
},
{
"epoch": 0.29926972909305066,
"grad_norm": 9.963619232177734,
"learning_rate": 0.00016033062825792935,
"loss": 0.4679,
"step": 397
},
{
"epoch": 0.3000235571260306,
"grad_norm": 12.23200798034668,
"learning_rate": 0.0001601402175936102,
"loss": 0.5541,
"step": 398
},
{
"epoch": 0.3007773851590106,
"grad_norm": 11.938904762268066,
"learning_rate": 0.00015994946471906382,
"loss": 0.4678,
"step": 399
},
{
"epoch": 0.30153121319199055,
"grad_norm": 14.236066818237305,
"learning_rate": 0.0001597583707197134,
"loss": 0.534,
"step": 400
},
{
"epoch": 0.3022850412249706,
"grad_norm": 12.790224075317383,
"learning_rate": 0.00015956693668292313,
"loss": 0.6361,
"step": 401
},
{
"epoch": 0.30303886925795054,
"grad_norm": 14.324430465698242,
"learning_rate": 0.00015937516369799216,
"loss": 0.6471,
"step": 402
},
{
"epoch": 0.3037926972909305,
"grad_norm": 10.209970474243164,
"learning_rate": 0.00015918305285614822,
"loss": 0.5906,
"step": 403
},
{
"epoch": 0.30454652532391047,
"grad_norm": 7.869755744934082,
"learning_rate": 0.00015899060525054157,
"loss": 0.5408,
"step": 404
},
{
"epoch": 0.3053003533568905,
"grad_norm": 6.786082744598389,
"learning_rate": 0.0001587978219762388,
"loss": 0.5095,
"step": 405
},
{
"epoch": 0.30605418138987045,
"grad_norm": 8.50927448272705,
"learning_rate": 0.00015860470413021642,
"loss": 0.5117,
"step": 406
},
{
"epoch": 0.3068080094228504,
"grad_norm": 7.6895833015441895,
"learning_rate": 0.00015841125281135473,
"loss": 0.4919,
"step": 407
},
{
"epoch": 0.3075618374558304,
"grad_norm": 7.566605567932129,
"learning_rate": 0.00015821746912043165,
"loss": 0.4561,
"step": 408
},
{
"epoch": 0.30831566548881034,
"grad_norm": 7.5333452224731445,
"learning_rate": 0.00015802335416011625,
"loss": 0.4735,
"step": 409
},
{
"epoch": 0.30906949352179036,
"grad_norm": 7.508667469024658,
"learning_rate": 0.00015782890903496264,
"loss": 0.4461,
"step": 410
},
{
"epoch": 0.3098233215547703,
"grad_norm": 6.778057098388672,
"learning_rate": 0.00015763413485140365,
"loss": 0.4589,
"step": 411
},
{
"epoch": 0.3105771495877503,
"grad_norm": 6.7967915534973145,
"learning_rate": 0.00015743903271774455,
"loss": 0.4438,
"step": 412
},
{
"epoch": 0.31133097762073025,
"grad_norm": 7.60194730758667,
"learning_rate": 0.0001572436037441566,
"loss": 0.4371,
"step": 413
},
{
"epoch": 0.3120848056537103,
"grad_norm": 7.298644065856934,
"learning_rate": 0.00015704784904267097,
"loss": 0.4678,
"step": 414
},
{
"epoch": 0.31283863368669024,
"grad_norm": 6.711719036102295,
"learning_rate": 0.00015685176972717223,
"loss": 0.4511,
"step": 415
},
{
"epoch": 0.3135924617196702,
"grad_norm": 8.647915840148926,
"learning_rate": 0.00015665536691339207,
"loss": 0.4697,
"step": 416
},
{
"epoch": 0.31434628975265017,
"grad_norm": 7.388605117797852,
"learning_rate": 0.00015645864171890295,
"loss": 0.4322,
"step": 417
},
{
"epoch": 0.31510011778563013,
"grad_norm": 7.3222198486328125,
"learning_rate": 0.00015626159526311174,
"loss": 0.4366,
"step": 418
},
{
"epoch": 0.31585394581861015,
"grad_norm": 6.875087738037109,
"learning_rate": 0.00015606422866725343,
"loss": 0.4464,
"step": 419
},
{
"epoch": 0.3166077738515901,
"grad_norm": 6.434317111968994,
"learning_rate": 0.00015586654305438456,
"loss": 0.4161,
"step": 420
},
{
"epoch": 0.3173616018845701,
"grad_norm": 7.1308488845825195,
"learning_rate": 0.00015566853954937694,
"loss": 0.4558,
"step": 421
},
{
"epoch": 0.31811542991755004,
"grad_norm": 7.582878112792969,
"learning_rate": 0.00015547021927891144,
"loss": 0.4789,
"step": 422
},
{
"epoch": 0.31886925795053006,
"grad_norm": 6.73392391204834,
"learning_rate": 0.00015527158337147112,
"loss": 0.45,
"step": 423
},
{
"epoch": 0.31962308598351,
"grad_norm": 7.364933967590332,
"learning_rate": 0.00015507263295733528,
"loss": 0.4156,
"step": 424
},
{
"epoch": 0.32037691401649,
"grad_norm": 6.4493842124938965,
"learning_rate": 0.00015487336916857278,
"loss": 0.4147,
"step": 425
},
{
"epoch": 0.32113074204946995,
"grad_norm": 6.886701583862305,
"learning_rate": 0.00015467379313903557,
"loss": 0.4271,
"step": 426
},
{
"epoch": 0.3218845700824499,
"grad_norm": 6.938616752624512,
"learning_rate": 0.00015447390600435238,
"loss": 0.4356,
"step": 427
},
{
"epoch": 0.32263839811542994,
"grad_norm": 7.1376214027404785,
"learning_rate": 0.00015427370890192224,
"loss": 0.411,
"step": 428
},
{
"epoch": 0.3233922261484099,
"grad_norm": 7.260872840881348,
"learning_rate": 0.00015407320297090786,
"loss": 0.4505,
"step": 429
},
{
"epoch": 0.32414605418138986,
"grad_norm": 7.035525321960449,
"learning_rate": 0.00015387238935222927,
"loss": 0.4032,
"step": 430
},
{
"epoch": 0.32489988221436983,
"grad_norm": 6.7771782875061035,
"learning_rate": 0.00015367126918855738,
"loss": 0.4135,
"step": 431
},
{
"epoch": 0.32565371024734985,
"grad_norm": 7.255315780639648,
"learning_rate": 0.0001534698436243073,
"loss": 0.4376,
"step": 432
},
{
"epoch": 0.3264075382803298,
"grad_norm": 6.563286781311035,
"learning_rate": 0.00015326811380563204,
"loss": 0.3936,
"step": 433
},
{
"epoch": 0.3271613663133098,
"grad_norm": 8.582233428955078,
"learning_rate": 0.0001530660808804158,
"loss": 0.3979,
"step": 434
},
{
"epoch": 0.32791519434628974,
"grad_norm": 6.628231048583984,
"learning_rate": 0.00015286374599826754,
"loss": 0.4143,
"step": 435
},
{
"epoch": 0.3286690223792697,
"grad_norm": 6.581121921539307,
"learning_rate": 0.00015266111031051442,
"loss": 0.4313,
"step": 436
},
{
"epoch": 0.3294228504122497,
"grad_norm": 6.923291206359863,
"learning_rate": 0.00015245817497019524,
"loss": 0.3921,
"step": 437
},
{
"epoch": 0.3301766784452297,
"grad_norm": 7.172369480133057,
"learning_rate": 0.00015225494113205393,
"loss": 0.4249,
"step": 438
},
{
"epoch": 0.33093050647820965,
"grad_norm": 7.134575843811035,
"learning_rate": 0.00015205140995253283,
"loss": 0.4148,
"step": 439
},
{
"epoch": 0.3316843345111896,
"grad_norm": 8.403553009033203,
"learning_rate": 0.00015184758258976637,
"loss": 0.447,
"step": 440
},
{
"epoch": 0.33243816254416964,
"grad_norm": 7.707136154174805,
"learning_rate": 0.00015164346020357417,
"loss": 0.4165,
"step": 441
},
{
"epoch": 0.3331919905771496,
"grad_norm": 8.08395004272461,
"learning_rate": 0.00015143904395545466,
"loss": 0.461,
"step": 442
},
{
"epoch": 0.33394581861012956,
"grad_norm": 9.609329223632812,
"learning_rate": 0.0001512343350085784,
"loss": 0.5137,
"step": 443
},
{
"epoch": 0.3346996466431095,
"grad_norm": 9.876978874206543,
"learning_rate": 0.0001510293345277815,
"loss": 0.5053,
"step": 444
},
{
"epoch": 0.3354534746760895,
"grad_norm": 9.40042495727539,
"learning_rate": 0.0001508240436795589,
"loss": 0.5114,
"step": 445
},
{
"epoch": 0.3362073027090695,
"grad_norm": 10.623950958251953,
"learning_rate": 0.00015061846363205784,
"loss": 0.497,
"step": 446
},
{
"epoch": 0.3369611307420495,
"grad_norm": 10.993450164794922,
"learning_rate": 0.00015041259555507108,
"loss": 0.49,
"step": 447
},
{
"epoch": 0.33771495877502944,
"grad_norm": 11.963092803955078,
"learning_rate": 0.00015020644062003046,
"loss": 0.5261,
"step": 448
},
{
"epoch": 0.3384687868080094,
"grad_norm": 11.985857963562012,
"learning_rate": 0.00015000000000000001,
"loss": 0.5063,
"step": 449
},
{
"epoch": 0.3392226148409894,
"grad_norm": 13.582792282104492,
"learning_rate": 0.00014979327486966938,
"loss": 0.4568,
"step": 450
},
{
"epoch": 0.3399764428739694,
"grad_norm": 10.956193923950195,
"learning_rate": 0.0001495862664053471,
"loss": 0.6271,
"step": 451
},
{
"epoch": 0.34073027090694935,
"grad_norm": 10.826944351196289,
"learning_rate": 0.0001493789757849541,
"loss": 0.5646,
"step": 452
},
{
"epoch": 0.3414840989399293,
"grad_norm": 9.086105346679688,
"learning_rate": 0.00014917140418801655,
"loss": 0.5347,
"step": 453
},
{
"epoch": 0.3422379269729093,
"grad_norm": 7.542895317077637,
"learning_rate": 0.00014896355279565976,
"loss": 0.547,
"step": 454
},
{
"epoch": 0.3429917550058893,
"grad_norm": 6.925205707550049,
"learning_rate": 0.00014875542279060085,
"loss": 0.5174,
"step": 455
},
{
"epoch": 0.34374558303886926,
"grad_norm": 6.2740159034729,
"learning_rate": 0.00014854701535714244,
"loss": 0.4569,
"step": 456
},
{
"epoch": 0.3444994110718492,
"grad_norm": 6.751154899597168,
"learning_rate": 0.00014833833168116582,
"loss": 0.4848,
"step": 457
},
{
"epoch": 0.3452532391048292,
"grad_norm": 6.805966854095459,
"learning_rate": 0.00014812937295012406,
"loss": 0.454,
"step": 458
},
{
"epoch": 0.3460070671378092,
"grad_norm": 6.805473327636719,
"learning_rate": 0.00014792014035303535,
"loss": 0.4459,
"step": 459
},
{
"epoch": 0.3467608951707892,
"grad_norm": 6.896597385406494,
"learning_rate": 0.00014771063508047636,
"loss": 0.4492,
"step": 460
},
{
"epoch": 0.34751472320376914,
"grad_norm": 6.992384433746338,
"learning_rate": 0.00014750085832457519,
"loss": 0.4737,
"step": 461
},
{
"epoch": 0.3482685512367491,
"grad_norm": 7.02846622467041,
"learning_rate": 0.00014729081127900476,
"loss": 0.4786,
"step": 462
},
{
"epoch": 0.34902237926972907,
"grad_norm": 7.123291015625,
"learning_rate": 0.0001470804951389761,
"loss": 0.4397,
"step": 463
},
{
"epoch": 0.3497762073027091,
"grad_norm": 6.681251049041748,
"learning_rate": 0.00014686991110123135,
"loss": 0.4398,
"step": 464
},
{
"epoch": 0.35053003533568905,
"grad_norm": 7.414073944091797,
"learning_rate": 0.00014665906036403706,
"loss": 0.4626,
"step": 465
},
{
"epoch": 0.351283863368669,
"grad_norm": 6.917845726013184,
"learning_rate": 0.00014644794412717736,
"loss": 0.4312,
"step": 466
},
{
"epoch": 0.352037691401649,
"grad_norm": 6.451867580413818,
"learning_rate": 0.00014623656359194712,
"loss": 0.4101,
"step": 467
},
{
"epoch": 0.352791519434629,
"grad_norm": 7.152139663696289,
"learning_rate": 0.00014602491996114516,
"loss": 0.4518,
"step": 468
},
{
"epoch": 0.35354534746760896,
"grad_norm": 7.701825141906738,
"learning_rate": 0.0001458130144390673,
"loss": 0.4568,
"step": 469
},
{
"epoch": 0.3542991755005889,
"grad_norm": 7.278562545776367,
"learning_rate": 0.00014560084823149965,
"loss": 0.4222,
"step": 470
},
{
"epoch": 0.3550530035335689,
"grad_norm": 6.47285270690918,
"learning_rate": 0.0001453884225457116,
"loss": 0.465,
"step": 471
},
{
"epoch": 0.35580683156654885,
"grad_norm": 6.140552520751953,
"learning_rate": 0.00014517573859044907,
"loss": 0.4219,
"step": 472
},
{
"epoch": 0.3565606595995289,
"grad_norm": 6.481984615325928,
"learning_rate": 0.00014496279757592766,
"loss": 0.4446,
"step": 473
},
{
"epoch": 0.35731448763250884,
"grad_norm": 6.575818061828613,
"learning_rate": 0.0001447496007138255,
"loss": 0.4297,
"step": 474
},
{
"epoch": 0.3580683156654888,
"grad_norm": 6.637454509735107,
"learning_rate": 0.00014453614921727668,
"loss": 0.4311,
"step": 475
},
{
"epoch": 0.35882214369846877,
"grad_norm": 6.832921981811523,
"learning_rate": 0.00014432244430086423,
"loss": 0.4469,
"step": 476
},
{
"epoch": 0.3595759717314488,
"grad_norm": 7.260216236114502,
"learning_rate": 0.00014410848718061312,
"loss": 0.4206,
"step": 477
},
{
"epoch": 0.36032979976442875,
"grad_norm": 6.812548637390137,
"learning_rate": 0.00014389427907398342,
"loss": 0.4146,
"step": 478
},
{
"epoch": 0.3610836277974087,
"grad_norm": 6.668044090270996,
"learning_rate": 0.00014367982119986342,
"loss": 0.4333,
"step": 479
},
{
"epoch": 0.3618374558303887,
"grad_norm": 7.100220680236816,
"learning_rate": 0.00014346511477856259,
"loss": 0.4174,
"step": 480
},
{
"epoch": 0.3625912838633687,
"grad_norm": 7.15718936920166,
"learning_rate": 0.0001432501610318047,
"loss": 0.4258,
"step": 481
},
{
"epoch": 0.36334511189634866,
"grad_norm": 7.051331520080566,
"learning_rate": 0.00014303496118272084,
"loss": 0.4048,
"step": 482
},
{
"epoch": 0.3640989399293286,
"grad_norm": 7.344452381134033,
"learning_rate": 0.0001428195164558425,
"loss": 0.4137,
"step": 483
},
{
"epoch": 0.3648527679623086,
"grad_norm": 7.5303850173950195,
"learning_rate": 0.00014260382807709457,
"loss": 0.421,
"step": 484
},
{
"epoch": 0.36560659599528855,
"grad_norm": 6.944647789001465,
"learning_rate": 0.0001423878972737883,
"loss": 0.4059,
"step": 485
},
{
"epoch": 0.36636042402826857,
"grad_norm": 7.10966682434082,
"learning_rate": 0.0001421717252746145,
"loss": 0.4038,
"step": 486
},
{
"epoch": 0.36711425206124854,
"grad_norm": 6.702695369720459,
"learning_rate": 0.00014195531330963635,
"loss": 0.3999,
"step": 487
},
{
"epoch": 0.3678680800942285,
"grad_norm": 8.255915641784668,
"learning_rate": 0.0001417386626102825,
"loss": 0.3961,
"step": 488
},
{
"epoch": 0.36862190812720846,
"grad_norm": 8.199605941772461,
"learning_rate": 0.00014152177440934012,
"loss": 0.4079,
"step": 489
},
{
"epoch": 0.3693757361601885,
"grad_norm": 7.717386245727539,
"learning_rate": 0.0001413046499409477,
"loss": 0.3932,
"step": 490
},
{
"epoch": 0.37012956419316845,
"grad_norm": 7.842260837554932,
"learning_rate": 0.0001410872904405882,
"loss": 0.4383,
"step": 491
},
{
"epoch": 0.3708833922261484,
"grad_norm": 8.819681167602539,
"learning_rate": 0.00014086969714508196,
"loss": 0.4763,
"step": 492
},
{
"epoch": 0.3716372202591284,
"grad_norm": 8.904485702514648,
"learning_rate": 0.00014065187129257964,
"loss": 0.4711,
"step": 493
},
{
"epoch": 0.37239104829210834,
"grad_norm": 9.481599807739258,
"learning_rate": 0.00014043381412255526,
"loss": 0.5002,
"step": 494
},
{
"epoch": 0.37314487632508836,
"grad_norm": 9.55698013305664,
"learning_rate": 0.00014021552687579902,
"loss": 0.454,
"step": 495
},
{
"epoch": 0.3738987043580683,
"grad_norm": 9.685362815856934,
"learning_rate": 0.00013999701079441028,
"loss": 0.4687,
"step": 496
},
{
"epoch": 0.3746525323910483,
"grad_norm": 10.087312698364258,
"learning_rate": 0.00013977826712179058,
"loss": 0.4855,
"step": 497
},
{
"epoch": 0.37540636042402825,
"grad_norm": 10.978914260864258,
"learning_rate": 0.00013955929710263653,
"loss": 0.485,
"step": 498
},
{
"epoch": 0.37616018845700827,
"grad_norm": 11.427350044250488,
"learning_rate": 0.00013934010198293257,
"loss": 0.4536,
"step": 499
},
{
"epoch": 0.37691401648998824,
"grad_norm": 12.61874771118164,
"learning_rate": 0.00013912068300994413,
"loss": 0.4844,
"step": 500
},
{
"epoch": 0.3776678445229682,
"grad_norm": 11.156290054321289,
"learning_rate": 0.0001389010414322104,
"loss": 0.6025,
"step": 501
},
{
"epoch": 0.37842167255594816,
"grad_norm": 10.892552375793457,
"learning_rate": 0.0001386811784995371,
"loss": 0.6063,
"step": 502
},
{
"epoch": 0.3791755005889281,
"grad_norm": 9.48608112335205,
"learning_rate": 0.00013846109546298971,
"loss": 0.5153,
"step": 503
},
{
"epoch": 0.37992932862190815,
"grad_norm": 7.735827922821045,
"learning_rate": 0.00013824079357488598,
"loss": 0.5102,
"step": 504
},
{
"epoch": 0.3806831566548881,
"grad_norm": 6.837904453277588,
"learning_rate": 0.0001380202740887891,
"loss": 0.4952,
"step": 505
},
{
"epoch": 0.3814369846878681,
"grad_norm": 6.260585308074951,
"learning_rate": 0.00013779953825950034,
"loss": 0.4751,
"step": 506
},
{
"epoch": 0.38219081272084804,
"grad_norm": 6.398446083068848,
"learning_rate": 0.00013757858734305203,
"loss": 0.4449,
"step": 507
},
{
"epoch": 0.38294464075382806,
"grad_norm": 7.3623881340026855,
"learning_rate": 0.0001373574225967004,
"loss": 0.4859,
"step": 508
},
{
"epoch": 0.383698468786808,
"grad_norm": 7.673310279846191,
"learning_rate": 0.00013713604527891844,
"loss": 0.4804,
"step": 509
},
{
"epoch": 0.384452296819788,
"grad_norm": 6.531475067138672,
"learning_rate": 0.00013691445664938866,
"loss": 0.4491,
"step": 510
},
{
"epoch": 0.38520612485276795,
"grad_norm": 6.5302300453186035,
"learning_rate": 0.00013669265796899607,
"loss": 0.4277,
"step": 511
},
{
"epoch": 0.3859599528857479,
"grad_norm": 6.498359680175781,
"learning_rate": 0.00013647065049982078,
"loss": 0.4473,
"step": 512
},
{
"epoch": 0.38671378091872793,
"grad_norm": 7.777768135070801,
"learning_rate": 0.0001362484355051311,
"loss": 0.4485,
"step": 513
},
{
"epoch": 0.3874676089517079,
"grad_norm": 6.4952192306518555,
"learning_rate": 0.00013602601424937604,
"loss": 0.4144,
"step": 514
},
{
"epoch": 0.38822143698468786,
"grad_norm": 7.111438274383545,
"learning_rate": 0.00013580338799817844,
"loss": 0.4314,
"step": 515
},
{
"epoch": 0.3889752650176678,
"grad_norm": 6.711978435516357,
"learning_rate": 0.00013558055801832748,
"loss": 0.4476,
"step": 516
},
{
"epoch": 0.38972909305064785,
"grad_norm": 6.2299370765686035,
"learning_rate": 0.0001353575255777717,
"loss": 0.4211,
"step": 517
},
{
"epoch": 0.3904829210836278,
"grad_norm": 6.2404046058654785,
"learning_rate": 0.0001351342919456116,
"loss": 0.4195,
"step": 518
},
{
"epoch": 0.3912367491166078,
"grad_norm": 7.3141679763793945,
"learning_rate": 0.0001349108583920925,
"loss": 0.4473,
"step": 519
},
{
"epoch": 0.39199057714958774,
"grad_norm": 7.678971767425537,
"learning_rate": 0.00013468722618859743,
"loss": 0.4102,
"step": 520
},
{
"epoch": 0.3927444051825677,
"grad_norm": 6.773143291473389,
"learning_rate": 0.0001344633966076396,
"loss": 0.4518,
"step": 521
},
{
"epoch": 0.3934982332155477,
"grad_norm": 6.161088943481445,
"learning_rate": 0.00013423937092285555,
"loss": 0.4,
"step": 522
},
{
"epoch": 0.3942520612485277,
"grad_norm": 6.478328227996826,
"learning_rate": 0.00013401515040899746,
"loss": 0.4607,
"step": 523
},
{
"epoch": 0.39500588928150765,
"grad_norm": 6.1380157470703125,
"learning_rate": 0.00013379073634192632,
"loss": 0.4108,
"step": 524
},
{
"epoch": 0.3957597173144876,
"grad_norm": 6.8945441246032715,
"learning_rate": 0.00013356612999860436,
"loss": 0.4032,
"step": 525
},
{
"epoch": 0.39651354534746763,
"grad_norm": 6.745527267456055,
"learning_rate": 0.000133341332657088,
"loss": 0.402,
"step": 526
},
{
"epoch": 0.3972673733804476,
"grad_norm": 6.959543704986572,
"learning_rate": 0.00013311634559652036,
"loss": 0.4258,
"step": 527
},
{
"epoch": 0.39802120141342756,
"grad_norm": 6.5237298011779785,
"learning_rate": 0.00013289117009712418,
"loss": 0.4042,
"step": 528
},
{
"epoch": 0.3987750294464075,
"grad_norm": 6.997231483459473,
"learning_rate": 0.00013266580744019445,
"loss": 0.424,
"step": 529
},
{
"epoch": 0.3995288574793875,
"grad_norm": 7.053787708282471,
"learning_rate": 0.00013244025890809112,
"loss": 0.4436,
"step": 530
},
{
"epoch": 0.4002826855123675,
"grad_norm": 6.5921831130981445,
"learning_rate": 0.00013221452578423176,
"loss": 0.4262,
"step": 531
},
{
"epoch": 0.4010365135453475,
"grad_norm": 7.524543285369873,
"learning_rate": 0.00013198860935308444,
"loss": 0.4205,
"step": 532
},
{
"epoch": 0.40179034157832744,
"grad_norm": 6.691077709197998,
"learning_rate": 0.00013176251090016007,
"loss": 0.4303,
"step": 533
},
{
"epoch": 0.4025441696113074,
"grad_norm": 6.8649749755859375,
"learning_rate": 0.0001315362317120055,
"loss": 0.4293,
"step": 534
},
{
"epoch": 0.4032979976442874,
"grad_norm": 7.226325035095215,
"learning_rate": 0.00013130977307619594,
"loss": 0.4118,
"step": 535
},
{
"epoch": 0.4040518256772674,
"grad_norm": 6.9132843017578125,
"learning_rate": 0.0001310831362813276,
"loss": 0.4086,
"step": 536
},
{
"epoch": 0.40480565371024735,
"grad_norm": 6.638665199279785,
"learning_rate": 0.00013085632261701063,
"loss": 0.404,
"step": 537
},
{
"epoch": 0.4055594817432273,
"grad_norm": 6.809209823608398,
"learning_rate": 0.00013062933337386142,
"loss": 0.378,
"step": 538
},
{
"epoch": 0.4063133097762073,
"grad_norm": 6.697812557220459,
"learning_rate": 0.00013040216984349555,
"loss": 0.4068,
"step": 539
},
{
"epoch": 0.4070671378091873,
"grad_norm": 7.231639862060547,
"learning_rate": 0.00013017483331852035,
"loss": 0.4167,
"step": 540
},
{
"epoch": 0.40782096584216726,
"grad_norm": 7.607770919799805,
"learning_rate": 0.00012994732509252744,
"loss": 0.4298,
"step": 541
},
{
"epoch": 0.4085747938751472,
"grad_norm": 7.685420989990234,
"learning_rate": 0.00012971964646008542,
"loss": 0.4435,
"step": 542
},
{
"epoch": 0.4093286219081272,
"grad_norm": 9.00213623046875,
"learning_rate": 0.00012949179871673278,
"loss": 0.5072,
"step": 543
},
{
"epoch": 0.4100824499411072,
"grad_norm": 9.699268341064453,
"learning_rate": 0.00012926378315896998,
"loss": 0.5158,
"step": 544
},
{
"epoch": 0.41083627797408717,
"grad_norm": 10.096549987792969,
"learning_rate": 0.00012903560108425258,
"loss": 0.479,
"step": 545
},
{
"epoch": 0.41159010600706714,
"grad_norm": 9.205822944641113,
"learning_rate": 0.00012880725379098352,
"loss": 0.4844,
"step": 546
},
{
"epoch": 0.4123439340400471,
"grad_norm": 10.534090995788574,
"learning_rate": 0.00012857874257850605,
"loss": 0.4998,
"step": 547
},
{
"epoch": 0.41309776207302706,
"grad_norm": 11.49348258972168,
"learning_rate": 0.00012835006874709594,
"loss": 0.4969,
"step": 548
},
{
"epoch": 0.4138515901060071,
"grad_norm": 11.891164779663086,
"learning_rate": 0.00012812123359795446,
"loss": 0.5109,
"step": 549
},
{
"epoch": 0.41460541813898705,
"grad_norm": 12.372316360473633,
"learning_rate": 0.00012789223843320073,
"loss": 0.4808,
"step": 550
},
{
"epoch": 0.415359246171967,
"grad_norm": 9.265199661254883,
"learning_rate": 0.0001276630845558644,
"loss": 0.6065,
"step": 551
},
{
"epoch": 0.416113074204947,
"grad_norm": 10.428581237792969,
"learning_rate": 0.00012743377326987826,
"loss": 0.5885,
"step": 552
},
{
"epoch": 0.416866902237927,
"grad_norm": 8.8326997756958,
"learning_rate": 0.00012720430588007077,
"loss": 0.5599,
"step": 553
},
{
"epoch": 0.41762073027090696,
"grad_norm": 6.87199592590332,
"learning_rate": 0.00012697468369215863,
"loss": 0.5212,
"step": 554
},
{
"epoch": 0.4183745583038869,
"grad_norm": 6.59550142288208,
"learning_rate": 0.00012674490801273938,
"loss": 0.5265,
"step": 555
},
{
"epoch": 0.4191283863368669,
"grad_norm": 5.809760093688965,
"learning_rate": 0.00012651498014928402,
"loss": 0.4861,
"step": 556
},
{
"epoch": 0.41988221436984685,
"grad_norm": 5.872656345367432,
"learning_rate": 0.00012628490141012937,
"loss": 0.4476,
"step": 557
},
{
"epoch": 0.42063604240282687,
"grad_norm": 6.835720062255859,
"learning_rate": 0.000126054673104471,
"loss": 0.4838,
"step": 558
},
{
"epoch": 0.42138987043580683,
"grad_norm": 6.669496059417725,
"learning_rate": 0.00012582429654235523,
"loss": 0.4167,
"step": 559
},
{
"epoch": 0.4221436984687868,
"grad_norm": 6.77216100692749,
"learning_rate": 0.00012559377303467226,
"loss": 0.4469,
"step": 560
},
{
"epoch": 0.42289752650176676,
"grad_norm": 6.118035793304443,
"learning_rate": 0.00012536310389314832,
"loss": 0.439,
"step": 561
},
{
"epoch": 0.4236513545347468,
"grad_norm": 6.0063886642456055,
"learning_rate": 0.0001251322904303383,
"loss": 0.4246,
"step": 562
},
{
"epoch": 0.42440518256772675,
"grad_norm": 6.384454727172852,
"learning_rate": 0.00012490133395961844,
"loss": 0.4427,
"step": 563
},
{
"epoch": 0.4251590106007067,
"grad_norm": 6.875798225402832,
"learning_rate": 0.00012467023579517856,
"loss": 0.4746,
"step": 564
},
{
"epoch": 0.4259128386336867,
"grad_norm": 6.876395225524902,
"learning_rate": 0.00012443899725201482,
"loss": 0.4639,
"step": 565
},
{
"epoch": 0.4266666666666667,
"grad_norm": 7.060841083526611,
"learning_rate": 0.00012420761964592223,
"loss": 0.4449,
"step": 566
},
{
"epoch": 0.42742049469964666,
"grad_norm": 6.859095573425293,
"learning_rate": 0.000123976104293487,
"loss": 0.4127,
"step": 567
},
{
"epoch": 0.4281743227326266,
"grad_norm": 6.3295135498046875,
"learning_rate": 0.00012374445251207914,
"loss": 0.4436,
"step": 568
},
{
"epoch": 0.4289281507656066,
"grad_norm": 6.203479766845703,
"learning_rate": 0.00012351266561984507,
"loss": 0.4493,
"step": 569
},
{
"epoch": 0.42968197879858655,
"grad_norm": 6.393275737762451,
"learning_rate": 0.00012328074493569993,
"loss": 0.451,
"step": 570
},
{
"epoch": 0.43043580683156657,
"grad_norm": 6.78492546081543,
"learning_rate": 0.0001230486917793202,
"loss": 0.4278,
"step": 571
},
{
"epoch": 0.43118963486454653,
"grad_norm": 6.327200889587402,
"learning_rate": 0.00012281650747113612,
"loss": 0.4422,
"step": 572
},
{
"epoch": 0.4319434628975265,
"grad_norm": 6.7098822593688965,
"learning_rate": 0.0001225841933323242,
"loss": 0.4556,
"step": 573
},
{
"epoch": 0.43269729093050646,
"grad_norm": 6.249898910522461,
"learning_rate": 0.00012235175068479984,
"loss": 0.4184,
"step": 574
},
{
"epoch": 0.4334511189634865,
"grad_norm": 6.380219459533691,
"learning_rate": 0.00012211918085120954,
"loss": 0.437,
"step": 575
},
{
"epoch": 0.43420494699646645,
"grad_norm": 6.367920875549316,
"learning_rate": 0.00012188648515492355,
"loss": 0.4269,
"step": 576
},
{
"epoch": 0.4349587750294464,
"grad_norm": 6.438598155975342,
"learning_rate": 0.00012165366492002832,
"loss": 0.4298,
"step": 577
},
{
"epoch": 0.4357126030624264,
"grad_norm": 6.798791408538818,
"learning_rate": 0.00012142072147131898,
"loss": 0.4204,
"step": 578
},
{
"epoch": 0.43646643109540634,
"grad_norm": 6.528103828430176,
"learning_rate": 0.00012118765613429173,
"loss": 0.4448,
"step": 579
},
{
"epoch": 0.43722025912838636,
"grad_norm": 6.5673909187316895,
"learning_rate": 0.0001209544702351363,
"loss": 0.432,
"step": 580
},
{
"epoch": 0.4379740871613663,
"grad_norm": 7.303831577301025,
"learning_rate": 0.00012072116510072858,
"loss": 0.4125,
"step": 581
},
{
"epoch": 0.4387279151943463,
"grad_norm": 6.5421576499938965,
"learning_rate": 0.00012048774205862279,
"loss": 0.4171,
"step": 582
},
{
"epoch": 0.43948174322732625,
"grad_norm": 6.537741661071777,
"learning_rate": 0.0001202542024370441,
"loss": 0.385,
"step": 583
},
{
"epoch": 0.44023557126030627,
"grad_norm": 6.6051530838012695,
"learning_rate": 0.00012002054756488115,
"loss": 0.3888,
"step": 584
},
{
"epoch": 0.44098939929328623,
"grad_norm": 6.796999454498291,
"learning_rate": 0.00011978677877167822,
"loss": 0.4049,
"step": 585
},
{
"epoch": 0.4417432273262662,
"grad_norm": 7.154036521911621,
"learning_rate": 0.00011955289738762796,
"loss": 0.4168,
"step": 586
},
{
"epoch": 0.44249705535924616,
"grad_norm": 6.852260112762451,
"learning_rate": 0.00011931890474356358,
"loss": 0.381,
"step": 587
},
{
"epoch": 0.4432508833922261,
"grad_norm": 6.91892671585083,
"learning_rate": 0.00011908480217095141,
"loss": 0.3895,
"step": 588
},
{
"epoch": 0.44400471142520614,
"grad_norm": 7.690057277679443,
"learning_rate": 0.00011885059100188341,
"loss": 0.4504,
"step": 589
},
{
"epoch": 0.4447585394581861,
"grad_norm": 7.000772476196289,
"learning_rate": 0.00011861627256906929,
"loss": 0.3868,
"step": 590
},
{
"epoch": 0.4455123674911661,
"grad_norm": 7.221988201141357,
"learning_rate": 0.00011838184820582923,
"loss": 0.4119,
"step": 591
},
{
"epoch": 0.44626619552414604,
"grad_norm": 8.583606719970703,
"learning_rate": 0.00011814731924608616,
"loss": 0.4087,
"step": 592
},
{
"epoch": 0.44702002355712606,
"grad_norm": 8.559534072875977,
"learning_rate": 0.00011791268702435816,
"loss": 0.4469,
"step": 593
},
{
"epoch": 0.447773851590106,
"grad_norm": 8.477254867553711,
"learning_rate": 0.0001176779528757509,
"loss": 0.476,
"step": 594
},
{
"epoch": 0.448527679623086,
"grad_norm": 9.82533073425293,
"learning_rate": 0.00011744311813595006,
"loss": 0.5395,
"step": 595
},
{
"epoch": 0.44928150765606595,
"grad_norm": 9.407917022705078,
"learning_rate": 0.00011720818414121368,
"loss": 0.4716,
"step": 596
},
{
"epoch": 0.4500353356890459,
"grad_norm": 11.39129638671875,
"learning_rate": 0.00011697315222836458,
"loss": 0.4827,
"step": 597
},
{
"epoch": 0.45078916372202593,
"grad_norm": 11.540337562561035,
"learning_rate": 0.0001167380237347828,
"loss": 0.4713,
"step": 598
},
{
"epoch": 0.4515429917550059,
"grad_norm": 10.345648765563965,
"learning_rate": 0.00011650279999839787,
"loss": 0.4148,
"step": 599
},
{
"epoch": 0.45229681978798586,
"grad_norm": 12.826940536499023,
"learning_rate": 0.00011626748235768128,
"loss": 0.487,
"step": 600
},
{
"epoch": 0.4530506478209658,
"grad_norm": 9.553250312805176,
"learning_rate": 0.00011603207215163894,
"loss": 0.5809,
"step": 601
},
{
"epoch": 0.45380447585394584,
"grad_norm": 9.77419662475586,
"learning_rate": 0.0001157965707198034,
"loss": 0.5538,
"step": 602
},
{
"epoch": 0.4545583038869258,
"grad_norm": 8.743382453918457,
"learning_rate": 0.00011556097940222628,
"loss": 0.5516,
"step": 603
},
{
"epoch": 0.45531213191990577,
"grad_norm": 7.538958549499512,
"learning_rate": 0.00011532529953947075,
"loss": 0.5119,
"step": 604
},
{
"epoch": 0.45606595995288574,
"grad_norm": 6.539525032043457,
"learning_rate": 0.00011508953247260379,
"loss": 0.499,
"step": 605
},
{
"epoch": 0.4568197879858657,
"grad_norm": 6.682277679443359,
"learning_rate": 0.00011485367954318856,
"loss": 0.4594,
"step": 606
},
{
"epoch": 0.4575736160188457,
"grad_norm": 5.594506740570068,
"learning_rate": 0.0001146177420932768,
"loss": 0.4609,
"step": 607
},
{
"epoch": 0.4583274440518257,
"grad_norm": 6.195127964019775,
"learning_rate": 0.00011438172146540123,
"loss": 0.4413,
"step": 608
},
{
"epoch": 0.45908127208480565,
"grad_norm": 6.665927410125732,
"learning_rate": 0.00011414561900256784,
"loss": 0.4492,
"step": 609
},
{
"epoch": 0.4598351001177856,
"grad_norm": 7.045360088348389,
"learning_rate": 0.00011390943604824826,
"loss": 0.4508,
"step": 610
},
{
"epoch": 0.46058892815076563,
"grad_norm": 7.470615386962891,
"learning_rate": 0.00011367317394637218,
"loss": 0.46,
"step": 611
},
{
"epoch": 0.4613427561837456,
"grad_norm": 6.948364734649658,
"learning_rate": 0.00011343683404131964,
"loss": 0.477,
"step": 612
},
{
"epoch": 0.46209658421672556,
"grad_norm": 6.797374248504639,
"learning_rate": 0.00011320041767791336,
"loss": 0.4726,
"step": 613
},
{
"epoch": 0.4628504122497055,
"grad_norm": 6.488336563110352,
"learning_rate": 0.00011296392620141114,
"loss": 0.4403,
"step": 614
},
{
"epoch": 0.4636042402826855,
"grad_norm": 7.050676345825195,
"learning_rate": 0.00011272736095749823,
"loss": 0.475,
"step": 615
},
{
"epoch": 0.4643580683156655,
"grad_norm": 6.4435038566589355,
"learning_rate": 0.00011249072329227959,
"loss": 0.4188,
"step": 616
},
{
"epoch": 0.46511189634864547,
"grad_norm": 6.662125110626221,
"learning_rate": 0.0001122540145522723,
"loss": 0.4365,
"step": 617
},
{
"epoch": 0.46586572438162543,
"grad_norm": 6.387564659118652,
"learning_rate": 0.00011201723608439778,
"loss": 0.4237,
"step": 618
},
{
"epoch": 0.4666195524146054,
"grad_norm": 6.151999473571777,
"learning_rate": 0.0001117803892359744,
"loss": 0.3967,
"step": 619
},
{
"epoch": 0.4673733804475854,
"grad_norm": 6.0764055252075195,
"learning_rate": 0.00011154347535470947,
"loss": 0.4032,
"step": 620
},
{
"epoch": 0.4681272084805654,
"grad_norm": 6.069274425506592,
"learning_rate": 0.00011130649578869173,
"loss": 0.4234,
"step": 621
},
{
"epoch": 0.46888103651354535,
"grad_norm": 6.283833980560303,
"learning_rate": 0.00011106945188638378,
"loss": 0.4115,
"step": 622
},
{
"epoch": 0.4696348645465253,
"grad_norm": 6.327964782714844,
"learning_rate": 0.00011083234499661426,
"loss": 0.4293,
"step": 623
},
{
"epoch": 0.4703886925795053,
"grad_norm": 6.516750812530518,
"learning_rate": 0.00011059517646857023,
"loss": 0.3893,
"step": 624
},
{
"epoch": 0.4711425206124853,
"grad_norm": 7.370739936828613,
"learning_rate": 0.00011035794765178941,
"loss": 0.4385,
"step": 625
},
{
"epoch": 0.47189634864546526,
"grad_norm": 7.1700568199157715,
"learning_rate": 0.0001101206598961527,
"loss": 0.4221,
"step": 626
},
{
"epoch": 0.4726501766784452,
"grad_norm": 6.261050701141357,
"learning_rate": 0.00010988331455187628,
"loss": 0.4389,
"step": 627
},
{
"epoch": 0.4734040047114252,
"grad_norm": 6.810924530029297,
"learning_rate": 0.00010964591296950406,
"loss": 0.4653,
"step": 628
},
{
"epoch": 0.4741578327444052,
"grad_norm": 6.419404983520508,
"learning_rate": 0.00010940845649989994,
"loss": 0.4074,
"step": 629
},
{
"epoch": 0.47491166077738517,
"grad_norm": 6.0266008377075195,
"learning_rate": 0.00010917094649424018,
"loss": 0.3729,
"step": 630
},
{
"epoch": 0.47566548881036513,
"grad_norm": 6.674122333526611,
"learning_rate": 0.00010893338430400562,
"loss": 0.4016,
"step": 631
},
{
"epoch": 0.4764193168433451,
"grad_norm": 6.93697452545166,
"learning_rate": 0.00010869577128097404,
"loss": 0.3884,
"step": 632
},
{
"epoch": 0.47717314487632506,
"grad_norm": 6.370805263519287,
"learning_rate": 0.00010845810877721252,
"loss": 0.3835,
"step": 633
},
{
"epoch": 0.4779269729093051,
"grad_norm": 6.402405738830566,
"learning_rate": 0.00010822039814506964,
"loss": 0.396,
"step": 634
},
{
"epoch": 0.47868080094228505,
"grad_norm": 6.631165027618408,
"learning_rate": 0.00010798264073716791,
"loss": 0.4034,
"step": 635
},
{
"epoch": 0.479434628975265,
"grad_norm": 7.069218635559082,
"learning_rate": 0.00010774483790639591,
"loss": 0.4071,
"step": 636
},
{
"epoch": 0.480188457008245,
"grad_norm": 6.614718914031982,
"learning_rate": 0.00010750699100590076,
"loss": 0.3959,
"step": 637
},
{
"epoch": 0.480942285041225,
"grad_norm": 6.693352699279785,
"learning_rate": 0.00010726910138908032,
"loss": 0.3853,
"step": 638
},
{
"epoch": 0.48169611307420496,
"grad_norm": 6.8856940269470215,
"learning_rate": 0.00010703117040957553,
"loss": 0.3904,
"step": 639
},
{
"epoch": 0.4824499411071849,
"grad_norm": 7.3366522789001465,
"learning_rate": 0.00010679319942126264,
"loss": 0.4061,
"step": 640
},
{
"epoch": 0.4832037691401649,
"grad_norm": 7.205180644989014,
"learning_rate": 0.00010655518977824566,
"loss": 0.4066,
"step": 641
},
{
"epoch": 0.48395759717314485,
"grad_norm": 9.314166069030762,
"learning_rate": 0.00010631714283484842,
"loss": 0.4507,
"step": 642
},
{
"epoch": 0.48471142520612487,
"grad_norm": 8.445844650268555,
"learning_rate": 0.0001060790599456071,
"loss": 0.4467,
"step": 643
},
{
"epoch": 0.48546525323910483,
"grad_norm": 8.920785903930664,
"learning_rate": 0.00010584094246526237,
"loss": 0.4593,
"step": 644
},
{
"epoch": 0.4862190812720848,
"grad_norm": 9.759257316589355,
"learning_rate": 0.00010560279174875179,
"loss": 0.5054,
"step": 645
},
{
"epoch": 0.48697290930506476,
"grad_norm": 9.649422645568848,
"learning_rate": 0.0001053646091512019,
"loss": 0.4891,
"step": 646
},
{
"epoch": 0.4877267373380448,
"grad_norm": 9.831908226013184,
"learning_rate": 0.00010512639602792088,
"loss": 0.4805,
"step": 647
},
{
"epoch": 0.48848056537102474,
"grad_norm": 11.026556968688965,
"learning_rate": 0.00010488815373439036,
"loss": 0.4875,
"step": 648
},
{
"epoch": 0.4892343934040047,
"grad_norm": 10.98789119720459,
"learning_rate": 0.00010464988362625812,
"loss": 0.4852,
"step": 649
},
{
"epoch": 0.48998822143698467,
"grad_norm": 12.804154396057129,
"learning_rate": 0.00010441158705933016,
"loss": 0.5069,
"step": 650
},
{
"epoch": 0.4907420494699647,
"grad_norm": 7.31414270401001,
"learning_rate": 0.00010417326538956305,
"loss": 0.5666,
"step": 651
},
{
"epoch": 0.49149587750294466,
"grad_norm": 7.537758827209473,
"learning_rate": 0.00010393491997305613,
"loss": 0.5592,
"step": 652
},
{
"epoch": 0.4922497055359246,
"grad_norm": 7.580841064453125,
"learning_rate": 0.00010369655216604397,
"loss": 0.4984,
"step": 653
},
{
"epoch": 0.4930035335689046,
"grad_norm": 7.048511028289795,
"learning_rate": 0.0001034581633248885,
"loss": 0.5271,
"step": 654
},
{
"epoch": 0.49375736160188455,
"grad_norm": 6.32865047454834,
"learning_rate": 0.00010321975480607129,
"loss": 0.4999,
"step": 655
},
{
"epoch": 0.49451118963486457,
"grad_norm": 5.981396675109863,
"learning_rate": 0.00010298132796618596,
"loss": 0.4717,
"step": 656
},
{
"epoch": 0.49526501766784453,
"grad_norm": 5.971866130828857,
"learning_rate": 0.00010274288416193034,
"loss": 0.4357,
"step": 657
},
{
"epoch": 0.4960188457008245,
"grad_norm": 5.870616912841797,
"learning_rate": 0.0001025044247500988,
"loss": 0.4475,
"step": 658
},
{
"epoch": 0.49677267373380446,
"grad_norm": 6.04547119140625,
"learning_rate": 0.00010226595108757451,
"loss": 0.4641,
"step": 659
},
{
"epoch": 0.4975265017667845,
"grad_norm": 6.311388969421387,
"learning_rate": 0.00010202746453132172,
"loss": 0.4697,
"step": 660
},
{
"epoch": 0.49828032979976444,
"grad_norm": 5.957773208618164,
"learning_rate": 0.00010178896643837809,
"loss": 0.4381,
"step": 661
},
{
"epoch": 0.4990341578327444,
"grad_norm": 6.014715671539307,
"learning_rate": 0.00010155045816584691,
"loss": 0.4429,
"step": 662
},
{
"epoch": 0.49978798586572437,
"grad_norm": 5.99500846862793,
"learning_rate": 0.00010131194107088935,
"loss": 0.4544,
"step": 663
},
{
"epoch": 0.5005418138987043,
"grad_norm": 6.102397918701172,
"learning_rate": 0.00010107341651071684,
"loss": 0.4437,
"step": 664
},
{
"epoch": 0.5005418138987043,
"eval_loss": 0.44807884097099304,
"eval_runtime": 126.4853,
"eval_samples_per_second": 17.67,
"eval_steps_per_second": 8.839,
"step": 664
},
{
"epoch": 0.5012956419316843,
"grad_norm": 5.838627338409424,
"learning_rate": 0.00010083488584258326,
"loss": 0.3961,
"step": 665
},
{
"epoch": 0.5020494699646643,
"grad_norm": 6.225624084472656,
"learning_rate": 0.00010059635042377725,
"loss": 0.4199,
"step": 666
},
{
"epoch": 0.5028032979976443,
"grad_norm": 5.906275749206543,
"learning_rate": 0.00010035781161161446,
"loss": 0.4164,
"step": 667
},
{
"epoch": 0.5035571260306243,
"grad_norm": 5.818455696105957,
"learning_rate": 0.0001001192707634299,
"loss": 0.3753,
"step": 668
},
{
"epoch": 0.5043109540636043,
"grad_norm": 6.505937099456787,
"learning_rate": 9.988072923657012e-05,
"loss": 0.4058,
"step": 669
},
{
"epoch": 0.5050647820965842,
"grad_norm": 6.205794811248779,
"learning_rate": 9.964218838838554e-05,
"loss": 0.4176,
"step": 670
},
{
"epoch": 0.5058186101295642,
"grad_norm": 6.019129753112793,
"learning_rate": 9.940364957622276e-05,
"loss": 0.4253,
"step": 671
},
{
"epoch": 0.5065724381625442,
"grad_norm": 5.988311290740967,
"learning_rate": 9.916511415741676e-05,
"loss": 0.399,
"step": 672
},
{
"epoch": 0.5073262661955241,
"grad_norm": 6.607666492462158,
"learning_rate": 9.892658348928316e-05,
"loss": 0.4154,
"step": 673
},
{
"epoch": 0.5080800942285041,
"grad_norm": 5.99027156829834,
"learning_rate": 9.868805892911067e-05,
"loss": 0.387,
"step": 674
},
{
"epoch": 0.508833922261484,
"grad_norm": 6.09193229675293,
"learning_rate": 9.84495418341531e-05,
"loss": 0.3817,
"step": 675
},
{
"epoch": 0.5095877502944641,
"grad_norm": 6.635573863983154,
"learning_rate": 9.821103356162189e-05,
"loss": 0.4021,
"step": 676
},
{
"epoch": 0.5103415783274441,
"grad_norm": 6.2010884284973145,
"learning_rate": 9.797253546867831e-05,
"loss": 0.3915,
"step": 677
},
{
"epoch": 0.511095406360424,
"grad_norm": 6.824472427368164,
"learning_rate": 9.773404891242551e-05,
"loss": 0.3946,
"step": 678
},
{
"epoch": 0.511849234393404,
"grad_norm": 7.179849147796631,
"learning_rate": 9.749557524990121e-05,
"loss": 0.4281,
"step": 679
},
{
"epoch": 0.512603062426384,
"grad_norm": 6.765272617340088,
"learning_rate": 9.72571158380697e-05,
"loss": 0.4113,
"step": 680
},
{
"epoch": 0.513356890459364,
"grad_norm": 6.409517765045166,
"learning_rate": 9.701867203381405e-05,
"loss": 0.387,
"step": 681
},
{
"epoch": 0.5141107184923439,
"grad_norm": 6.494263172149658,
"learning_rate": 9.678024519392871e-05,
"loss": 0.3783,
"step": 682
},
{
"epoch": 0.5148645465253239,
"grad_norm": 6.259777545928955,
"learning_rate": 9.654183667511154e-05,
"loss": 0.3996,
"step": 683
},
{
"epoch": 0.5156183745583038,
"grad_norm": 6.5478363037109375,
"learning_rate": 9.630344783395604e-05,
"loss": 0.3838,
"step": 684
},
{
"epoch": 0.5163722025912839,
"grad_norm": 7.6854071617126465,
"learning_rate": 9.606508002694386e-05,
"loss": 0.4235,
"step": 685
},
{
"epoch": 0.5171260306242639,
"grad_norm": 7.029118537902832,
"learning_rate": 9.5826734610437e-05,
"loss": 0.418,
"step": 686
},
{
"epoch": 0.5178798586572438,
"grad_norm": 7.062952518463135,
"learning_rate": 9.558841294066985e-05,
"loss": 0.4281,
"step": 687
},
{
"epoch": 0.5186336866902238,
"grad_norm": 6.547257900238037,
"learning_rate": 9.535011637374189e-05,
"loss": 0.4008,
"step": 688
},
{
"epoch": 0.5193875147232038,
"grad_norm": 7.128522872924805,
"learning_rate": 9.511184626560968e-05,
"loss": 0.4072,
"step": 689
},
{
"epoch": 0.5201413427561837,
"grad_norm": 6.604221343994141,
"learning_rate": 9.487360397207916e-05,
"loss": 0.3906,
"step": 690
},
{
"epoch": 0.5208951707891637,
"grad_norm": 7.471280574798584,
"learning_rate": 9.463539084879809e-05,
"loss": 0.4373,
"step": 691
},
{
"epoch": 0.5216489988221437,
"grad_norm": 7.444307804107666,
"learning_rate": 9.439720825124827e-05,
"loss": 0.4245,
"step": 692
},
{
"epoch": 0.5224028268551236,
"grad_norm": 7.748506546020508,
"learning_rate": 9.415905753473765e-05,
"loss": 0.4267,
"step": 693
},
{
"epoch": 0.5231566548881037,
"grad_norm": 8.47761344909668,
"learning_rate": 9.392094005439291e-05,
"loss": 0.4861,
"step": 694
},
{
"epoch": 0.5239104829210837,
"grad_norm": 9.239935874938965,
"learning_rate": 9.368285716515162e-05,
"loss": 0.45,
"step": 695
},
{
"epoch": 0.5246643109540636,
"grad_norm": 9.59188461303711,
"learning_rate": 9.344481022175436e-05,
"loss": 0.4876,
"step": 696
},
{
"epoch": 0.5254181389870436,
"grad_norm": 10.498910903930664,
"learning_rate": 9.320680057873735e-05,
"loss": 0.5021,
"step": 697
},
{
"epoch": 0.5261719670200236,
"grad_norm": 11.162120819091797,
"learning_rate": 9.29688295904245e-05,
"loss": 0.5001,
"step": 698
},
{
"epoch": 0.5269257950530035,
"grad_norm": 11.781893730163574,
"learning_rate": 9.273089861091969e-05,
"loss": 0.456,
"step": 699
},
{
"epoch": 0.5276796230859835,
"grad_norm": 15.090996742248535,
"learning_rate": 9.249300899409924e-05,
"loss": 0.5593,
"step": 700
},
{
"epoch": 0.5284334511189634,
"grad_norm": 9.527992248535156,
"learning_rate": 9.225516209360413e-05,
"loss": 0.5803,
"step": 701
},
{
"epoch": 0.5291872791519434,
"grad_norm": 8.856983184814453,
"learning_rate": 9.201735926283213e-05,
"loss": 0.5268,
"step": 702
},
{
"epoch": 0.5299411071849235,
"grad_norm": 7.78725528717041,
"learning_rate": 9.177960185493036e-05,
"loss": 0.5227,
"step": 703
},
{
"epoch": 0.5306949352179035,
"grad_norm": 7.152993679046631,
"learning_rate": 9.154189122278754e-05,
"loss": 0.5067,
"step": 704
},
{
"epoch": 0.5314487632508834,
"grad_norm": 6.18569278717041,
"learning_rate": 9.1304228719026e-05,
"loss": 0.476,
"step": 705
},
{
"epoch": 0.5322025912838634,
"grad_norm": 6.376234531402588,
"learning_rate": 9.106661569599442e-05,
"loss": 0.4734,
"step": 706
},
{
"epoch": 0.5329564193168433,
"grad_norm": 6.275115489959717,
"learning_rate": 9.082905350575986e-05,
"loss": 0.4468,
"step": 707
},
{
"epoch": 0.5337102473498233,
"grad_norm": 5.899405479431152,
"learning_rate": 9.059154350010008e-05,
"loss": 0.4738,
"step": 708
},
{
"epoch": 0.5344640753828033,
"grad_norm": 6.213337421417236,
"learning_rate": 9.035408703049596e-05,
"loss": 0.4732,
"step": 709
},
{
"epoch": 0.5352179034157832,
"grad_norm": 6.043967247009277,
"learning_rate": 9.011668544812377e-05,
"loss": 0.4514,
"step": 710
},
{
"epoch": 0.5359717314487632,
"grad_norm": 6.495950698852539,
"learning_rate": 8.987934010384733e-05,
"loss": 0.4468,
"step": 711
},
{
"epoch": 0.5367255594817433,
"grad_norm": 6.062058448791504,
"learning_rate": 8.96420523482106e-05,
"loss": 0.4311,
"step": 712
},
{
"epoch": 0.5374793875147232,
"grad_norm": 6.561244964599609,
"learning_rate": 8.940482353142983e-05,
"loss": 0.4621,
"step": 713
},
{
"epoch": 0.5382332155477032,
"grad_norm": 5.8635029792785645,
"learning_rate": 8.916765500338575e-05,
"loss": 0.4189,
"step": 714
},
{
"epoch": 0.5389870435806832,
"grad_norm": 6.959576606750488,
"learning_rate": 8.893054811361624e-05,
"loss": 0.4382,
"step": 715
},
{
"epoch": 0.5397408716136631,
"grad_norm": 5.93906307220459,
"learning_rate": 8.869350421130831e-05,
"loss": 0.4202,
"step": 716
},
{
"epoch": 0.5404946996466431,
"grad_norm": 5.888154029846191,
"learning_rate": 8.845652464529057e-05,
"loss": 0.4098,
"step": 717
},
{
"epoch": 0.5412485276796231,
"grad_norm": 6.113773345947266,
"learning_rate": 8.821961076402563e-05,
"loss": 0.412,
"step": 718
},
{
"epoch": 0.542002355712603,
"grad_norm": 6.2954607009887695,
"learning_rate": 8.79827639156022e-05,
"loss": 0.4472,
"step": 719
},
{
"epoch": 0.542756183745583,
"grad_norm": 6.085266590118408,
"learning_rate": 8.774598544772774e-05,
"loss": 0.4134,
"step": 720
},
{
"epoch": 0.5435100117785631,
"grad_norm": 5.995761871337891,
"learning_rate": 8.750927670772044e-05,
"loss": 0.4236,
"step": 721
},
{
"epoch": 0.544263839811543,
"grad_norm": 6.094368934631348,
"learning_rate": 8.727263904250178e-05,
"loss": 0.4344,
"step": 722
},
{
"epoch": 0.545017667844523,
"grad_norm": 6.14577579498291,
"learning_rate": 8.703607379858889e-05,
"loss": 0.396,
"step": 723
},
{
"epoch": 0.545771495877503,
"grad_norm": 5.814198970794678,
"learning_rate": 8.679958232208668e-05,
"loss": 0.3987,
"step": 724
},
{
"epoch": 0.5465253239104829,
"grad_norm": 6.348716735839844,
"learning_rate": 8.656316595868037e-05,
"loss": 0.4263,
"step": 725
},
{
"epoch": 0.5472791519434629,
"grad_norm": 6.51011323928833,
"learning_rate": 8.632682605362784e-05,
"loss": 0.4361,
"step": 726
},
{
"epoch": 0.5480329799764428,
"grad_norm": 6.134734630584717,
"learning_rate": 8.609056395175175e-05,
"loss": 0.3946,
"step": 727
},
{
"epoch": 0.5487868080094228,
"grad_norm": 6.129810333251953,
"learning_rate": 8.585438099743217e-05,
"loss": 0.3948,
"step": 728
},
{
"epoch": 0.5495406360424028,
"grad_norm": 6.51365852355957,
"learning_rate": 8.56182785345988e-05,
"loss": 0.4182,
"step": 729
},
{
"epoch": 0.5502944640753828,
"grad_norm": 6.257938861846924,
"learning_rate": 8.538225790672322e-05,
"loss": 0.4041,
"step": 730
},
{
"epoch": 0.5510482921083628,
"grad_norm": 6.626195430755615,
"learning_rate": 8.514632045681145e-05,
"loss": 0.4291,
"step": 731
},
{
"epoch": 0.5518021201413428,
"grad_norm": 6.350541591644287,
"learning_rate": 8.491046752739624e-05,
"loss": 0.4113,
"step": 732
},
{
"epoch": 0.5525559481743227,
"grad_norm": 6.342377185821533,
"learning_rate": 8.467470046052927e-05,
"loss": 0.3725,
"step": 733
},
{
"epoch": 0.5533097762073027,
"grad_norm": 6.338717460632324,
"learning_rate": 8.443902059777373e-05,
"loss": 0.4044,
"step": 734
},
{
"epoch": 0.5540636042402827,
"grad_norm": 6.489543914794922,
"learning_rate": 8.420342928019666e-05,
"loss": 0.3806,
"step": 735
},
{
"epoch": 0.5548174322732626,
"grad_norm": 6.675236701965332,
"learning_rate": 8.396792784836108e-05,
"loss": 0.3937,
"step": 736
},
{
"epoch": 0.5555712603062426,
"grad_norm": 7.242746829986572,
"learning_rate": 8.373251764231872e-05,
"loss": 0.3968,
"step": 737
},
{
"epoch": 0.5563250883392226,
"grad_norm": 6.987369537353516,
"learning_rate": 8.349720000160218e-05,
"loss": 0.3878,
"step": 738
},
{
"epoch": 0.5570789163722026,
"grad_norm": 7.393560886383057,
"learning_rate": 8.326197626521723e-05,
"loss": 0.3883,
"step": 739
},
{
"epoch": 0.5578327444051826,
"grad_norm": 7.474055290222168,
"learning_rate": 8.30268477716354e-05,
"loss": 0.4183,
"step": 740
},
{
"epoch": 0.5585865724381626,
"grad_norm": 7.556806564331055,
"learning_rate": 8.279181585878635e-05,
"loss": 0.4282,
"step": 741
},
{
"epoch": 0.5593404004711425,
"grad_norm": 8.794517517089844,
"learning_rate": 8.255688186404996e-05,
"loss": 0.4694,
"step": 742
},
{
"epoch": 0.5600942285041225,
"grad_norm": 9.162858963012695,
"learning_rate": 8.232204712424911e-05,
"loss": 0.4888,
"step": 743
},
{
"epoch": 0.5608480565371025,
"grad_norm": 9.154852867126465,
"learning_rate": 8.208731297564189e-05,
"loss": 0.4735,
"step": 744
},
{
"epoch": 0.5616018845700824,
"grad_norm": 9.025120735168457,
"learning_rate": 8.185268075391388e-05,
"loss": 0.4743,
"step": 745
},
{
"epoch": 0.5623557126030624,
"grad_norm": 9.328535079956055,
"learning_rate": 8.161815179417078e-05,
"loss": 0.4575,
"step": 746
},
{
"epoch": 0.5631095406360423,
"grad_norm": 9.941339492797852,
"learning_rate": 8.138372743093076e-05,
"loss": 0.4969,
"step": 747
},
{
"epoch": 0.5638633686690224,
"grad_norm": 9.928484916687012,
"learning_rate": 8.114940899811662e-05,
"loss": 0.4634,
"step": 748
},
{
"epoch": 0.5646171967020024,
"grad_norm": 10.29101848602295,
"learning_rate": 8.091519782904857e-05,
"loss": 0.4114,
"step": 749
},
{
"epoch": 0.5653710247349824,
"grad_norm": 15.212136268615723,
"learning_rate": 8.068109525643647e-05,
"loss": 0.516,
"step": 750
},
{
"epoch": 0.5661248527679623,
"grad_norm": 8.223611831665039,
"learning_rate": 8.044710261237207e-05,
"loss": 0.541,
"step": 751
},
{
"epoch": 0.5668786808009423,
"grad_norm": 8.392924308776855,
"learning_rate": 8.021322122832178e-05,
"loss": 0.5317,
"step": 752
},
{
"epoch": 0.5676325088339222,
"grad_norm": 8.130448341369629,
"learning_rate": 7.99794524351189e-05,
"loss": 0.4935,
"step": 753
},
{
"epoch": 0.5683863368669022,
"grad_norm": 6.9753899574279785,
"learning_rate": 7.974579756295591e-05,
"loss": 0.4941,
"step": 754
},
{
"epoch": 0.5691401648998822,
"grad_norm": 6.365013122558594,
"learning_rate": 7.951225794137724e-05,
"loss": 0.4539,
"step": 755
},
{
"epoch": 0.5698939929328621,
"grad_norm": 5.7341628074646,
"learning_rate": 7.927883489927147e-05,
"loss": 0.4197,
"step": 756
},
{
"epoch": 0.5706478209658422,
"grad_norm": 6.036746025085449,
"learning_rate": 7.904552976486372e-05,
"loss": 0.4361,
"step": 757
},
{
"epoch": 0.5714016489988222,
"grad_norm": 5.587414264678955,
"learning_rate": 7.88123438657083e-05,
"loss": 0.4294,
"step": 758
},
{
"epoch": 0.5721554770318021,
"grad_norm": 5.824455738067627,
"learning_rate": 7.857927852868107e-05,
"loss": 0.426,
"step": 759
},
{
"epoch": 0.5729093050647821,
"grad_norm": 5.811740398406982,
"learning_rate": 7.83463350799717e-05,
"loss": 0.4336,
"step": 760
},
{
"epoch": 0.5736631330977621,
"grad_norm": 5.9260945320129395,
"learning_rate": 7.811351484507647e-05,
"loss": 0.4609,
"step": 761
},
{
"epoch": 0.574416961130742,
"grad_norm": 6.589666843414307,
"learning_rate": 7.788081914879051e-05,
"loss": 0.4384,
"step": 762
},
{
"epoch": 0.575170789163722,
"grad_norm": 5.957409858703613,
"learning_rate": 7.764824931520018e-05,
"loss": 0.4261,
"step": 763
},
{
"epoch": 0.575924617196702,
"grad_norm": 6.138071060180664,
"learning_rate": 7.741580666767583e-05,
"loss": 0.4189,
"step": 764
},
{
"epoch": 0.5766784452296819,
"grad_norm": 5.744472503662109,
"learning_rate": 7.718349252886395e-05,
"loss": 0.4086,
"step": 765
},
{
"epoch": 0.577432273262662,
"grad_norm": 6.045204162597656,
"learning_rate": 7.695130822067984e-05,
"loss": 0.4306,
"step": 766
},
{
"epoch": 0.578186101295642,
"grad_norm": 5.609772682189941,
"learning_rate": 7.67192550643001e-05,
"loss": 0.3998,
"step": 767
},
{
"epoch": 0.5789399293286219,
"grad_norm": 5.921622276306152,
"learning_rate": 7.648733438015493e-05,
"loss": 0.4225,
"step": 768
},
{
"epoch": 0.5796937573616019,
"grad_norm": 6.352652072906494,
"learning_rate": 7.625554748792085e-05,
"loss": 0.4193,
"step": 769
},
{
"epoch": 0.5804475853945819,
"grad_norm": 6.210894584655762,
"learning_rate": 7.602389570651303e-05,
"loss": 0.4119,
"step": 770
},
{
"epoch": 0.5812014134275618,
"grad_norm": 6.061959743499756,
"learning_rate": 7.579238035407776e-05,
"loss": 0.4097,
"step": 771
},
{
"epoch": 0.5819552414605418,
"grad_norm": 6.42627477645874,
"learning_rate": 7.556100274798519e-05,
"loss": 0.4226,
"step": 772
},
{
"epoch": 0.5827090694935217,
"grad_norm": 6.124332904815674,
"learning_rate": 7.532976420482146e-05,
"loss": 0.396,
"step": 773
},
{
"epoch": 0.5834628975265017,
"grad_norm": 5.928023815155029,
"learning_rate": 7.509866604038157e-05,
"loss": 0.3897,
"step": 774
},
{
"epoch": 0.5842167255594818,
"grad_norm": 6.037590503692627,
"learning_rate": 7.486770956966171e-05,
"loss": 0.3958,
"step": 775
},
{
"epoch": 0.5849705535924618,
"grad_norm": 6.051185131072998,
"learning_rate": 7.463689610685171e-05,
"loss": 0.4072,
"step": 776
},
{
"epoch": 0.5857243816254417,
"grad_norm": 6.234012126922607,
"learning_rate": 7.440622696532775e-05,
"loss": 0.4151,
"step": 777
},
{
"epoch": 0.5864782096584217,
"grad_norm": 6.273362636566162,
"learning_rate": 7.417570345764481e-05,
"loss": 0.418,
"step": 778
},
{
"epoch": 0.5872320376914016,
"grad_norm": 6.810718059539795,
"learning_rate": 7.394532689552905e-05,
"loss": 0.4082,
"step": 779
},
{
"epoch": 0.5879858657243816,
"grad_norm": 7.068334102630615,
"learning_rate": 7.371509858987061e-05,
"loss": 0.4031,
"step": 780
},
{
"epoch": 0.5887396937573616,
"grad_norm": 6.441345691680908,
"learning_rate": 7.348501985071603e-05,
"loss": 0.3973,
"step": 781
},
{
"epoch": 0.5894935217903415,
"grad_norm": 6.285884380340576,
"learning_rate": 7.325509198726064e-05,
"loss": 0.3888,
"step": 782
},
{
"epoch": 0.5902473498233216,
"grad_norm": 5.942330360412598,
"learning_rate": 7.302531630784137e-05,
"loss": 0.3656,
"step": 783
},
{
"epoch": 0.5910011778563016,
"grad_norm": 6.333634376525879,
"learning_rate": 7.279569411992926e-05,
"loss": 0.4081,
"step": 784
},
{
"epoch": 0.5917550058892815,
"grad_norm": 6.436288833618164,
"learning_rate": 7.256622673012175e-05,
"loss": 0.4118,
"step": 785
},
{
"epoch": 0.5925088339222615,
"grad_norm": 6.464933395385742,
"learning_rate": 7.233691544413558e-05,
"loss": 0.4269,
"step": 786
},
{
"epoch": 0.5932626619552415,
"grad_norm": 6.593018054962158,
"learning_rate": 7.210776156679931e-05,
"loss": 0.4124,
"step": 787
},
{
"epoch": 0.5940164899882214,
"grad_norm": 6.8628363609313965,
"learning_rate": 7.187876640204556e-05,
"loss": 0.4109,
"step": 788
},
{
"epoch": 0.5947703180212014,
"grad_norm": 7.0224151611328125,
"learning_rate": 7.164993125290407e-05,
"loss": 0.4141,
"step": 789
},
{
"epoch": 0.5955241460541814,
"grad_norm": 6.763969421386719,
"learning_rate": 7.1421257421494e-05,
"loss": 0.4093,
"step": 790
},
{
"epoch": 0.5962779740871613,
"grad_norm": 7.6155781745910645,
"learning_rate": 7.119274620901649e-05,
"loss": 0.413,
"step": 791
},
{
"epoch": 0.5970318021201414,
"grad_norm": 7.919892311096191,
"learning_rate": 7.096439891574745e-05,
"loss": 0.422,
"step": 792
},
{
"epoch": 0.5977856301531214,
"grad_norm": 9.18865966796875,
"learning_rate": 7.073621684103007e-05,
"loss": 0.4679,
"step": 793
},
{
"epoch": 0.5985394581861013,
"grad_norm": 8.299490928649902,
"learning_rate": 7.050820128326724e-05,
"loss": 0.4638,
"step": 794
},
{
"epoch": 0.5992932862190813,
"grad_norm": 9.120932579040527,
"learning_rate": 7.028035353991456e-05,
"loss": 0.451,
"step": 795
},
{
"epoch": 0.6000471142520613,
"grad_norm": 9.830779075622559,
"learning_rate": 7.005267490747263e-05,
"loss": 0.4778,
"step": 796
},
{
"epoch": 0.6008009422850412,
"grad_norm": 10.880460739135742,
"learning_rate": 6.982516668147967e-05,
"loss": 0.4404,
"step": 797
},
{
"epoch": 0.6015547703180212,
"grad_norm": 10.648106575012207,
"learning_rate": 6.959783015650446e-05,
"loss": 0.5199,
"step": 798
},
{
"epoch": 0.6023085983510011,
"grad_norm": 11.122642517089844,
"learning_rate": 6.937066662613863e-05,
"loss": 0.4476,
"step": 799
},
{
"epoch": 0.6030624263839811,
"grad_norm": 12.062220573425293,
"learning_rate": 6.914367738298941e-05,
"loss": 0.4763,
"step": 800
},
{
"epoch": 0.6038162544169612,
"grad_norm": 6.382950782775879,
"learning_rate": 6.891686371867239e-05,
"loss": 0.5237,
"step": 801
},
{
"epoch": 0.6045700824499411,
"grad_norm": 7.342101097106934,
"learning_rate": 6.869022692380411e-05,
"loss": 0.51,
"step": 802
},
{
"epoch": 0.6053239104829211,
"grad_norm": 7.170543670654297,
"learning_rate": 6.846376828799451e-05,
"loss": 0.4846,
"step": 803
},
{
"epoch": 0.6060777385159011,
"grad_norm": 6.772843360900879,
"learning_rate": 6.823748909983994e-05,
"loss": 0.4899,
"step": 804
},
{
"epoch": 0.606831566548881,
"grad_norm": 6.159712314605713,
"learning_rate": 6.801139064691562e-05,
"loss": 0.4651,
"step": 805
},
{
"epoch": 0.607585394581861,
"grad_norm": 6.47841739654541,
"learning_rate": 6.778547421576825e-05,
"loss": 0.4699,
"step": 806
},
{
"epoch": 0.608339222614841,
"grad_norm": 5.620822906494141,
"learning_rate": 6.75597410919089e-05,
"loss": 0.4317,
"step": 807
},
{
"epoch": 0.6090930506478209,
"grad_norm": 5.6669392585754395,
"learning_rate": 6.733419255980559e-05,
"loss": 0.4504,
"step": 808
},
{
"epoch": 0.6098468786808009,
"grad_norm": 5.989339828491211,
"learning_rate": 6.710882990287585e-05,
"loss": 0.4576,
"step": 809
},
{
"epoch": 0.610600706713781,
"grad_norm": 5.7165751457214355,
"learning_rate": 6.688365440347965e-05,
"loss": 0.4179,
"step": 810
},
{
"epoch": 0.6113545347467609,
"grad_norm": 6.0307087898254395,
"learning_rate": 6.665866734291205e-05,
"loss": 0.4815,
"step": 811
},
{
"epoch": 0.6121083627797409,
"grad_norm": 6.319530010223389,
"learning_rate": 6.643387000139565e-05,
"loss": 0.4407,
"step": 812
},
{
"epoch": 0.6128621908127209,
"grad_norm": 5.93934440612793,
"learning_rate": 6.620926365807372e-05,
"loss": 0.4081,
"step": 813
},
{
"epoch": 0.6136160188457008,
"grad_norm": 5.771956443786621,
"learning_rate": 6.598484959100257e-05,
"loss": 0.3936,
"step": 814
},
{
"epoch": 0.6143698468786808,
"grad_norm": 6.20790433883667,
"learning_rate": 6.576062907714448e-05,
"loss": 0.4513,
"step": 815
},
{
"epoch": 0.6151236749116608,
"grad_norm": 5.739172458648682,
"learning_rate": 6.553660339236041e-05,
"loss": 0.399,
"step": 816
},
{
"epoch": 0.6158775029446407,
"grad_norm": 6.355349540710449,
"learning_rate": 6.53127738114026e-05,
"loss": 0.4259,
"step": 817
},
{
"epoch": 0.6166313309776207,
"grad_norm": 5.847348213195801,
"learning_rate": 6.508914160790752e-05,
"loss": 0.4091,
"step": 818
},
{
"epoch": 0.6173851590106008,
"grad_norm": 5.917300224304199,
"learning_rate": 6.486570805438843e-05,
"loss": 0.4258,
"step": 819
},
{
"epoch": 0.6181389870435807,
"grad_norm": 6.199348449707031,
"learning_rate": 6.46424744222283e-05,
"loss": 0.4054,
"step": 820
},
{
"epoch": 0.6188928150765607,
"grad_norm": 6.075807571411133,
"learning_rate": 6.441944198167253e-05,
"loss": 0.4334,
"step": 821
},
{
"epoch": 0.6196466431095407,
"grad_norm": 5.835407257080078,
"learning_rate": 6.419661200182158e-05,
"loss": 0.4124,
"step": 822
},
{
"epoch": 0.6204004711425206,
"grad_norm": 6.856280326843262,
"learning_rate": 6.397398575062396e-05,
"loss": 0.4316,
"step": 823
},
{
"epoch": 0.6211542991755006,
"grad_norm": 6.388029098510742,
"learning_rate": 6.375156449486895e-05,
"loss": 0.4096,
"step": 824
},
{
"epoch": 0.6219081272084805,
"grad_norm": 6.334976673126221,
"learning_rate": 6.352934950017921e-05,
"loss": 0.4267,
"step": 825
},
{
"epoch": 0.6226619552414605,
"grad_norm": 6.394600868225098,
"learning_rate": 6.330734203100394e-05,
"loss": 0.4151,
"step": 826
},
{
"epoch": 0.6234157832744405,
"grad_norm": 6.139026165008545,
"learning_rate": 6.308554335061135e-05,
"loss": 0.4307,
"step": 827
},
{
"epoch": 0.6241696113074205,
"grad_norm": 6.6982102394104,
"learning_rate": 6.286395472108158e-05,
"loss": 0.4285,
"step": 828
},
{
"epoch": 0.6249234393404005,
"grad_norm": 5.852738857269287,
"learning_rate": 6.26425774032996e-05,
"loss": 0.3874,
"step": 829
},
{
"epoch": 0.6256772673733805,
"grad_norm": 6.24067497253418,
"learning_rate": 6.2421412656948e-05,
"loss": 0.3924,
"step": 830
},
{
"epoch": 0.6264310954063604,
"grad_norm": 6.479643821716309,
"learning_rate": 6.220046174049968e-05,
"loss": 0.4109,
"step": 831
},
{
"epoch": 0.6271849234393404,
"grad_norm": 6.55532169342041,
"learning_rate": 6.19797259112109e-05,
"loss": 0.4151,
"step": 832
},
{
"epoch": 0.6279387514723204,
"grad_norm": 5.995844841003418,
"learning_rate": 6.175920642511404e-05,
"loss": 0.3872,
"step": 833
},
{
"epoch": 0.6286925795053003,
"grad_norm": 6.913110256195068,
"learning_rate": 6.153890453701031e-05,
"loss": 0.4105,
"step": 834
},
{
"epoch": 0.6294464075382803,
"grad_norm": 6.36851692199707,
"learning_rate": 6.131882150046291e-05,
"loss": 0.4048,
"step": 835
},
{
"epoch": 0.6302002355712603,
"grad_norm": 5.844064712524414,
"learning_rate": 6.109895856778967e-05,
"loss": 0.3689,
"step": 836
},
{
"epoch": 0.6309540636042403,
"grad_norm": 7.132351398468018,
"learning_rate": 6.087931699005588e-05,
"loss": 0.4218,
"step": 837
},
{
"epoch": 0.6317078916372203,
"grad_norm": 6.560583114624023,
"learning_rate": 6.065989801706744e-05,
"loss": 0.4053,
"step": 838
},
{
"epoch": 0.6324617196702003,
"grad_norm": 6.6530351638793945,
"learning_rate": 6.044070289736352e-05,
"loss": 0.4061,
"step": 839
},
{
"epoch": 0.6332155477031802,
"grad_norm": 6.5088677406311035,
"learning_rate": 6.0221732878209425e-05,
"loss": 0.376,
"step": 840
},
{
"epoch": 0.6339693757361602,
"grad_norm": 6.723409175872803,
"learning_rate": 6.0002989205589734e-05,
"loss": 0.3978,
"step": 841
},
{
"epoch": 0.6347232037691402,
"grad_norm": 9.00965404510498,
"learning_rate": 5.978447312420103e-05,
"loss": 0.4661,
"step": 842
},
{
"epoch": 0.6354770318021201,
"grad_norm": 8.346488952636719,
"learning_rate": 5.9566185877444755e-05,
"loss": 0.4812,
"step": 843
},
{
"epoch": 0.6362308598351001,
"grad_norm": 9.07754135131836,
"learning_rate": 5.934812870742036e-05,
"loss": 0.5042,
"step": 844
},
{
"epoch": 0.63698468786808,
"grad_norm": 9.425755500793457,
"learning_rate": 5.913030285491808e-05,
"loss": 0.5273,
"step": 845
},
{
"epoch": 0.6377385159010601,
"grad_norm": 8.991804122924805,
"learning_rate": 5.891270955941184e-05,
"loss": 0.4724,
"step": 846
},
{
"epoch": 0.6384923439340401,
"grad_norm": 9.069438934326172,
"learning_rate": 5.869535005905232e-05,
"loss": 0.4694,
"step": 847
},
{
"epoch": 0.63924617196702,
"grad_norm": 9.837794303894043,
"learning_rate": 5.847822559065992e-05,
"loss": 0.4601,
"step": 848
},
{
"epoch": 0.64,
"grad_norm": 10.19363021850586,
"learning_rate": 5.8261337389717506e-05,
"loss": 0.4776,
"step": 849
},
{
"epoch": 0.64075382803298,
"grad_norm": 11.673394203186035,
"learning_rate": 5.804468669036369e-05,
"loss": 0.4425,
"step": 850
},
{
"epoch": 0.6415076560659599,
"grad_norm": 6.468347072601318,
"learning_rate": 5.7828274725385544e-05,
"loss": 0.5469,
"step": 851
},
{
"epoch": 0.6422614840989399,
"grad_norm": 7.060529708862305,
"learning_rate": 5.761210272621175e-05,
"loss": 0.5067,
"step": 852
},
{
"epoch": 0.6430153121319199,
"grad_norm": 7.569014072418213,
"learning_rate": 5.739617192290545e-05,
"loss": 0.5057,
"step": 853
},
{
"epoch": 0.6437691401648998,
"grad_norm": 7.41010046005249,
"learning_rate": 5.7180483544157546e-05,
"loss": 0.4897,
"step": 854
},
{
"epoch": 0.6445229681978799,
"grad_norm": 6.627238750457764,
"learning_rate": 5.696503881727917e-05,
"loss": 0.5036,
"step": 855
},
{
"epoch": 0.6452767962308599,
"grad_norm": 6.318825721740723,
"learning_rate": 5.6749838968195326e-05,
"loss": 0.4619,
"step": 856
},
{
"epoch": 0.6460306242638398,
"grad_norm": 5.585279941558838,
"learning_rate": 5.653488522143744e-05,
"loss": 0.4331,
"step": 857
},
{
"epoch": 0.6467844522968198,
"grad_norm": 5.902019500732422,
"learning_rate": 5.6320178800136626e-05,
"loss": 0.4596,
"step": 858
},
{
"epoch": 0.6475382803297998,
"grad_norm": 5.5325164794921875,
"learning_rate": 5.610572092601659e-05,
"loss": 0.4362,
"step": 859
},
{
"epoch": 0.6482921083627797,
"grad_norm": 5.381384372711182,
"learning_rate": 5.589151281938695e-05,
"loss": 0.4294,
"step": 860
},
{
"epoch": 0.6490459363957597,
"grad_norm": 6.080218315124512,
"learning_rate": 5.56775556991358e-05,
"loss": 0.4304,
"step": 861
},
{
"epoch": 0.6497997644287397,
"grad_norm": 5.510005950927734,
"learning_rate": 5.5463850782723346e-05,
"loss": 0.4157,
"step": 862
},
{
"epoch": 0.6505535924617196,
"grad_norm": 5.572638511657715,
"learning_rate": 5.5250399286174546e-05,
"loss": 0.4238,
"step": 863
},
{
"epoch": 0.6513074204946997,
"grad_norm": 5.32048225402832,
"learning_rate": 5.50372024240724e-05,
"loss": 0.3929,
"step": 864
},
{
"epoch": 0.6520612485276797,
"grad_norm": 5.80560827255249,
"learning_rate": 5.48242614095509e-05,
"loss": 0.4251,
"step": 865
},
{
"epoch": 0.6528150765606596,
"grad_norm": 5.714180946350098,
"learning_rate": 5.461157745428841e-05,
"loss": 0.4318,
"step": 866
},
{
"epoch": 0.6535689045936396,
"grad_norm": 5.553015232086182,
"learning_rate": 5.439915176850037e-05,
"loss": 0.3996,
"step": 867
},
{
"epoch": 0.6543227326266196,
"grad_norm": 5.774811744689941,
"learning_rate": 5.418698556093271e-05,
"loss": 0.4298,
"step": 868
},
{
"epoch": 0.6550765606595995,
"grad_norm": 5.804990291595459,
"learning_rate": 5.397508003885483e-05,
"loss": 0.4119,
"step": 869
},
{
"epoch": 0.6558303886925795,
"grad_norm": 5.6263556480407715,
"learning_rate": 5.3763436408052904e-05,
"loss": 0.394,
"step": 870
},
{
"epoch": 0.6565842167255594,
"grad_norm": 5.699732303619385,
"learning_rate": 5.3552055872822636e-05,
"loss": 0.4152,
"step": 871
},
{
"epoch": 0.6573380447585394,
"grad_norm": 5.353825569152832,
"learning_rate": 5.334093963596294e-05,
"loss": 0.3798,
"step": 872
},
{
"epoch": 0.6580918727915195,
"grad_norm": 5.929776668548584,
"learning_rate": 5.313008889876865e-05,
"loss": 0.4142,
"step": 873
},
{
"epoch": 0.6588457008244994,
"grad_norm": 6.101897716522217,
"learning_rate": 5.2919504861023903e-05,
"loss": 0.4396,
"step": 874
},
{
"epoch": 0.6595995288574794,
"grad_norm": 6.041595458984375,
"learning_rate": 5.270918872099522e-05,
"loss": 0.4455,
"step": 875
},
{
"epoch": 0.6603533568904594,
"grad_norm": 5.795607566833496,
"learning_rate": 5.249914167542486e-05,
"loss": 0.3927,
"step": 876
},
{
"epoch": 0.6611071849234393,
"grad_norm": 6.169924259185791,
"learning_rate": 5.228936491952363e-05,
"loss": 0.4022,
"step": 877
},
{
"epoch": 0.6618610129564193,
"grad_norm": 5.870789527893066,
"learning_rate": 5.207985964696462e-05,
"loss": 0.4012,
"step": 878
},
{
"epoch": 0.6626148409893993,
"grad_norm": 6.345909595489502,
"learning_rate": 5.1870627049875954e-05,
"loss": 0.3814,
"step": 879
},
{
"epoch": 0.6633686690223792,
"grad_norm": 6.1364569664001465,
"learning_rate": 5.16616683188342e-05,
"loss": 0.4032,
"step": 880
},
{
"epoch": 0.6641224970553592,
"grad_norm": 5.976447582244873,
"learning_rate": 5.145298464285757e-05,
"loss": 0.3814,
"step": 881
},
{
"epoch": 0.6648763250883393,
"grad_norm": 7.229459285736084,
"learning_rate": 5.12445772093992e-05,
"loss": 0.4171,
"step": 882
},
{
"epoch": 0.6656301531213192,
"grad_norm": 5.863222599029541,
"learning_rate": 5.103644720434027e-05,
"loss": 0.3782,
"step": 883
},
{
"epoch": 0.6663839811542992,
"grad_norm": 6.049070835113525,
"learning_rate": 5.082859581198344e-05,
"loss": 0.3789,
"step": 884
},
{
"epoch": 0.6671378091872792,
"grad_norm": 6.35960578918457,
"learning_rate": 5.062102421504593e-05,
"loss": 0.4086,
"step": 885
},
{
"epoch": 0.6678916372202591,
"grad_norm": 6.470669746398926,
"learning_rate": 5.041373359465289e-05,
"loss": 0.4076,
"step": 886
},
{
"epoch": 0.6686454652532391,
"grad_norm": 6.241630554199219,
"learning_rate": 5.020672513033066e-05,
"loss": 0.4007,
"step": 887
},
{
"epoch": 0.669399293286219,
"grad_norm": 6.308516502380371,
"learning_rate": 5.000000000000002e-05,
"loss": 0.3754,
"step": 888
},
{
"epoch": 0.670153121319199,
"grad_norm": 6.356692314147949,
"learning_rate": 4.9793559379969566e-05,
"loss": 0.3973,
"step": 889
},
{
"epoch": 0.670906949352179,
"grad_norm": 7.087871074676514,
"learning_rate": 4.958740444492892e-05,
"loss": 0.4128,
"step": 890
},
{
"epoch": 0.6716607773851591,
"grad_norm": 7.447615623474121,
"learning_rate": 4.9381536367942195e-05,
"loss": 0.4111,
"step": 891
},
{
"epoch": 0.672414605418139,
"grad_norm": 7.260590076446533,
"learning_rate": 4.917595632044113e-05,
"loss": 0.3799,
"step": 892
},
{
"epoch": 0.673168433451119,
"grad_norm": 7.701971530914307,
"learning_rate": 4.8970665472218537e-05,
"loss": 0.4017,
"step": 893
},
{
"epoch": 0.673922261484099,
"grad_norm": 8.021989822387695,
"learning_rate": 4.8765664991421634e-05,
"loss": 0.4536,
"step": 894
},
{
"epoch": 0.6746760895170789,
"grad_norm": 8.987250328063965,
"learning_rate": 4.856095604454539e-05,
"loss": 0.4939,
"step": 895
},
{
"epoch": 0.6754299175500589,
"grad_norm": 10.436625480651855,
"learning_rate": 4.835653979642585e-05,
"loss": 0.5239,
"step": 896
},
{
"epoch": 0.6761837455830388,
"grad_norm": 9.789538383483887,
"learning_rate": 4.815241741023367e-05,
"loss": 0.4798,
"step": 897
},
{
"epoch": 0.6769375736160188,
"grad_norm": 9.678764343261719,
"learning_rate": 4.7948590047467153e-05,
"loss": 0.4441,
"step": 898
},
{
"epoch": 0.6776914016489988,
"grad_norm": 10.444610595703125,
"learning_rate": 4.774505886794609e-05,
"loss": 0.4201,
"step": 899
},
{
"epoch": 0.6784452296819788,
"grad_norm": 12.58081340789795,
"learning_rate": 4.754182502980477e-05,
"loss": 0.4634,
"step": 900
},
{
"epoch": 0.6791990577149588,
"grad_norm": 5.85378885269165,
"learning_rate": 4.7338889689485624e-05,
"loss": 0.5182,
"step": 901
},
{
"epoch": 0.6799528857479388,
"grad_norm": 6.6499857902526855,
"learning_rate": 4.713625400173247e-05,
"loss": 0.5216,
"step": 902
},
{
"epoch": 0.6807067137809187,
"grad_norm": 6.543797016143799,
"learning_rate": 4.693391911958426e-05,
"loss": 0.4798,
"step": 903
},
{
"epoch": 0.6814605418138987,
"grad_norm": 6.197330951690674,
"learning_rate": 4.673188619436798e-05,
"loss": 0.4892,
"step": 904
},
{
"epoch": 0.6822143698468787,
"grad_norm": 6.185276031494141,
"learning_rate": 4.6530156375692726e-05,
"loss": 0.474,
"step": 905
},
{
"epoch": 0.6829681978798586,
"grad_norm": 5.581246376037598,
"learning_rate": 4.632873081144267e-05,
"loss": 0.4498,
"step": 906
},
{
"epoch": 0.6837220259128386,
"grad_norm": 5.916640281677246,
"learning_rate": 4.6127610647770767e-05,
"loss": 0.4619,
"step": 907
},
{
"epoch": 0.6844758539458186,
"grad_norm": 5.591888904571533,
"learning_rate": 4.592679702909216e-05,
"loss": 0.4275,
"step": 908
},
{
"epoch": 0.6852296819787986,
"grad_norm": 5.287500858306885,
"learning_rate": 4.572629109807782e-05,
"loss": 0.4073,
"step": 909
},
{
"epoch": 0.6859835100117786,
"grad_norm": 5.325054168701172,
"learning_rate": 4.552609399564762e-05,
"loss": 0.3894,
"step": 910
},
{
"epoch": 0.6867373380447586,
"grad_norm": 5.576198101043701,
"learning_rate": 4.532620686096446e-05,
"loss": 0.4185,
"step": 911
},
{
"epoch": 0.6874911660777385,
"grad_norm": 5.555250644683838,
"learning_rate": 4.5126630831427264e-05,
"loss": 0.3818,
"step": 912
},
{
"epoch": 0.6882449941107185,
"grad_norm": 5.309383869171143,
"learning_rate": 4.492736704266475e-05,
"loss": 0.3835,
"step": 913
},
{
"epoch": 0.6889988221436985,
"grad_norm": 5.426351547241211,
"learning_rate": 4.472841662852888e-05,
"loss": 0.4087,
"step": 914
},
{
"epoch": 0.6897526501766784,
"grad_norm": 5.882096767425537,
"learning_rate": 4.452978072108859e-05,
"loss": 0.4398,
"step": 915
},
{
"epoch": 0.6905064782096584,
"grad_norm": 5.80626916885376,
"learning_rate": 4.4331460450623064e-05,
"loss": 0.4234,
"step": 916
},
{
"epoch": 0.6912603062426383,
"grad_norm": 5.8705291748046875,
"learning_rate": 4.413345694561549e-05,
"loss": 0.4223,
"step": 917
},
{
"epoch": 0.6920141342756184,
"grad_norm": 5.822587966918945,
"learning_rate": 4.393577133274658e-05,
"loss": 0.4314,
"step": 918
},
{
"epoch": 0.6927679623085984,
"grad_norm": 6.2686872482299805,
"learning_rate": 4.373840473688829e-05,
"loss": 0.459,
"step": 919
},
{
"epoch": 0.6935217903415783,
"grad_norm": 5.543201923370361,
"learning_rate": 4.354135828109707e-05,
"loss": 0.3963,
"step": 920
},
{
"epoch": 0.6942756183745583,
"grad_norm": 5.7019267082214355,
"learning_rate": 4.3344633086607955e-05,
"loss": 0.3964,
"step": 921
},
{
"epoch": 0.6950294464075383,
"grad_norm": 5.6861958503723145,
"learning_rate": 4.3148230272827784e-05,
"loss": 0.4175,
"step": 922
},
{
"epoch": 0.6957832744405182,
"grad_norm": 5.791751384735107,
"learning_rate": 4.295215095732904e-05,
"loss": 0.4196,
"step": 923
},
{
"epoch": 0.6965371024734982,
"grad_norm": 6.20761251449585,
"learning_rate": 4.275639625584338e-05,
"loss": 0.4159,
"step": 924
},
{
"epoch": 0.6972909305064782,
"grad_norm": 6.440983772277832,
"learning_rate": 4.256096728225548e-05,
"loss": 0.418,
"step": 925
},
{
"epoch": 0.6980447585394581,
"grad_norm": 5.713172435760498,
"learning_rate": 4.236586514859633e-05,
"loss": 0.4084,
"step": 926
},
{
"epoch": 0.6987985865724382,
"grad_norm": 5.674785137176514,
"learning_rate": 4.217109096503736e-05,
"loss": 0.3978,
"step": 927
},
{
"epoch": 0.6995524146054182,
"grad_norm": 6.123269081115723,
"learning_rate": 4.197664583988376e-05,
"loss": 0.421,
"step": 928
},
{
"epoch": 0.7003062426383981,
"grad_norm": 5.961802959442139,
"learning_rate": 4.1782530879568374e-05,
"loss": 0.4027,
"step": 929
},
{
"epoch": 0.7010600706713781,
"grad_norm": 6.020455360412598,
"learning_rate": 4.1588747188645275e-05,
"loss": 0.3978,
"step": 930
},
{
"epoch": 0.7018138987043581,
"grad_norm": 5.788726329803467,
"learning_rate": 4.1395295869783615e-05,
"loss": 0.3744,
"step": 931
},
{
"epoch": 0.702567726737338,
"grad_norm": 6.581162929534912,
"learning_rate": 4.1202178023761195e-05,
"loss": 0.4003,
"step": 932
},
{
"epoch": 0.703321554770318,
"grad_norm": 5.601202011108398,
"learning_rate": 4.100939474945843e-05,
"loss": 0.37,
"step": 933
},
{
"epoch": 0.704075382803298,
"grad_norm": 6.49223518371582,
"learning_rate": 4.0816947143851816e-05,
"loss": 0.4088,
"step": 934
},
{
"epoch": 0.7048292108362779,
"grad_norm": 6.10722541809082,
"learning_rate": 4.0624836302007886e-05,
"loss": 0.3835,
"step": 935
},
{
"epoch": 0.705583038869258,
"grad_norm": 6.136714935302734,
"learning_rate": 4.0433063317076893e-05,
"loss": 0.4056,
"step": 936
},
{
"epoch": 0.706336866902238,
"grad_norm": 6.344220161437988,
"learning_rate": 4.024162928028663e-05,
"loss": 0.386,
"step": 937
},
{
"epoch": 0.7070906949352179,
"grad_norm": 7.188864231109619,
"learning_rate": 4.0050535280936205e-05,
"loss": 0.3849,
"step": 938
},
{
"epoch": 0.7078445229681979,
"grad_norm": 6.800889492034912,
"learning_rate": 3.985978240638981e-05,
"loss": 0.3989,
"step": 939
},
{
"epoch": 0.7085983510011779,
"grad_norm": 7.130059242248535,
"learning_rate": 3.966937174207066e-05,
"loss": 0.3821,
"step": 940
},
{
"epoch": 0.7093521790341578,
"grad_norm": 6.849576473236084,
"learning_rate": 3.947930437145464e-05,
"loss": 0.3843,
"step": 941
},
{
"epoch": 0.7101060070671378,
"grad_norm": 7.004662036895752,
"learning_rate": 3.928958137606421e-05,
"loss": 0.3686,
"step": 942
},
{
"epoch": 0.7108598351001177,
"grad_norm": 8.136757850646973,
"learning_rate": 3.910020383546233e-05,
"loss": 0.4558,
"step": 943
},
{
"epoch": 0.7116136631330977,
"grad_norm": 8.616293907165527,
"learning_rate": 3.8911172827246215e-05,
"loss": 0.4368,
"step": 944
},
{
"epoch": 0.7123674911660778,
"grad_norm": 8.701359748840332,
"learning_rate": 3.8722489427041185e-05,
"loss": 0.4512,
"step": 945
},
{
"epoch": 0.7131213191990577,
"grad_norm": 9.437173843383789,
"learning_rate": 3.853415470849479e-05,
"loss": 0.481,
"step": 946
},
{
"epoch": 0.7138751472320377,
"grad_norm": 10.383941650390625,
"learning_rate": 3.834616974327021e-05,
"loss": 0.5005,
"step": 947
},
{
"epoch": 0.7146289752650177,
"grad_norm": 9.366165161132812,
"learning_rate": 3.815853560104075e-05,
"loss": 0.4548,
"step": 948
},
{
"epoch": 0.7153828032979976,
"grad_norm": 9.855792999267578,
"learning_rate": 3.7971253349483285e-05,
"loss": 0.4908,
"step": 949
},
{
"epoch": 0.7161366313309776,
"grad_norm": 11.261048316955566,
"learning_rate": 3.7784324054272405e-05,
"loss": 0.4601,
"step": 950
},
{
"epoch": 0.7168904593639576,
"grad_norm": 5.492030143737793,
"learning_rate": 3.759774877907428e-05,
"loss": 0.5291,
"step": 951
},
{
"epoch": 0.7176442873969375,
"grad_norm": 6.00732421875,
"learning_rate": 3.741152858554077e-05,
"loss": 0.5058,
"step": 952
},
{
"epoch": 0.7183981154299176,
"grad_norm": 5.992036819458008,
"learning_rate": 3.722566453330298e-05,
"loss": 0.5028,
"step": 953
},
{
"epoch": 0.7191519434628976,
"grad_norm": 5.949222564697266,
"learning_rate": 3.7040157679965796e-05,
"loss": 0.4631,
"step": 954
},
{
"epoch": 0.7199057714958775,
"grad_norm": 5.833024978637695,
"learning_rate": 3.6855009081101355e-05,
"loss": 0.449,
"step": 955
},
{
"epoch": 0.7206595995288575,
"grad_norm": 5.746013641357422,
"learning_rate": 3.6670219790243344e-05,
"loss": 0.4442,
"step": 956
},
{
"epoch": 0.7214134275618375,
"grad_norm": 5.595402240753174,
"learning_rate": 3.648579085888085e-05,
"loss": 0.4353,
"step": 957
},
{
"epoch": 0.7221672555948174,
"grad_norm": 5.437952995300293,
"learning_rate": 3.630172333645261e-05,
"loss": 0.434,
"step": 958
},
{
"epoch": 0.7229210836277974,
"grad_norm": 5.620044231414795,
"learning_rate": 3.611801827034059e-05,
"loss": 0.4137,
"step": 959
},
{
"epoch": 0.7236749116607774,
"grad_norm": 5.448288440704346,
"learning_rate": 3.593467670586457e-05,
"loss": 0.4197,
"step": 960
},
{
"epoch": 0.7244287396937573,
"grad_norm": 5.672021389007568,
"learning_rate": 3.5751699686275786e-05,
"loss": 0.4495,
"step": 961
},
{
"epoch": 0.7251825677267374,
"grad_norm": 5.292520046234131,
"learning_rate": 3.556908825275117e-05,
"loss": 0.4203,
"step": 962
},
{
"epoch": 0.7259363957597174,
"grad_norm": 5.522578239440918,
"learning_rate": 3.538684344438736e-05,
"loss": 0.4043,
"step": 963
},
{
"epoch": 0.7266902237926973,
"grad_norm": 5.811888694763184,
"learning_rate": 3.520496629819494e-05,
"loss": 0.4239,
"step": 964
},
{
"epoch": 0.7274440518256773,
"grad_norm": 5.410277366638184,
"learning_rate": 3.502345784909229e-05,
"loss": 0.4163,
"step": 965
},
{
"epoch": 0.7281978798586572,
"grad_norm": 5.810190677642822,
"learning_rate": 3.484231912989989e-05,
"loss": 0.4323,
"step": 966
},
{
"epoch": 0.7289517078916372,
"grad_norm": 5.343920707702637,
"learning_rate": 3.466155117133433e-05,
"loss": 0.4153,
"step": 967
},
{
"epoch": 0.7297055359246172,
"grad_norm": 5.489987373352051,
"learning_rate": 3.448115500200263e-05,
"loss": 0.3828,
"step": 968
},
{
"epoch": 0.7304593639575971,
"grad_norm": 5.753129005432129,
"learning_rate": 3.430113164839601e-05,
"loss": 0.4047,
"step": 969
},
{
"epoch": 0.7312131919905771,
"grad_norm": 5.8478569984436035,
"learning_rate": 3.4121482134884575e-05,
"loss": 0.4231,
"step": 970
},
{
"epoch": 0.7319670200235572,
"grad_norm": 6.3078413009643555,
"learning_rate": 3.3942207483710986e-05,
"loss": 0.3913,
"step": 971
},
{
"epoch": 0.7327208480565371,
"grad_norm": 5.719088077545166,
"learning_rate": 3.3763308714984974e-05,
"loss": 0.4149,
"step": 972
},
{
"epoch": 0.7334746760895171,
"grad_norm": 5.784895420074463,
"learning_rate": 3.358478684667734e-05,
"loss": 0.3997,
"step": 973
},
{
"epoch": 0.7342285041224971,
"grad_norm": 5.888166427612305,
"learning_rate": 3.3406642894614394e-05,
"loss": 0.4064,
"step": 974
},
{
"epoch": 0.734982332155477,
"grad_norm": 6.573143482208252,
"learning_rate": 3.3228877872471786e-05,
"loss": 0.4188,
"step": 975
},
{
"epoch": 0.735736160188457,
"grad_norm": 5.861452102661133,
"learning_rate": 3.305149279176921e-05,
"loss": 0.3993,
"step": 976
},
{
"epoch": 0.736489988221437,
"grad_norm": 5.746969223022461,
"learning_rate": 3.287448866186428e-05,
"loss": 0.4014,
"step": 977
},
{
"epoch": 0.7372438162544169,
"grad_norm": 5.95499849319458,
"learning_rate": 3.269786648994697e-05,
"loss": 0.4129,
"step": 978
},
{
"epoch": 0.7379976442873969,
"grad_norm": 5.868785858154297,
"learning_rate": 3.252162728103382e-05,
"loss": 0.4006,
"step": 979
},
{
"epoch": 0.738751472320377,
"grad_norm": 6.216129779815674,
"learning_rate": 3.234577203796223e-05,
"loss": 0.4097,
"step": 980
},
{
"epoch": 0.7395053003533569,
"grad_norm": 5.94473934173584,
"learning_rate": 3.217030176138474e-05,
"loss": 0.3947,
"step": 981
},
{
"epoch": 0.7402591283863369,
"grad_norm": 5.822911262512207,
"learning_rate": 3.199521744976342e-05,
"loss": 0.3838,
"step": 982
},
{
"epoch": 0.7410129564193169,
"grad_norm": 5.968900203704834,
"learning_rate": 3.182052009936404e-05,
"loss": 0.3945,
"step": 983
},
{
"epoch": 0.7417667844522968,
"grad_norm": 6.497354984283447,
"learning_rate": 3.164621070425051e-05,
"loss": 0.4138,
"step": 984
},
{
"epoch": 0.7425206124852768,
"grad_norm": 6.382023334503174,
"learning_rate": 3.147229025627922e-05,
"loss": 0.37,
"step": 985
},
{
"epoch": 0.7432744405182568,
"grad_norm": 6.162110328674316,
"learning_rate": 3.129875974509332e-05,
"loss": 0.3743,
"step": 986
},
{
"epoch": 0.7440282685512367,
"grad_norm": 6.0412116050720215,
"learning_rate": 3.1125620158117186e-05,
"loss": 0.3714,
"step": 987
},
{
"epoch": 0.7447820965842167,
"grad_norm": 6.072629451751709,
"learning_rate": 3.095287248055069e-05,
"loss": 0.369,
"step": 988
},
{
"epoch": 0.7455359246171968,
"grad_norm": 6.4712958335876465,
"learning_rate": 3.078051769536378e-05,
"loss": 0.3956,
"step": 989
},
{
"epoch": 0.7462897526501767,
"grad_norm": 6.292232036590576,
"learning_rate": 3.060855678329063e-05,
"loss": 0.3755,
"step": 990
},
{
"epoch": 0.7470435806831567,
"grad_norm": 6.797161102294922,
"learning_rate": 3.043699072282429e-05,
"loss": 0.3941,
"step": 991
},
{
"epoch": 0.7477974087161366,
"grad_norm": 7.063961029052734,
"learning_rate": 3.0265820490210973e-05,
"loss": 0.4085,
"step": 992
},
{
"epoch": 0.7485512367491166,
"grad_norm": 8.036771774291992,
"learning_rate": 3.0095047059444546e-05,
"loss": 0.4553,
"step": 993
},
{
"epoch": 0.7493050647820966,
"grad_norm": 8.343942642211914,
"learning_rate": 2.9924671402261018e-05,
"loss": 0.4532,
"step": 994
},
{
"epoch": 0.7500588928150765,
"grad_norm": 8.597431182861328,
"learning_rate": 2.9754694488133038e-05,
"loss": 0.4544,
"step": 995
},
{
"epoch": 0.7508127208480565,
"grad_norm": 8.797038078308105,
"learning_rate": 2.958511728426414e-05,
"loss": 0.4565,
"step": 996
},
{
"epoch": 0.7508127208480565,
"eval_loss": 0.42347389459609985,
"eval_runtime": 127.0592,
"eval_samples_per_second": 17.59,
"eval_steps_per_second": 8.799,
"step": 996
},
{
"epoch": 0.7515665488810365,
"grad_norm": 9.90727710723877,
"learning_rate": 2.941594075558366e-05,
"loss": 0.4791,
"step": 997
},
{
"epoch": 0.7523203769140165,
"grad_norm": 9.148994445800781,
"learning_rate": 2.9247165864740856e-05,
"loss": 0.4488,
"step": 998
},
{
"epoch": 0.7530742049469965,
"grad_norm": 10.751917839050293,
"learning_rate": 2.9078793572099616e-05,
"loss": 0.4695,
"step": 999
},
{
"epoch": 0.7538280329799765,
"grad_norm": 12.66123104095459,
"learning_rate": 2.8910824835732952e-05,
"loss": 0.4773,
"step": 1000
},
{
"epoch": 0.7545818610129564,
"grad_norm": 5.507136821746826,
"learning_rate": 2.8743260611417665e-05,
"loss": 0.5073,
"step": 1001
},
{
"epoch": 0.7553356890459364,
"grad_norm": 5.805990695953369,
"learning_rate": 2.857610185262859e-05,
"loss": 0.4735,
"step": 1002
},
{
"epoch": 0.7560895170789164,
"grad_norm": 5.612555980682373,
"learning_rate": 2.8409349510533578e-05,
"loss": 0.4536,
"step": 1003
},
{
"epoch": 0.7568433451118963,
"grad_norm": 5.850246906280518,
"learning_rate": 2.8243004533987793e-05,
"loss": 0.4578,
"step": 1004
},
{
"epoch": 0.7575971731448763,
"grad_norm": 5.569720268249512,
"learning_rate": 2.8077067869528417e-05,
"loss": 0.4135,
"step": 1005
},
{
"epoch": 0.7583510011778563,
"grad_norm": 5.9112114906311035,
"learning_rate": 2.7911540461369222e-05,
"loss": 0.4445,
"step": 1006
},
{
"epoch": 0.7591048292108363,
"grad_norm": 5.9236249923706055,
"learning_rate": 2.774642325139535e-05,
"loss": 0.4402,
"step": 1007
},
{
"epoch": 0.7598586572438163,
"grad_norm": 6.210232257843018,
"learning_rate": 2.7581717179157606e-05,
"loss": 0.4605,
"step": 1008
},
{
"epoch": 0.7606124852767963,
"grad_norm": 5.880030155181885,
"learning_rate": 2.7417423181867585e-05,
"loss": 0.4227,
"step": 1009
},
{
"epoch": 0.7613663133097762,
"grad_norm": 5.549881458282471,
"learning_rate": 2.72535421943919e-05,
"loss": 0.4168,
"step": 1010
},
{
"epoch": 0.7621201413427562,
"grad_norm": 5.586158275604248,
"learning_rate": 2.7090075149247217e-05,
"loss": 0.4334,
"step": 1011
},
{
"epoch": 0.7628739693757361,
"grad_norm": 5.5952348709106445,
"learning_rate": 2.6927022976594607e-05,
"loss": 0.4232,
"step": 1012
},
{
"epoch": 0.7636277974087161,
"grad_norm": 5.478029727935791,
"learning_rate": 2.676438660423457e-05,
"loss": 0.4053,
"step": 1013
},
{
"epoch": 0.7643816254416961,
"grad_norm": 5.441522121429443,
"learning_rate": 2.660216695760157e-05,
"loss": 0.3847,
"step": 1014
},
{
"epoch": 0.765135453474676,
"grad_norm": 5.584785461425781,
"learning_rate": 2.6440364959758813e-05,
"loss": 0.4098,
"step": 1015
},
{
"epoch": 0.7658892815076561,
"grad_norm": 5.545854091644287,
"learning_rate": 2.6278981531392945e-05,
"loss": 0.4002,
"step": 1016
},
{
"epoch": 0.7666431095406361,
"grad_norm": 5.697778701782227,
"learning_rate": 2.6118017590809017e-05,
"loss": 0.4013,
"step": 1017
},
{
"epoch": 0.767396937573616,
"grad_norm": 6.265735626220703,
"learning_rate": 2.595747405392491e-05,
"loss": 0.4102,
"step": 1018
},
{
"epoch": 0.768150765606596,
"grad_norm": 5.284882545471191,
"learning_rate": 2.579735183426649e-05,
"loss": 0.3747,
"step": 1019
},
{
"epoch": 0.768904593639576,
"grad_norm": 5.939345359802246,
"learning_rate": 2.5637651842962164e-05,
"loss": 0.4019,
"step": 1020
},
{
"epoch": 0.7696584216725559,
"grad_norm": 5.655182838439941,
"learning_rate": 2.5478374988737753e-05,
"loss": 0.4038,
"step": 1021
},
{
"epoch": 0.7704122497055359,
"grad_norm": 5.510229587554932,
"learning_rate": 2.531952217791136e-05,
"loss": 0.3912,
"step": 1022
},
{
"epoch": 0.7711660777385159,
"grad_norm": 5.720643997192383,
"learning_rate": 2.5161094314388278e-05,
"loss": 0.3995,
"step": 1023
},
{
"epoch": 0.7719199057714958,
"grad_norm": 5.860435962677002,
"learning_rate": 2.5003092299655584e-05,
"loss": 0.3995,
"step": 1024
},
{
"epoch": 0.7726737338044759,
"grad_norm": 6.223293304443359,
"learning_rate": 2.4845517032777364e-05,
"loss": 0.4424,
"step": 1025
},
{
"epoch": 0.7734275618374559,
"grad_norm": 6.027644157409668,
"learning_rate": 2.4688369410389334e-05,
"loss": 0.4299,
"step": 1026
},
{
"epoch": 0.7741813898704358,
"grad_norm": 5.946674346923828,
"learning_rate": 2.4531650326693822e-05,
"loss": 0.3849,
"step": 1027
},
{
"epoch": 0.7749352179034158,
"grad_norm": 6.277134895324707,
"learning_rate": 2.4375360673454718e-05,
"loss": 0.4147,
"step": 1028
},
{
"epoch": 0.7756890459363958,
"grad_norm": 6.024038314819336,
"learning_rate": 2.4219501339992334e-05,
"loss": 0.3774,
"step": 1029
},
{
"epoch": 0.7764428739693757,
"grad_norm": 5.8574910163879395,
"learning_rate": 2.406407321317835e-05,
"loss": 0.3865,
"step": 1030
},
{
"epoch": 0.7771967020023557,
"grad_norm": 6.022578239440918,
"learning_rate": 2.3909077177430893e-05,
"loss": 0.3957,
"step": 1031
},
{
"epoch": 0.7779505300353357,
"grad_norm": 5.923416614532471,
"learning_rate": 2.3754514114709304e-05,
"loss": 0.3836,
"step": 1032
},
{
"epoch": 0.7787043580683156,
"grad_norm": 6.270270824432373,
"learning_rate": 2.3600384904509254e-05,
"loss": 0.3979,
"step": 1033
},
{
"epoch": 0.7794581861012957,
"grad_norm": 6.285928726196289,
"learning_rate": 2.3446690423857685e-05,
"loss": 0.4098,
"step": 1034
},
{
"epoch": 0.7802120141342757,
"grad_norm": 6.104770660400391,
"learning_rate": 2.3293431547307887e-05,
"loss": 0.3746,
"step": 1035
},
{
"epoch": 0.7809658421672556,
"grad_norm": 6.284374237060547,
"learning_rate": 2.31406091469344e-05,
"loss": 0.3933,
"step": 1036
},
{
"epoch": 0.7817196702002356,
"grad_norm": 6.502585411071777,
"learning_rate": 2.298822409232817e-05,
"loss": 0.3964,
"step": 1037
},
{
"epoch": 0.7824734982332155,
"grad_norm": 6.121708869934082,
"learning_rate": 2.2836277250591574e-05,
"loss": 0.3822,
"step": 1038
},
{
"epoch": 0.7832273262661955,
"grad_norm": 7.069113731384277,
"learning_rate": 2.2684769486333445e-05,
"loss": 0.3919,
"step": 1039
},
{
"epoch": 0.7839811542991755,
"grad_norm": 6.825623035430908,
"learning_rate": 2.2533701661664154e-05,
"loss": 0.4296,
"step": 1040
},
{
"epoch": 0.7847349823321554,
"grad_norm": 7.632999897003174,
"learning_rate": 2.2383074636190748e-05,
"loss": 0.4266,
"step": 1041
},
{
"epoch": 0.7854888103651354,
"grad_norm": 7.41874885559082,
"learning_rate": 2.2232889267012038e-05,
"loss": 0.4263,
"step": 1042
},
{
"epoch": 0.7862426383981155,
"grad_norm": 7.6582417488098145,
"learning_rate": 2.2083146408713673e-05,
"loss": 0.4351,
"step": 1043
},
{
"epoch": 0.7869964664310954,
"grad_norm": 9.17532730102539,
"learning_rate": 2.1933846913363466e-05,
"loss": 0.5107,
"step": 1044
},
{
"epoch": 0.7877502944640754,
"grad_norm": 9.609545707702637,
"learning_rate": 2.178499163050617e-05,
"loss": 0.4606,
"step": 1045
},
{
"epoch": 0.7885041224970554,
"grad_norm": 9.567949295043945,
"learning_rate": 2.1636581407159105e-05,
"loss": 0.4663,
"step": 1046
},
{
"epoch": 0.7892579505300353,
"grad_norm": 9.527708053588867,
"learning_rate": 2.1488617087806982e-05,
"loss": 0.4712,
"step": 1047
},
{
"epoch": 0.7900117785630153,
"grad_norm": 9.680562973022461,
"learning_rate": 2.1341099514397266e-05,
"loss": 0.4975,
"step": 1048
},
{
"epoch": 0.7907656065959953,
"grad_norm": 10.399216651916504,
"learning_rate": 2.1194029526335303e-05,
"loss": 0.4586,
"step": 1049
},
{
"epoch": 0.7915194346289752,
"grad_norm": 10.869539260864258,
"learning_rate": 2.1047407960479702e-05,
"loss": 0.4429,
"step": 1050
},
{
"epoch": 0.7922732626619552,
"grad_norm": 5.385607719421387,
"learning_rate": 2.0901235651137284e-05,
"loss": 0.5019,
"step": 1051
},
{
"epoch": 0.7930270906949353,
"grad_norm": 5.6260223388671875,
"learning_rate": 2.0755513430058672e-05,
"loss": 0.4988,
"step": 1052
},
{
"epoch": 0.7937809187279152,
"grad_norm": 5.487570762634277,
"learning_rate": 2.0610242126433297e-05,
"loss": 0.4594,
"step": 1053
},
{
"epoch": 0.7945347467608952,
"grad_norm": 5.6461591720581055,
"learning_rate": 2.0465422566884805e-05,
"loss": 0.4642,
"step": 1054
},
{
"epoch": 0.7952885747938752,
"grad_norm": 5.7345123291015625,
"learning_rate": 2.0321055575466284e-05,
"loss": 0.4442,
"step": 1055
},
{
"epoch": 0.7960424028268551,
"grad_norm": 5.918202877044678,
"learning_rate": 2.0177141973655766e-05,
"loss": 0.4708,
"step": 1056
},
{
"epoch": 0.7967962308598351,
"grad_norm": 5.593347549438477,
"learning_rate": 2.0033682580351144e-05,
"loss": 0.4277,
"step": 1057
},
{
"epoch": 0.797550058892815,
"grad_norm": 5.557769775390625,
"learning_rate": 1.9890678211866033e-05,
"loss": 0.4267,
"step": 1058
},
{
"epoch": 0.798303886925795,
"grad_norm": 5.38918924331665,
"learning_rate": 1.9748129681924675e-05,
"loss": 0.4112,
"step": 1059
},
{
"epoch": 0.799057714958775,
"grad_norm": 5.82417631149292,
"learning_rate": 1.9606037801657673e-05,
"loss": 0.4104,
"step": 1060
},
{
"epoch": 0.799811542991755,
"grad_norm": 5.548363208770752,
"learning_rate": 1.9464403379596963e-05,
"loss": 0.4127,
"step": 1061
},
{
"epoch": 0.800565371024735,
"grad_norm": 5.548163890838623,
"learning_rate": 1.932322722167168e-05,
"loss": 0.4198,
"step": 1062
},
{
"epoch": 0.801319199057715,
"grad_norm": 5.443014621734619,
"learning_rate": 1.9182510131203224e-05,
"loss": 0.4012,
"step": 1063
},
{
"epoch": 0.802073027090695,
"grad_norm": 5.750105381011963,
"learning_rate": 1.9042252908900814e-05,
"loss": 0.4075,
"step": 1064
},
{
"epoch": 0.8028268551236749,
"grad_norm": 5.6281418800354,
"learning_rate": 1.8902456352856925e-05,
"loss": 0.3896,
"step": 1065
},
{
"epoch": 0.8035806831566549,
"grad_norm": 5.443961143493652,
"learning_rate": 1.8763121258542815e-05,
"loss": 0.4057,
"step": 1066
},
{
"epoch": 0.8043345111896348,
"grad_norm": 5.808502674102783,
"learning_rate": 1.86242484188038e-05,
"loss": 0.4137,
"step": 1067
},
{
"epoch": 0.8050883392226148,
"grad_norm": 5.866790294647217,
"learning_rate": 1.848583862385501e-05,
"loss": 0.4129,
"step": 1068
},
{
"epoch": 0.8058421672555948,
"grad_norm": 5.517582893371582,
"learning_rate": 1.8347892661276656e-05,
"loss": 0.3901,
"step": 1069
},
{
"epoch": 0.8065959952885748,
"grad_norm": 6.088197231292725,
"learning_rate": 1.82104113160097e-05,
"loss": 0.4125,
"step": 1070
},
{
"epoch": 0.8073498233215548,
"grad_norm": 5.613511562347412,
"learning_rate": 1.8073395370351287e-05,
"loss": 0.3968,
"step": 1071
},
{
"epoch": 0.8081036513545348,
"grad_norm": 5.712565898895264,
"learning_rate": 1.7936845603950447e-05,
"loss": 0.3925,
"step": 1072
},
{
"epoch": 0.8088574793875147,
"grad_norm": 5.371545314788818,
"learning_rate": 1.780076279380337e-05,
"loss": 0.3589,
"step": 1073
},
{
"epoch": 0.8096113074204947,
"grad_norm": 5.599592208862305,
"learning_rate": 1.7665147714249376e-05,
"loss": 0.3838,
"step": 1074
},
{
"epoch": 0.8103651354534747,
"grad_norm": 6.015298843383789,
"learning_rate": 1.753000113696617e-05,
"loss": 0.386,
"step": 1075
},
{
"epoch": 0.8111189634864546,
"grad_norm": 5.434444427490234,
"learning_rate": 1.7395323830965605e-05,
"loss": 0.3771,
"step": 1076
},
{
"epoch": 0.8118727915194346,
"grad_norm": 6.145053863525391,
"learning_rate": 1.726111656258932e-05,
"loss": 0.4039,
"step": 1077
},
{
"epoch": 0.8126266195524146,
"grad_norm": 5.7801384925842285,
"learning_rate": 1.7127380095504296e-05,
"loss": 0.3955,
"step": 1078
},
{
"epoch": 0.8133804475853946,
"grad_norm": 5.640938758850098,
"learning_rate": 1.699411519069858e-05,
"loss": 0.3788,
"step": 1079
},
{
"epoch": 0.8141342756183746,
"grad_norm": 5.714921951293945,
"learning_rate": 1.686132260647696e-05,
"loss": 0.3637,
"step": 1080
},
{
"epoch": 0.8148881036513546,
"grad_norm": 6.3913750648498535,
"learning_rate": 1.6729003098456576e-05,
"loss": 0.3815,
"step": 1081
},
{
"epoch": 0.8156419316843345,
"grad_norm": 5.981407642364502,
"learning_rate": 1.6597157419562703e-05,
"loss": 0.3756,
"step": 1082
},
{
"epoch": 0.8163957597173145,
"grad_norm": 6.408857822418213,
"learning_rate": 1.646578632002439e-05,
"loss": 0.4219,
"step": 1083
},
{
"epoch": 0.8171495877502944,
"grad_norm": 6.3557329177856445,
"learning_rate": 1.6334890547370286e-05,
"loss": 0.387,
"step": 1084
},
{
"epoch": 0.8179034157832744,
"grad_norm": 6.406612873077393,
"learning_rate": 1.6204470846424268e-05,
"loss": 0.3736,
"step": 1085
},
{
"epoch": 0.8186572438162544,
"grad_norm": 6.225420951843262,
"learning_rate": 1.607452795930131e-05,
"loss": 0.3886,
"step": 1086
},
{
"epoch": 0.8194110718492343,
"grad_norm": 6.3113789558410645,
"learning_rate": 1.594506262540324e-05,
"loss": 0.402,
"step": 1087
},
{
"epoch": 0.8201648998822144,
"grad_norm": 6.504429817199707,
"learning_rate": 1.5816075581414458e-05,
"loss": 0.3911,
"step": 1088
},
{
"epoch": 0.8209187279151944,
"grad_norm": 7.651139736175537,
"learning_rate": 1.56875675612978e-05,
"loss": 0.4127,
"step": 1089
},
{
"epoch": 0.8216725559481743,
"grad_norm": 6.864494800567627,
"learning_rate": 1.5559539296290403e-05,
"loss": 0.3841,
"step": 1090
},
{
"epoch": 0.8224263839811543,
"grad_norm": 7.120053291320801,
"learning_rate": 1.5431991514899446e-05,
"loss": 0.4185,
"step": 1091
},
{
"epoch": 0.8231802120141343,
"grad_norm": 7.861664295196533,
"learning_rate": 1.5304924942898068e-05,
"loss": 0.4293,
"step": 1092
},
{
"epoch": 0.8239340400471142,
"grad_norm": 8.355661392211914,
"learning_rate": 1.5178340303321314e-05,
"loss": 0.4559,
"step": 1093
},
{
"epoch": 0.8246878680800942,
"grad_norm": 8.859525680541992,
"learning_rate": 1.5052238316461753e-05,
"loss": 0.4503,
"step": 1094
},
{
"epoch": 0.8254416961130742,
"grad_norm": 9.211348533630371,
"learning_rate": 1.492661969986574e-05,
"loss": 0.4435,
"step": 1095
},
{
"epoch": 0.8261955241460541,
"grad_norm": 8.610541343688965,
"learning_rate": 1.4801485168329066e-05,
"loss": 0.4625,
"step": 1096
},
{
"epoch": 0.8269493521790342,
"grad_norm": 10.033802032470703,
"learning_rate": 1.4676835433892989e-05,
"loss": 0.437,
"step": 1097
},
{
"epoch": 0.8277031802120142,
"grad_norm": 10.607207298278809,
"learning_rate": 1.4552671205840163e-05,
"loss": 0.4369,
"step": 1098
},
{
"epoch": 0.8284570082449941,
"grad_norm": 10.07897663116455,
"learning_rate": 1.4428993190690677e-05,
"loss": 0.4563,
"step": 1099
},
{
"epoch": 0.8292108362779741,
"grad_norm": 12.518508911132812,
"learning_rate": 1.4305802092197829e-05,
"loss": 0.4645,
"step": 1100
},
{
"epoch": 0.8299646643109541,
"grad_norm": 5.578033924102783,
"learning_rate": 1.4183098611344415e-05,
"loss": 0.51,
"step": 1101
},
{
"epoch": 0.830718492343934,
"grad_norm": 5.301563739776611,
"learning_rate": 1.4060883446338502e-05,
"loss": 0.4486,
"step": 1102
},
{
"epoch": 0.831472320376914,
"grad_norm": 5.3994293212890625,
"learning_rate": 1.393915729260955e-05,
"loss": 0.4536,
"step": 1103
},
{
"epoch": 0.832226148409894,
"grad_norm": 5.560753345489502,
"learning_rate": 1.3817920842804433e-05,
"loss": 0.455,
"step": 1104
},
{
"epoch": 0.8329799764428739,
"grad_norm": 5.810977935791016,
"learning_rate": 1.3697174786783584e-05,
"loss": 0.4373,
"step": 1105
},
{
"epoch": 0.833733804475854,
"grad_norm": 5.4894256591796875,
"learning_rate": 1.3576919811616862e-05,
"loss": 0.4106,
"step": 1106
},
{
"epoch": 0.834487632508834,
"grad_norm": 5.865782737731934,
"learning_rate": 1.345715660157989e-05,
"loss": 0.4151,
"step": 1107
},
{
"epoch": 0.8352414605418139,
"grad_norm": 5.4949469566345215,
"learning_rate": 1.3337885838149988e-05,
"loss": 0.4422,
"step": 1108
},
{
"epoch": 0.8359952885747939,
"grad_norm": 5.45637845993042,
"learning_rate": 1.3219108200002418e-05,
"loss": 0.4237,
"step": 1109
},
{
"epoch": 0.8367491166077738,
"grad_norm": 5.681154251098633,
"learning_rate": 1.3100824363006326e-05,
"loss": 0.443,
"step": 1110
},
{
"epoch": 0.8375029446407538,
"grad_norm": 5.729828357696533,
"learning_rate": 1.2983035000221177e-05,
"loss": 0.4053,
"step": 1111
},
{
"epoch": 0.8382567726737338,
"grad_norm": 6.101329326629639,
"learning_rate": 1.2865740781892699e-05,
"loss": 0.4384,
"step": 1112
},
{
"epoch": 0.8390106007067137,
"grad_norm": 5.694645881652832,
"learning_rate": 1.2748942375449135e-05,
"loss": 0.4013,
"step": 1113
},
{
"epoch": 0.8397644287396937,
"grad_norm": 5.564671516418457,
"learning_rate": 1.263264044549748e-05,
"loss": 0.4148,
"step": 1114
},
{
"epoch": 0.8405182567726738,
"grad_norm": 5.393068313598633,
"learning_rate": 1.2516835653819725e-05,
"loss": 0.3981,
"step": 1115
},
{
"epoch": 0.8412720848056537,
"grad_norm": 5.637123107910156,
"learning_rate": 1.2401528659368911e-05,
"loss": 0.406,
"step": 1116
},
{
"epoch": 0.8420259128386337,
"grad_norm": 5.908216953277588,
"learning_rate": 1.2286720118265659e-05,
"loss": 0.3637,
"step": 1117
},
{
"epoch": 0.8427797408716137,
"grad_norm": 5.7352070808410645,
"learning_rate": 1.2172410683794177e-05,
"loss": 0.4082,
"step": 1118
},
{
"epoch": 0.8435335689045936,
"grad_norm": 5.5727858543396,
"learning_rate": 1.2058601006398718e-05,
"loss": 0.3828,
"step": 1119
},
{
"epoch": 0.8442873969375736,
"grad_norm": 6.22990608215332,
"learning_rate": 1.1945291733679764e-05,
"loss": 0.4306,
"step": 1120
},
{
"epoch": 0.8450412249705536,
"grad_norm": 5.981517314910889,
"learning_rate": 1.1832483510390469e-05,
"loss": 0.4177,
"step": 1121
},
{
"epoch": 0.8457950530035335,
"grad_norm": 5.5717973709106445,
"learning_rate": 1.1720176978432795e-05,
"loss": 0.375,
"step": 1122
},
{
"epoch": 0.8465488810365136,
"grad_norm": 5.83533239364624,
"learning_rate": 1.1608372776854103e-05,
"loss": 0.4141,
"step": 1123
},
{
"epoch": 0.8473027090694936,
"grad_norm": 5.770301342010498,
"learning_rate": 1.1497071541843306e-05,
"loss": 0.3698,
"step": 1124
},
{
"epoch": 0.8480565371024735,
"grad_norm": 5.999599933624268,
"learning_rate": 1.1386273906727363e-05,
"loss": 0.4177,
"step": 1125
},
{
"epoch": 0.8488103651354535,
"grad_norm": 5.716385841369629,
"learning_rate": 1.1275980501967642e-05,
"loss": 0.3931,
"step": 1126
},
{
"epoch": 0.8495641931684335,
"grad_norm": 6.15166711807251,
"learning_rate": 1.1166191955156346e-05,
"loss": 0.4025,
"step": 1127
},
{
"epoch": 0.8503180212014134,
"grad_norm": 6.117612361907959,
"learning_rate": 1.1056908891012884e-05,
"loss": 0.4186,
"step": 1128
},
{
"epoch": 0.8510718492343934,
"grad_norm": 6.109333038330078,
"learning_rate": 1.0948131931380457e-05,
"loss": 0.3863,
"step": 1129
},
{
"epoch": 0.8518256772673733,
"grad_norm": 5.863979816436768,
"learning_rate": 1.0839861695222354e-05,
"loss": 0.3737,
"step": 1130
},
{
"epoch": 0.8525795053003533,
"grad_norm": 5.980686664581299,
"learning_rate": 1.0732098798618517e-05,
"loss": 0.3739,
"step": 1131
},
{
"epoch": 0.8533333333333334,
"grad_norm": 6.321891784667969,
"learning_rate": 1.0624843854762034e-05,
"loss": 0.416,
"step": 1132
},
{
"epoch": 0.8540871613663134,
"grad_norm": 6.081487655639648,
"learning_rate": 1.0518097473955624e-05,
"loss": 0.3922,
"step": 1133
},
{
"epoch": 0.8548409893992933,
"grad_norm": 6.287003040313721,
"learning_rate": 1.0411860263608186e-05,
"loss": 0.3747,
"step": 1134
},
{
"epoch": 0.8555948174322733,
"grad_norm": 6.175232887268066,
"learning_rate": 1.0306132828231318e-05,
"loss": 0.3708,
"step": 1135
},
{
"epoch": 0.8563486454652532,
"grad_norm": 6.49648904800415,
"learning_rate": 1.0200915769435937e-05,
"loss": 0.373,
"step": 1136
},
{
"epoch": 0.8571024734982332,
"grad_norm": 6.249892234802246,
"learning_rate": 1.009620968592876e-05,
"loss": 0.3807,
"step": 1137
},
{
"epoch": 0.8578563015312132,
"grad_norm": 6.616731643676758,
"learning_rate": 9.992015173508995e-06,
"loss": 0.3981,
"step": 1138
},
{
"epoch": 0.8586101295641931,
"grad_norm": 6.801102638244629,
"learning_rate": 9.88833282506486e-06,
"loss": 0.3968,
"step": 1139
},
{
"epoch": 0.8593639575971731,
"grad_norm": 6.820323467254639,
"learning_rate": 9.785163230570282e-06,
"loss": 0.3939,
"step": 1140
},
{
"epoch": 0.8601177856301532,
"grad_norm": 8.20490837097168,
"learning_rate": 9.682506977081496e-06,
"loss": 0.4353,
"step": 1141
},
{
"epoch": 0.8608716136631331,
"grad_norm": 7.587864398956299,
"learning_rate": 9.580364648733775e-06,
"loss": 0.4369,
"step": 1142
},
{
"epoch": 0.8616254416961131,
"grad_norm": 7.294688701629639,
"learning_rate": 9.478736826737944e-06,
"loss": 0.411,
"step": 1143
},
{
"epoch": 0.8623792697290931,
"grad_norm": 7.802835464477539,
"learning_rate": 9.37762408937729e-06,
"loss": 0.424,
"step": 1144
},
{
"epoch": 0.863133097762073,
"grad_norm": 8.21778678894043,
"learning_rate": 9.277027012004125e-06,
"loss": 0.4752,
"step": 1145
},
{
"epoch": 0.863886925795053,
"grad_norm": 8.805744171142578,
"learning_rate": 9.176946167036516e-06,
"loss": 0.4736,
"step": 1146
},
{
"epoch": 0.864640753828033,
"grad_norm": 10.24565601348877,
"learning_rate": 9.07738212395508e-06,
"loss": 0.4635,
"step": 1147
},
{
"epoch": 0.8653945818610129,
"grad_norm": 9.218001365661621,
"learning_rate": 8.978335449299791e-06,
"loss": 0.4313,
"step": 1148
},
{
"epoch": 0.8661484098939929,
"grad_norm": 10.276748657226562,
"learning_rate": 8.87980670666655e-06,
"loss": 0.421,
"step": 1149
},
{
"epoch": 0.866902237926973,
"grad_norm": 11.982145309448242,
"learning_rate": 8.781796456704262e-06,
"loss": 0.4486,
"step": 1150
},
{
"epoch": 0.8676560659599529,
"grad_norm": 5.365624904632568,
"learning_rate": 8.684305257111425e-06,
"loss": 0.5014,
"step": 1151
},
{
"epoch": 0.8684098939929329,
"grad_norm": 5.599196910858154,
"learning_rate": 8.587333662633035e-06,
"loss": 0.4984,
"step": 1152
},
{
"epoch": 0.8691637220259129,
"grad_norm": 5.679477214813232,
"learning_rate": 8.490882225057428e-06,
"loss": 0.5011,
"step": 1153
},
{
"epoch": 0.8699175500588928,
"grad_norm": 5.679898738861084,
"learning_rate": 8.39495149321322e-06,
"loss": 0.443,
"step": 1154
},
{
"epoch": 0.8706713780918728,
"grad_norm": 5.414709091186523,
"learning_rate": 8.299542012965944e-06,
"loss": 0.4269,
"step": 1155
},
{
"epoch": 0.8714252061248527,
"grad_norm": 5.3179426193237305,
"learning_rate": 8.204654327215267e-06,
"loss": 0.4395,
"step": 1156
},
{
"epoch": 0.8721790341578327,
"grad_norm": 5.2444963455200195,
"learning_rate": 8.110288975891634e-06,
"loss": 0.4217,
"step": 1157
},
{
"epoch": 0.8729328621908127,
"grad_norm": 5.733283996582031,
"learning_rate": 8.016446495953367e-06,
"loss": 0.4395,
"step": 1158
},
{
"epoch": 0.8736866902237928,
"grad_norm": 5.545217037200928,
"learning_rate": 7.923127421383458e-06,
"loss": 0.436,
"step": 1159
},
{
"epoch": 0.8744405182567727,
"grad_norm": 5.759894371032715,
"learning_rate": 7.830332283186714e-06,
"loss": 0.4376,
"step": 1160
},
{
"epoch": 0.8751943462897527,
"grad_norm": 5.31406831741333,
"learning_rate": 7.73806160938656e-06,
"loss": 0.4097,
"step": 1161
},
{
"epoch": 0.8759481743227326,
"grad_norm": 5.372743129730225,
"learning_rate": 7.646315925022152e-06,
"loss": 0.4264,
"step": 1162
},
{
"epoch": 0.8767020023557126,
"grad_norm": 5.223913192749023,
"learning_rate": 7.555095752145313e-06,
"loss": 0.3879,
"step": 1163
},
{
"epoch": 0.8774558303886926,
"grad_norm": 5.493069171905518,
"learning_rate": 7.4644016098176615e-06,
"loss": 0.4099,
"step": 1164
},
{
"epoch": 0.8782096584216725,
"grad_norm": 5.413908004760742,
"learning_rate": 7.374234014107484e-06,
"loss": 0.4041,
"step": 1165
},
{
"epoch": 0.8789634864546525,
"grad_norm": 5.9703288078308105,
"learning_rate": 7.284593478087043e-06,
"loss": 0.4391,
"step": 1166
},
{
"epoch": 0.8797173144876325,
"grad_norm": 6.033265590667725,
"learning_rate": 7.195480511829411e-06,
"loss": 0.4356,
"step": 1167
},
{
"epoch": 0.8804711425206125,
"grad_norm": 5.589619159698486,
"learning_rate": 7.106895622405752e-06,
"loss": 0.4029,
"step": 1168
},
{
"epoch": 0.8812249705535925,
"grad_norm": 5.580582141876221,
"learning_rate": 7.018839313882286e-06,
"loss": 0.4039,
"step": 1169
},
{
"epoch": 0.8819787985865725,
"grad_norm": 5.605942726135254,
"learning_rate": 6.931312087317632e-06,
"loss": 0.3915,
"step": 1170
},
{
"epoch": 0.8827326266195524,
"grad_norm": 5.954355239868164,
"learning_rate": 6.844314440759647e-06,
"loss": 0.4119,
"step": 1171
},
{
"epoch": 0.8834864546525324,
"grad_norm": 5.943442344665527,
"learning_rate": 6.7578468692429345e-06,
"loss": 0.4227,
"step": 1172
},
{
"epoch": 0.8842402826855124,
"grad_norm": 6.070568561553955,
"learning_rate": 6.6719098647857525e-06,
"loss": 0.3824,
"step": 1173
},
{
"epoch": 0.8849941107184923,
"grad_norm": 5.827738285064697,
"learning_rate": 6.586503916387366e-06,
"loss": 0.4358,
"step": 1174
},
{
"epoch": 0.8857479387514723,
"grad_norm": 5.9503655433654785,
"learning_rate": 6.501629510025231e-06,
"loss": 0.3862,
"step": 1175
},
{
"epoch": 0.8865017667844522,
"grad_norm": 5.86431360244751,
"learning_rate": 6.417287128652172e-06,
"loss": 0.3849,
"step": 1176
},
{
"epoch": 0.8872555948174323,
"grad_norm": 5.833621978759766,
"learning_rate": 6.333477252193731e-06,
"loss": 0.3935,
"step": 1177
},
{
"epoch": 0.8880094228504123,
"grad_norm": 6.094554901123047,
"learning_rate": 6.250200357545377e-06,
"loss": 0.3911,
"step": 1178
},
{
"epoch": 0.8887632508833923,
"grad_norm": 5.814612865447998,
"learning_rate": 6.167456918569792e-06,
"loss": 0.3738,
"step": 1179
},
{
"epoch": 0.8895170789163722,
"grad_norm": 6.395360946655273,
"learning_rate": 6.085247406094197e-06,
"loss": 0.3692,
"step": 1180
},
{
"epoch": 0.8902709069493522,
"grad_norm": 5.914385795593262,
"learning_rate": 6.003572287907633e-06,
"loss": 0.4008,
"step": 1181
},
{
"epoch": 0.8910247349823321,
"grad_norm": 6.416135787963867,
"learning_rate": 5.922432028758362e-06,
"loss": 0.3997,
"step": 1182
},
{
"epoch": 0.8917785630153121,
"grad_norm": 5.680757522583008,
"learning_rate": 5.841827090351171e-06,
"loss": 0.347,
"step": 1183
},
{
"epoch": 0.8925323910482921,
"grad_norm": 5.837109088897705,
"learning_rate": 5.761757931344758e-06,
"loss": 0.3623,
"step": 1184
},
{
"epoch": 0.893286219081272,
"grad_norm": 5.914787769317627,
"learning_rate": 5.68222500734914e-06,
"loss": 0.3632,
"step": 1185
},
{
"epoch": 0.8940400471142521,
"grad_norm": 6.179137229919434,
"learning_rate": 5.603228770923041e-06,
"loss": 0.3864,
"step": 1186
},
{
"epoch": 0.8947938751472321,
"grad_norm": 5.854869365692139,
"learning_rate": 5.524769671571317e-06,
"loss": 0.3318,
"step": 1187
},
{
"epoch": 0.895547703180212,
"grad_norm": 6.880571365356445,
"learning_rate": 5.446848155742401e-06,
"loss": 0.4063,
"step": 1188
},
{
"epoch": 0.896301531213192,
"grad_norm": 6.602806568145752,
"learning_rate": 5.3694646668257855e-06,
"loss": 0.3698,
"step": 1189
},
{
"epoch": 0.897055359246172,
"grad_norm": 7.17775821685791,
"learning_rate": 5.292619645149433e-06,
"loss": 0.4266,
"step": 1190
},
{
"epoch": 0.8978091872791519,
"grad_norm": 7.022253036499023,
"learning_rate": 5.2163135279773904e-06,
"loss": 0.3885,
"step": 1191
},
{
"epoch": 0.8985630153121319,
"grad_norm": 7.834957599639893,
"learning_rate": 5.140546749507136e-06,
"loss": 0.4484,
"step": 1192
},
{
"epoch": 0.8993168433451119,
"grad_norm": 8.505350112915039,
"learning_rate": 5.06531974086728e-06,
"loss": 0.4535,
"step": 1193
},
{
"epoch": 0.9000706713780918,
"grad_norm": 8.074254035949707,
"learning_rate": 4.9906329301149914e-06,
"loss": 0.4528,
"step": 1194
},
{
"epoch": 0.9008244994110719,
"grad_norm": 8.195548057556152,
"learning_rate": 4.916486742233606e-06,
"loss": 0.447,
"step": 1195
},
{
"epoch": 0.9015783274440519,
"grad_norm": 9.020340919494629,
"learning_rate": 4.8428815991302005e-06,
"loss": 0.4507,
"step": 1196
},
{
"epoch": 0.9023321554770318,
"grad_norm": 9.480902671813965,
"learning_rate": 4.769817919633235e-06,
"loss": 0.4905,
"step": 1197
},
{
"epoch": 0.9030859835100118,
"grad_norm": 9.953953742980957,
"learning_rate": 4.697296119490047e-06,
"loss": 0.4291,
"step": 1198
},
{
"epoch": 0.9038398115429918,
"grad_norm": 9.974310874938965,
"learning_rate": 4.625316611364661e-06,
"loss": 0.4283,
"step": 1199
},
{
"epoch": 0.9045936395759717,
"grad_norm": 12.497854232788086,
"learning_rate": 4.553879804835282e-06,
"loss": 0.4614,
"step": 1200
},
{
"epoch": 0.9053474676089517,
"grad_norm": 4.8798136711120605,
"learning_rate": 4.482986106392073e-06,
"loss": 0.4771,
"step": 1201
},
{
"epoch": 0.9061012956419316,
"grad_norm": 4.956184387207031,
"learning_rate": 4.412635919434749e-06,
"loss": 0.4444,
"step": 1202
},
{
"epoch": 0.9068551236749116,
"grad_norm": 5.346173286437988,
"learning_rate": 4.342829644270429e-06,
"loss": 0.4442,
"step": 1203
},
{
"epoch": 0.9076089517078917,
"grad_norm": 5.293701648712158,
"learning_rate": 4.273567678111123e-06,
"loss": 0.4614,
"step": 1204
},
{
"epoch": 0.9083627797408717,
"grad_norm": 5.237243175506592,
"learning_rate": 4.204850415071748e-06,
"loss": 0.4512,
"step": 1205
},
{
"epoch": 0.9091166077738516,
"grad_norm": 5.3798604011535645,
"learning_rate": 4.136678246167636e-06,
"loss": 0.4286,
"step": 1206
},
{
"epoch": 0.9098704358068316,
"grad_norm": 5.367835998535156,
"learning_rate": 4.069051559312531e-06,
"loss": 0.4139,
"step": 1207
},
{
"epoch": 0.9106242638398115,
"grad_norm": 5.50463342666626,
"learning_rate": 4.001970739316163e-06,
"loss": 0.4407,
"step": 1208
},
{
"epoch": 0.9113780918727915,
"grad_norm": 5.295793056488037,
"learning_rate": 3.935436167882234e-06,
"loss": 0.418,
"step": 1209
},
{
"epoch": 0.9121319199057715,
"grad_norm": 5.284564018249512,
"learning_rate": 3.869448223606165e-06,
"loss": 0.4096,
"step": 1210
},
{
"epoch": 0.9128857479387514,
"grad_norm": 5.553956031799316,
"learning_rate": 3.8040072819729545e-06,
"loss": 0.4141,
"step": 1211
},
{
"epoch": 0.9136395759717314,
"grad_norm": 5.626007080078125,
"learning_rate": 3.7391137153550137e-06,
"loss": 0.4138,
"step": 1212
},
{
"epoch": 0.9143934040047115,
"grad_norm": 5.603013038635254,
"learning_rate": 3.6747678930101558e-06,
"loss": 0.4148,
"step": 1213
},
{
"epoch": 0.9151472320376914,
"grad_norm": 5.539734363555908,
"learning_rate": 3.6109701810793208e-06,
"loss": 0.4181,
"step": 1214
},
{
"epoch": 0.9159010600706714,
"grad_norm": 5.379584789276123,
"learning_rate": 3.5477209425846538e-06,
"loss": 0.4015,
"step": 1215
},
{
"epoch": 0.9166548881036514,
"grad_norm": 5.433023929595947,
"learning_rate": 3.4850205374273416e-06,
"loss": 0.398,
"step": 1216
},
{
"epoch": 0.9174087161366313,
"grad_norm": 5.5849199295043945,
"learning_rate": 3.4228693223856136e-06,
"loss": 0.4165,
"step": 1217
},
{
"epoch": 0.9181625441696113,
"grad_norm": 5.703511714935303,
"learning_rate": 3.361267651112676e-06,
"loss": 0.422,
"step": 1218
},
{
"epoch": 0.9189163722025913,
"grad_norm": 5.733764171600342,
"learning_rate": 3.30021587413476e-06,
"loss": 0.4017,
"step": 1219
},
{
"epoch": 0.9196702002355712,
"grad_norm": 5.802048206329346,
"learning_rate": 3.2397143388489983e-06,
"loss": 0.3935,
"step": 1220
},
{
"epoch": 0.9204240282685512,
"grad_norm": 5.458968639373779,
"learning_rate": 3.1797633895216394e-06,
"loss": 0.3783,
"step": 1221
},
{
"epoch": 0.9211778563015313,
"grad_norm": 5.353023052215576,
"learning_rate": 3.120363367285917e-06,
"loss": 0.3788,
"step": 1222
},
{
"epoch": 0.9219316843345112,
"grad_norm": 5.518474578857422,
"learning_rate": 3.0615146101401925e-06,
"loss": 0.3944,
"step": 1223
},
{
"epoch": 0.9226855123674912,
"grad_norm": 5.713134765625,
"learning_rate": 3.0032174529460165e-06,
"loss": 0.3953,
"step": 1224
},
{
"epoch": 0.9234393404004712,
"grad_norm": 6.142655372619629,
"learning_rate": 2.945472227426227e-06,
"loss": 0.4168,
"step": 1225
},
{
"epoch": 0.9241931684334511,
"grad_norm": 5.580604553222656,
"learning_rate": 2.8882792621630406e-06,
"loss": 0.3642,
"step": 1226
},
{
"epoch": 0.9249469964664311,
"grad_norm": 5.7619757652282715,
"learning_rate": 2.8316388825962324e-06,
"loss": 0.3708,
"step": 1227
},
{
"epoch": 0.925700824499411,
"grad_norm": 6.232563018798828,
"learning_rate": 2.7755514110212264e-06,
"loss": 0.4063,
"step": 1228
},
{
"epoch": 0.926454652532391,
"grad_norm": 5.895346164703369,
"learning_rate": 2.7200171665872742e-06,
"loss": 0.399,
"step": 1229
},
{
"epoch": 0.927208480565371,
"grad_norm": 5.760490894317627,
"learning_rate": 2.6650364652956894e-06,
"loss": 0.3785,
"step": 1230
},
{
"epoch": 0.927962308598351,
"grad_norm": 5.620173454284668,
"learning_rate": 2.6106096199979614e-06,
"loss": 0.3564,
"step": 1231
},
{
"epoch": 0.928716136631331,
"grad_norm": 5.84246826171875,
"learning_rate": 2.5567369403940776e-06,
"loss": 0.3575,
"step": 1232
},
{
"epoch": 0.929469964664311,
"grad_norm": 5.908325672149658,
"learning_rate": 2.50341873303066e-06,
"loss": 0.384,
"step": 1233
},
{
"epoch": 0.9302237926972909,
"grad_norm": 5.850981712341309,
"learning_rate": 2.4506553012993093e-06,
"loss": 0.3704,
"step": 1234
},
{
"epoch": 0.9309776207302709,
"grad_norm": 6.301943778991699,
"learning_rate": 2.398446945434818e-06,
"loss": 0.385,
"step": 1235
},
{
"epoch": 0.9317314487632509,
"grad_norm": 6.557477951049805,
"learning_rate": 2.346793962513483e-06,
"loss": 0.3607,
"step": 1236
},
{
"epoch": 0.9324852767962308,
"grad_norm": 6.442347049713135,
"learning_rate": 2.2956966464514175e-06,
"loss": 0.3829,
"step": 1237
},
{
"epoch": 0.9332391048292108,
"grad_norm": 7.224841594696045,
"learning_rate": 2.245155288002876e-06,
"loss": 0.3964,
"step": 1238
},
{
"epoch": 0.9339929328621908,
"grad_norm": 7.129518032073975,
"learning_rate": 2.1951701747585982e-06,
"loss": 0.3682,
"step": 1239
},
{
"epoch": 0.9347467608951708,
"grad_norm": 6.685035228729248,
"learning_rate": 2.1457415911442013e-06,
"loss": 0.4049,
"step": 1240
},
{
"epoch": 0.9355005889281508,
"grad_norm": 7.421708583831787,
"learning_rate": 2.0968698184184565e-06,
"loss": 0.4029,
"step": 1241
},
{
"epoch": 0.9362544169611308,
"grad_norm": 7.260560989379883,
"learning_rate": 2.04855513467187e-06,
"loss": 0.4232,
"step": 1242
},
{
"epoch": 0.9370082449941107,
"grad_norm": 8.069437980651855,
"learning_rate": 2.000797814824906e-06,
"loss": 0.4409,
"step": 1243
},
{
"epoch": 0.9377620730270907,
"grad_norm": 7.945827960968018,
"learning_rate": 1.9535981306265884e-06,
"loss": 0.4244,
"step": 1244
},
{
"epoch": 0.9385159010600707,
"grad_norm": 8.818882942199707,
"learning_rate": 1.9069563506527998e-06,
"loss": 0.4722,
"step": 1245
},
{
"epoch": 0.9392697290930506,
"grad_norm": 8.6805419921875,
"learning_rate": 1.8608727403049309e-06,
"loss": 0.4574,
"step": 1246
},
{
"epoch": 0.9400235571260306,
"grad_norm": 8.550375938415527,
"learning_rate": 1.8153475618081673e-06,
"loss": 0.4289,
"step": 1247
},
{
"epoch": 0.9407773851590105,
"grad_norm": 9.816337585449219,
"learning_rate": 1.7703810742101813e-06,
"loss": 0.4884,
"step": 1248
},
{
"epoch": 0.9415312131919906,
"grad_norm": 9.228532791137695,
"learning_rate": 1.7259735333795545e-06,
"loss": 0.4282,
"step": 1249
},
{
"epoch": 0.9422850412249706,
"grad_norm": 12.300414085388184,
"learning_rate": 1.6821251920043246e-06,
"loss": 0.4527,
"step": 1250
},
{
"epoch": 0.9430388692579506,
"grad_norm": 5.250865459442139,
"learning_rate": 1.6388362995905848e-06,
"loss": 0.509,
"step": 1251
},
{
"epoch": 0.9437926972909305,
"grad_norm": 5.213113307952881,
"learning_rate": 1.5961071024610752e-06,
"loss": 0.4615,
"step": 1252
},
{
"epoch": 0.9445465253239105,
"grad_norm": 5.200348377227783,
"learning_rate": 1.5539378437536944e-06,
"loss": 0.4463,
"step": 1253
},
{
"epoch": 0.9453003533568904,
"grad_norm": 5.2860941886901855,
"learning_rate": 1.5123287634202454e-06,
"loss": 0.4441,
"step": 1254
},
{
"epoch": 0.9460541813898704,
"grad_norm": 5.183274269104004,
"learning_rate": 1.4712800982249474e-06,
"loss": 0.4292,
"step": 1255
},
{
"epoch": 0.9468080094228504,
"grad_norm": 5.593634605407715,
"learning_rate": 1.430792081743182e-06,
"loss": 0.4589,
"step": 1256
},
{
"epoch": 0.9475618374558303,
"grad_norm": 5.3267388343811035,
"learning_rate": 1.3908649443600707e-06,
"loss": 0.4336,
"step": 1257
},
{
"epoch": 0.9483156654888104,
"grad_norm": 5.741166114807129,
"learning_rate": 1.351498913269289e-06,
"loss": 0.4008,
"step": 1258
},
{
"epoch": 0.9490694935217904,
"grad_norm": 5.336604118347168,
"learning_rate": 1.3126942124716213e-06,
"loss": 0.4218,
"step": 1259
},
{
"epoch": 0.9498233215547703,
"grad_norm": 5.611804962158203,
"learning_rate": 1.2744510627738516e-06,
"loss": 0.4434,
"step": 1260
},
{
"epoch": 0.9505771495877503,
"grad_norm": 5.724870204925537,
"learning_rate": 1.2367696817873419e-06,
"loss": 0.4227,
"step": 1261
},
{
"epoch": 0.9513309776207303,
"grad_norm": 5.307777404785156,
"learning_rate": 1.1996502839269453e-06,
"loss": 0.4002,
"step": 1262
},
{
"epoch": 0.9520848056537102,
"grad_norm": 5.79971170425415,
"learning_rate": 1.1630930804096495e-06,
"loss": 0.405,
"step": 1263
},
{
"epoch": 0.9528386336866902,
"grad_norm": 5.324243068695068,
"learning_rate": 1.127098279253491e-06,
"loss": 0.4043,
"step": 1264
},
{
"epoch": 0.9535924617196702,
"grad_norm": 5.532378673553467,
"learning_rate": 1.0916660852763216e-06,
"loss": 0.4068,
"step": 1265
},
{
"epoch": 0.9543462897526501,
"grad_norm": 5.695662021636963,
"learning_rate": 1.0567967000945866e-06,
"loss": 0.4286,
"step": 1266
},
{
"epoch": 0.9551001177856302,
"grad_norm": 5.8561482429504395,
"learning_rate": 1.0224903221222938e-06,
"loss": 0.4249,
"step": 1267
},
{
"epoch": 0.9558539458186102,
"grad_norm": 5.72511625289917,
"learning_rate": 9.88747146569813e-07,
"loss": 0.4021,
"step": 1268
},
{
"epoch": 0.9566077738515901,
"grad_norm": 5.385478973388672,
"learning_rate": 9.555673654427332e-07,
"loss": 0.3788,
"step": 1269
},
{
"epoch": 0.9573616018845701,
"grad_norm": 5.669264316558838,
"learning_rate": 9.229511675408642e-07,
"loss": 0.4148,
"step": 1270
},
{
"epoch": 0.95811542991755,
"grad_norm": 5.313277244567871,
"learning_rate": 8.90898738457091e-07,
"loss": 0.3641,
"step": 1271
},
{
"epoch": 0.95886925795053,
"grad_norm": 5.480482578277588,
"learning_rate": 8.59410260576321e-07,
"loss": 0.3971,
"step": 1272
},
{
"epoch": 0.95962308598351,
"grad_norm": 5.8209757804870605,
"learning_rate": 8.28485913074506e-07,
"loss": 0.3919,
"step": 1273
},
{
"epoch": 0.96037691401649,
"grad_norm": 5.919877052307129,
"learning_rate": 7.981258719175322e-07,
"loss": 0.3863,
"step": 1274
},
{
"epoch": 0.9611307420494699,
"grad_norm": 5.9404144287109375,
"learning_rate": 7.683303098602989e-07,
"loss": 0.4059,
"step": 1275
},
{
"epoch": 0.96188457008245,
"grad_norm": 5.609850883483887,
"learning_rate": 7.39099396445686e-07,
"loss": 0.3697,
"step": 1276
},
{
"epoch": 0.96263839811543,
"grad_norm": 5.695891857147217,
"learning_rate": 7.104332980036211e-07,
"loss": 0.3917,
"step": 1277
},
{
"epoch": 0.9633922261484099,
"grad_norm": 5.932850360870361,
"learning_rate": 6.823321776501024e-07,
"loss": 0.415,
"step": 1278
},
{
"epoch": 0.9641460541813899,
"grad_norm": 6.023778438568115,
"learning_rate": 6.547961952863002e-07,
"loss": 0.3817,
"step": 1279
},
{
"epoch": 0.9648998822143698,
"grad_norm": 5.926705360412598,
"learning_rate": 6.278255075976125e-07,
"loss": 0.3884,
"step": 1280
},
{
"epoch": 0.9656537102473498,
"grad_norm": 5.837738513946533,
"learning_rate": 6.014202680528324e-07,
"loss": 0.3598,
"step": 1281
},
{
"epoch": 0.9664075382803298,
"grad_norm": 6.178413391113281,
"learning_rate": 5.755806269031827e-07,
"loss": 0.3917,
"step": 1282
},
{
"epoch": 0.9671613663133097,
"grad_norm": 6.282332897186279,
"learning_rate": 5.503067311815713e-07,
"loss": 0.4286,
"step": 1283
},
{
"epoch": 0.9679151943462897,
"grad_norm": 6.746578216552734,
"learning_rate": 5.255987247016591e-07,
"loss": 0.4118,
"step": 1284
},
{
"epoch": 0.9686690223792698,
"grad_norm": 6.075422763824463,
"learning_rate": 5.014567480570831e-07,
"loss": 0.3829,
"step": 1285
},
{
"epoch": 0.9694228504122497,
"grad_norm": 6.149974346160889,
"learning_rate": 4.778809386206895e-07,
"loss": 0.3847,
"step": 1286
},
{
"epoch": 0.9701766784452297,
"grad_norm": 6.333911418914795,
"learning_rate": 4.548714305436685e-07,
"loss": 0.3638,
"step": 1287
},
{
"epoch": 0.9709305064782097,
"grad_norm": 6.391441345214844,
"learning_rate": 4.324283547548658e-07,
"loss": 0.3893,
"step": 1288
},
{
"epoch": 0.9716843345111896,
"grad_norm": 6.624934196472168,
"learning_rate": 4.1055183896001606e-07,
"loss": 0.378,
"step": 1289
},
{
"epoch": 0.9724381625441696,
"grad_norm": 6.473977565765381,
"learning_rate": 3.892420076409886e-07,
"loss": 0.366,
"step": 1290
},
{
"epoch": 0.9731919905771496,
"grad_norm": 6.985432147979736,
"learning_rate": 3.68498982055121e-07,
"loss": 0.4335,
"step": 1291
},
{
"epoch": 0.9739458186101295,
"grad_norm": 7.089210510253906,
"learning_rate": 3.483228802344973e-07,
"loss": 0.4066,
"step": 1292
},
{
"epoch": 0.9746996466431095,
"grad_norm": 7.46934175491333,
"learning_rate": 3.2871381698529324e-07,
"loss": 0.4253,
"step": 1293
},
{
"epoch": 0.9754534746760896,
"grad_norm": 8.461312294006348,
"learning_rate": 3.0967190388712097e-07,
"loss": 0.4596,
"step": 1294
},
{
"epoch": 0.9762073027090695,
"grad_norm": 8.289325714111328,
"learning_rate": 2.9119724929239645e-07,
"loss": 0.4382,
"step": 1295
},
{
"epoch": 0.9769611307420495,
"grad_norm": 8.890064239501953,
"learning_rate": 2.7328995832568426e-07,
"loss": 0.4469,
"step": 1296
},
{
"epoch": 0.9777149587750295,
"grad_norm": 8.737083435058594,
"learning_rate": 2.5595013288318703e-07,
"loss": 0.4262,
"step": 1297
},
{
"epoch": 0.9784687868080094,
"grad_norm": 9.281461715698242,
"learning_rate": 2.391778716320792e-07,
"loss": 0.4036,
"step": 1298
},
{
"epoch": 0.9792226148409894,
"grad_norm": 9.91952896118164,
"learning_rate": 2.2297327000996293e-07,
"loss": 0.4469,
"step": 1299
},
{
"epoch": 0.9799764428739693,
"grad_norm": 11.952555656433105,
"learning_rate": 2.0733642022437994e-07,
"loss": 0.4597,
"step": 1300
},
{
"epoch": 0.9807302709069493,
"grad_norm": 5.1298322677612305,
"learning_rate": 1.922674112522227e-07,
"loss": 0.478,
"step": 1301
},
{
"epoch": 0.9814840989399294,
"grad_norm": 5.572525501251221,
"learning_rate": 1.7776632883924615e-07,
"loss": 0.4829,
"step": 1302
},
{
"epoch": 0.9822379269729093,
"grad_norm": 5.343718528747559,
"learning_rate": 1.638332554996125e-07,
"loss": 0.4319,
"step": 1303
},
{
"epoch": 0.9829917550058893,
"grad_norm": 5.716027736663818,
"learning_rate": 1.5046827051536928e-07,
"loss": 0.4378,
"step": 1304
},
{
"epoch": 0.9837455830388693,
"grad_norm": 5.513693809509277,
"learning_rate": 1.3767144993602766e-07,
"loss": 0.4235,
"step": 1305
},
{
"epoch": 0.9844994110718492,
"grad_norm": 5.508944988250732,
"learning_rate": 1.254428665781515e-07,
"loss": 0.4007,
"step": 1306
},
{
"epoch": 0.9852532391048292,
"grad_norm": 5.180131435394287,
"learning_rate": 1.1378259002488013e-07,
"loss": 0.3939,
"step": 1307
},
{
"epoch": 0.9860070671378092,
"grad_norm": 5.590184688568115,
"learning_rate": 1.0269068662560611e-07,
"loss": 0.4166,
"step": 1308
},
{
"epoch": 0.9867608951707891,
"grad_norm": 5.44436502456665,
"learning_rate": 9.216721949553142e-08,
"loss": 0.4047,
"step": 1309
},
{
"epoch": 0.9875147232037691,
"grad_norm": 5.489165782928467,
"learning_rate": 8.221224851535647e-08,
"loss": 0.3999,
"step": 1310
},
{
"epoch": 0.9882685512367492,
"grad_norm": 5.663797855377197,
"learning_rate": 7.282583033091372e-08,
"loss": 0.3842,
"step": 1311
},
{
"epoch": 0.9890223792697291,
"grad_norm": 5.638896942138672,
"learning_rate": 6.400801835286796e-08,
"loss": 0.3977,
"step": 1312
},
{
"epoch": 0.9897762073027091,
"grad_norm": 5.8632307052612305,
"learning_rate": 5.57588627563721e-08,
"loss": 0.3579,
"step": 1313
},
{
"epoch": 0.9905300353356891,
"grad_norm": 5.826532363891602,
"learning_rate": 4.807841048082296e-08,
"loss": 0.4088,
"step": 1314
},
{
"epoch": 0.991283863368669,
"grad_norm": 5.712516784667969,
"learning_rate": 4.096670522959478e-08,
"loss": 0.3853,
"step": 1315
},
{
"epoch": 0.992037691401649,
"grad_norm": 6.0777459144592285,
"learning_rate": 3.442378746972841e-08,
"loss": 0.4111,
"step": 1316
},
{
"epoch": 0.992791519434629,
"grad_norm": 5.916062831878662,
"learning_rate": 2.844969443178691e-08,
"loss": 0.3821,
"step": 1317
},
{
"epoch": 0.9935453474676089,
"grad_norm": 5.911341190338135,
"learning_rate": 2.304446010958916e-08,
"loss": 0.38,
"step": 1318
},
{
"epoch": 0.9942991755005889,
"grad_norm": 6.334498405456543,
"learning_rate": 1.8208115260032187e-08,
"loss": 0.3812,
"step": 1319
},
{
"epoch": 0.995053003533569,
"grad_norm": 6.576707363128662,
"learning_rate": 1.3940687402924646e-08,
"loss": 0.3858,
"step": 1320
},
{
"epoch": 0.9958068315665489,
"grad_norm": 6.39242696762085,
"learning_rate": 1.0242200820786974e-08,
"loss": 0.3661,
"step": 1321
},
{
"epoch": 0.9965606595995289,
"grad_norm": 7.869157791137695,
"learning_rate": 7.112676558784781e-09,
"loss": 0.3966,
"step": 1322
},
{
"epoch": 0.9973144876325089,
"grad_norm": 7.689291954040527,
"learning_rate": 4.552132424562317e-09,
"loss": 0.4297,
"step": 1323
},
{
"epoch": 0.9980683156654888,
"grad_norm": 8.572519302368164,
"learning_rate": 2.5605829881203414e-09,
"loss": 0.451,
"step": 1324
},
{
"epoch": 0.9988221436984688,
"grad_norm": 9.072525024414062,
"learning_rate": 1.1380395818050282e-09,
"loss": 0.4373,
"step": 1325
},
{
"epoch": 0.9995759717314487,
"grad_norm": 9.224164962768555,
"learning_rate": 2.8451030018583623e-10,
"loss": 0.4368,
"step": 1326
},
{
"epoch": 1.0007067137809187,
"grad_norm": 5.4062819480896,
"learning_rate": 0.0,
"loss": 0.4839,
"step": 1327
}
],
"logging_steps": 1,
"max_steps": 1327,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 332,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8606342447625667e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}