{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5446166736489317,
"eval_steps": 500,
"global_step": 1300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00041893590280687055,
"grad_norm": 8.333969116210938,
"learning_rate": 2e-05,
"loss": 3.3261,
"step": 1
},
{
"epoch": 0.0008378718056137411,
"grad_norm": 7.348582744598389,
"learning_rate": 1.9998324256388776e-05,
"loss": 3.2312,
"step": 2
},
{
"epoch": 0.0012568077084206116,
"grad_norm": 3.1899449825286865,
"learning_rate": 1.9996648512777547e-05,
"loss": 2.9318,
"step": 3
},
{
"epoch": 0.0016757436112274822,
"grad_norm": 2.177359104156494,
"learning_rate": 1.9994972769166317e-05,
"loss": 2.8876,
"step": 4
},
{
"epoch": 0.0020946795140343527,
"grad_norm": 2.1708009243011475,
"learning_rate": 1.999329702555509e-05,
"loss": 3.068,
"step": 5
},
{
"epoch": 0.0025136154168412233,
"grad_norm": 1.77362060546875,
"learning_rate": 1.9991621281943865e-05,
"loss": 2.6598,
"step": 6
},
{
"epoch": 0.002932551319648094,
"grad_norm": 1.709831714630127,
"learning_rate": 1.9989945538332636e-05,
"loss": 2.767,
"step": 7
},
{
"epoch": 0.0033514872224549644,
"grad_norm": 1.5493128299713135,
"learning_rate": 1.998826979472141e-05,
"loss": 2.7182,
"step": 8
},
{
"epoch": 0.003770423125261835,
"grad_norm": 1.39557945728302,
"learning_rate": 1.998659405111018e-05,
"loss": 2.8695,
"step": 9
},
{
"epoch": 0.0041893590280687055,
"grad_norm": 1.1357755661010742,
"learning_rate": 1.9984918307498955e-05,
"loss": 2.5497,
"step": 10
},
{
"epoch": 0.004608294930875576,
"grad_norm": 1.0683544874191284,
"learning_rate": 1.9983242563887726e-05,
"loss": 2.7363,
"step": 11
},
{
"epoch": 0.005027230833682447,
"grad_norm": 0.9199109673500061,
"learning_rate": 1.99815668202765e-05,
"loss": 2.4438,
"step": 12
},
{
"epoch": 0.005446166736489317,
"grad_norm": 0.9708887338638306,
"learning_rate": 1.997989107666527e-05,
"loss": 2.6098,
"step": 13
},
{
"epoch": 0.005865102639296188,
"grad_norm": 0.9026995301246643,
"learning_rate": 1.9978215333054045e-05,
"loss": 2.4444,
"step": 14
},
{
"epoch": 0.006284038542103058,
"grad_norm": 1.2418183088302612,
"learning_rate": 1.9976539589442816e-05,
"loss": 2.5436,
"step": 15
},
{
"epoch": 0.006702974444909929,
"grad_norm": 0.8761052489280701,
"learning_rate": 1.997486384583159e-05,
"loss": 2.3926,
"step": 16
},
{
"epoch": 0.007121910347716799,
"grad_norm": 0.8849633932113647,
"learning_rate": 1.997318810222036e-05,
"loss": 2.5935,
"step": 17
},
{
"epoch": 0.00754084625052367,
"grad_norm": 0.8368175029754639,
"learning_rate": 1.9971512358609135e-05,
"loss": 2.4684,
"step": 18
},
{
"epoch": 0.007959782153330541,
"grad_norm": 0.697807788848877,
"learning_rate": 1.9969836614997905e-05,
"loss": 2.3537,
"step": 19
},
{
"epoch": 0.008378718056137411,
"grad_norm": 0.7476556301116943,
"learning_rate": 1.996816087138668e-05,
"loss": 2.4659,
"step": 20
},
{
"epoch": 0.008797653958944282,
"grad_norm": 0.7885666489601135,
"learning_rate": 1.9966485127775454e-05,
"loss": 2.4466,
"step": 21
},
{
"epoch": 0.009216589861751152,
"grad_norm": 0.7271686792373657,
"learning_rate": 1.9964809384164224e-05,
"loss": 2.3659,
"step": 22
},
{
"epoch": 0.009635525764558023,
"grad_norm": 0.7286465764045715,
"learning_rate": 1.9963133640552995e-05,
"loss": 2.4473,
"step": 23
},
{
"epoch": 0.010054461667364893,
"grad_norm": 0.8572853207588196,
"learning_rate": 1.996145789694177e-05,
"loss": 2.3595,
"step": 24
},
{
"epoch": 0.010473397570171765,
"grad_norm": 0.8283334374427795,
"learning_rate": 1.9959782153330543e-05,
"loss": 2.5291,
"step": 25
},
{
"epoch": 0.010892333472978634,
"grad_norm": 0.6586313843727112,
"learning_rate": 1.9958106409719314e-05,
"loss": 2.3985,
"step": 26
},
{
"epoch": 0.011311269375785506,
"grad_norm": 0.6430657505989075,
"learning_rate": 1.9956430666108085e-05,
"loss": 2.3578,
"step": 27
},
{
"epoch": 0.011730205278592375,
"grad_norm": 0.6550448536872864,
"learning_rate": 1.995475492249686e-05,
"loss": 2.4077,
"step": 28
},
{
"epoch": 0.012149141181399247,
"grad_norm": 0.7592840194702148,
"learning_rate": 1.9953079178885633e-05,
"loss": 2.5008,
"step": 29
},
{
"epoch": 0.012568077084206116,
"grad_norm": 0.7858672738075256,
"learning_rate": 1.9951403435274407e-05,
"loss": 2.4674,
"step": 30
},
{
"epoch": 0.012987012987012988,
"grad_norm": 0.6130352020263672,
"learning_rate": 1.9949727691663178e-05,
"loss": 2.3526,
"step": 31
},
{
"epoch": 0.013405948889819858,
"grad_norm": 0.6684207320213318,
"learning_rate": 1.994805194805195e-05,
"loss": 2.3732,
"step": 32
},
{
"epoch": 0.013824884792626729,
"grad_norm": 0.8275600671768188,
"learning_rate": 1.9946376204440723e-05,
"loss": 2.135,
"step": 33
},
{
"epoch": 0.014243820695433599,
"grad_norm": 0.5858725309371948,
"learning_rate": 1.9944700460829494e-05,
"loss": 2.1368,
"step": 34
},
{
"epoch": 0.01466275659824047,
"grad_norm": 0.7133444547653198,
"learning_rate": 1.9943024717218268e-05,
"loss": 2.304,
"step": 35
},
{
"epoch": 0.01508169250104734,
"grad_norm": 0.5466803312301636,
"learning_rate": 1.994134897360704e-05,
"loss": 2.1682,
"step": 36
},
{
"epoch": 0.015500628403854211,
"grad_norm": 0.5196086168289185,
"learning_rate": 1.9939673229995813e-05,
"loss": 2.1546,
"step": 37
},
{
"epoch": 0.015919564306661083,
"grad_norm": 0.5088497400283813,
"learning_rate": 1.9937997486384583e-05,
"loss": 2.1018,
"step": 38
},
{
"epoch": 0.016338500209467952,
"grad_norm": 0.6117899417877197,
"learning_rate": 1.9936321742773358e-05,
"loss": 2.2346,
"step": 39
},
{
"epoch": 0.016757436112274822,
"grad_norm": 0.5710458159446716,
"learning_rate": 1.993464599916213e-05,
"loss": 2.2147,
"step": 40
},
{
"epoch": 0.01717637201508169,
"grad_norm": 0.5152861475944519,
"learning_rate": 1.9932970255550902e-05,
"loss": 2.2716,
"step": 41
},
{
"epoch": 0.017595307917888565,
"grad_norm": 0.6851192712783813,
"learning_rate": 1.9931294511939673e-05,
"loss": 2.3158,
"step": 42
},
{
"epoch": 0.018014243820695434,
"grad_norm": 0.5485531687736511,
"learning_rate": 1.9929618768328447e-05,
"loss": 2.2679,
"step": 43
},
{
"epoch": 0.018433179723502304,
"grad_norm": 0.48592010140419006,
"learning_rate": 1.992794302471722e-05,
"loss": 2.1303,
"step": 44
},
{
"epoch": 0.018852115626309174,
"grad_norm": 0.5533665418624878,
"learning_rate": 1.9926267281105992e-05,
"loss": 2.1981,
"step": 45
},
{
"epoch": 0.019271051529116047,
"grad_norm": 0.5932656526565552,
"learning_rate": 1.9924591537494763e-05,
"loss": 2.3737,
"step": 46
},
{
"epoch": 0.019689987431922917,
"grad_norm": 0.5236673951148987,
"learning_rate": 1.9922915793883537e-05,
"loss": 2.2694,
"step": 47
},
{
"epoch": 0.020108923334729786,
"grad_norm": 0.5357316732406616,
"learning_rate": 1.992124005027231e-05,
"loss": 2.2368,
"step": 48
},
{
"epoch": 0.020527859237536656,
"grad_norm": 0.5500349998474121,
"learning_rate": 1.9919564306661082e-05,
"loss": 2.213,
"step": 49
},
{
"epoch": 0.02094679514034353,
"grad_norm": 0.48040810227394104,
"learning_rate": 1.9917888563049853e-05,
"loss": 2.1892,
"step": 50
},
{
"epoch": 0.0213657310431504,
"grad_norm": 0.5716186165809631,
"learning_rate": 1.9916212819438627e-05,
"loss": 2.2039,
"step": 51
},
{
"epoch": 0.02178466694595727,
"grad_norm": 0.5564374923706055,
"learning_rate": 1.99145370758274e-05,
"loss": 2.1411,
"step": 52
},
{
"epoch": 0.022203602848764138,
"grad_norm": 0.4996980130672455,
"learning_rate": 1.9912861332216175e-05,
"loss": 2.1521,
"step": 53
},
{
"epoch": 0.02262253875157101,
"grad_norm": 0.5239240527153015,
"learning_rate": 1.9911185588604946e-05,
"loss": 2.0742,
"step": 54
},
{
"epoch": 0.02304147465437788,
"grad_norm": 0.4403076767921448,
"learning_rate": 1.9909509844993716e-05,
"loss": 1.9841,
"step": 55
},
{
"epoch": 0.02346041055718475,
"grad_norm": 0.5169032216072083,
"learning_rate": 1.990783410138249e-05,
"loss": 2.0327,
"step": 56
},
{
"epoch": 0.02387934645999162,
"grad_norm": 0.4901898503303528,
"learning_rate": 1.9906158357771265e-05,
"loss": 2.0063,
"step": 57
},
{
"epoch": 0.024298282362798494,
"grad_norm": 0.6581910252571106,
"learning_rate": 1.9904482614160035e-05,
"loss": 2.1385,
"step": 58
},
{
"epoch": 0.024717218265605363,
"grad_norm": 0.4522070586681366,
"learning_rate": 1.9902806870548806e-05,
"loss": 1.9944,
"step": 59
},
{
"epoch": 0.025136154168412233,
"grad_norm": 0.5315820574760437,
"learning_rate": 1.990113112693758e-05,
"loss": 2.1579,
"step": 60
},
{
"epoch": 0.025555090071219103,
"grad_norm": 0.4661259353160858,
"learning_rate": 1.9899455383326354e-05,
"loss": 2.1193,
"step": 61
},
{
"epoch": 0.025974025974025976,
"grad_norm": 0.4940222203731537,
"learning_rate": 1.9897779639715125e-05,
"loss": 2.0844,
"step": 62
},
{
"epoch": 0.026392961876832845,
"grad_norm": 0.46520665287971497,
"learning_rate": 1.98961038961039e-05,
"loss": 1.9306,
"step": 63
},
{
"epoch": 0.026811897779639715,
"grad_norm": 0.5645989179611206,
"learning_rate": 1.989442815249267e-05,
"loss": 2.1236,
"step": 64
},
{
"epoch": 0.027230833682446585,
"grad_norm": 0.47880157828330994,
"learning_rate": 1.989275240888144e-05,
"loss": 2.0206,
"step": 65
},
{
"epoch": 0.027649769585253458,
"grad_norm": 0.6371349692344666,
"learning_rate": 1.9891076665270215e-05,
"loss": 2.019,
"step": 66
},
{
"epoch": 0.028068705488060328,
"grad_norm": 0.5742272734642029,
"learning_rate": 1.988940092165899e-05,
"loss": 2.0899,
"step": 67
},
{
"epoch": 0.028487641390867197,
"grad_norm": 0.5579768419265747,
"learning_rate": 1.988772517804776e-05,
"loss": 2.081,
"step": 68
},
{
"epoch": 0.028906577293674067,
"grad_norm": 0.5897182822227478,
"learning_rate": 1.988604943443653e-05,
"loss": 1.9601,
"step": 69
},
{
"epoch": 0.02932551319648094,
"grad_norm": 0.46881428360939026,
"learning_rate": 1.9884373690825305e-05,
"loss": 1.9085,
"step": 70
},
{
"epoch": 0.02974444909928781,
"grad_norm": 0.6095844507217407,
"learning_rate": 1.988269794721408e-05,
"loss": 1.9762,
"step": 71
},
{
"epoch": 0.03016338500209468,
"grad_norm": 0.599513053894043,
"learning_rate": 1.988102220360285e-05,
"loss": 1.8723,
"step": 72
},
{
"epoch": 0.03058232090490155,
"grad_norm": 0.585457980632782,
"learning_rate": 1.987934645999162e-05,
"loss": 1.9209,
"step": 73
},
{
"epoch": 0.031001256807708422,
"grad_norm": 0.42224225401878357,
"learning_rate": 1.9877670716380394e-05,
"loss": 1.9186,
"step": 74
},
{
"epoch": 0.03142019271051529,
"grad_norm": 0.4566991329193115,
"learning_rate": 1.987599497276917e-05,
"loss": 2.018,
"step": 75
},
{
"epoch": 0.031839128613322165,
"grad_norm": 0.47718995809555054,
"learning_rate": 1.9874319229157943e-05,
"loss": 2.0119,
"step": 76
},
{
"epoch": 0.03225806451612903,
"grad_norm": 0.4412285089492798,
"learning_rate": 1.9872643485546713e-05,
"loss": 1.9211,
"step": 77
},
{
"epoch": 0.032677000418935905,
"grad_norm": 0.4711454212665558,
"learning_rate": 1.9870967741935484e-05,
"loss": 1.9051,
"step": 78
},
{
"epoch": 0.03309593632174277,
"grad_norm": 0.4665948450565338,
"learning_rate": 1.9869291998324258e-05,
"loss": 1.9571,
"step": 79
},
{
"epoch": 0.033514872224549644,
"grad_norm": 0.46011775732040405,
"learning_rate": 1.9867616254713032e-05,
"loss": 1.9599,
"step": 80
},
{
"epoch": 0.03393380812735652,
"grad_norm": 0.46272069215774536,
"learning_rate": 1.9865940511101803e-05,
"loss": 1.9161,
"step": 81
},
{
"epoch": 0.03435274403016338,
"grad_norm": 0.5554195046424866,
"learning_rate": 1.9864264767490574e-05,
"loss": 2.0202,
"step": 82
},
{
"epoch": 0.034771679932970256,
"grad_norm": 0.5324104428291321,
"learning_rate": 1.9862589023879348e-05,
"loss": 1.9356,
"step": 83
},
{
"epoch": 0.03519061583577713,
"grad_norm": 0.5279750823974609,
"learning_rate": 1.9860913280268122e-05,
"loss": 1.9511,
"step": 84
},
{
"epoch": 0.035609551738583996,
"grad_norm": 0.5002080202102661,
"learning_rate": 1.9859237536656893e-05,
"loss": 1.9248,
"step": 85
},
{
"epoch": 0.03602848764139087,
"grad_norm": 0.5625497102737427,
"learning_rate": 1.9857561793045667e-05,
"loss": 2.0023,
"step": 86
},
{
"epoch": 0.036447423544197735,
"grad_norm": 0.6030247807502747,
"learning_rate": 1.9855886049434438e-05,
"loss": 1.893,
"step": 87
},
{
"epoch": 0.03686635944700461,
"grad_norm": 0.4760509729385376,
"learning_rate": 1.9854210305823212e-05,
"loss": 1.8902,
"step": 88
},
{
"epoch": 0.03728529534981148,
"grad_norm": 0.6618624925613403,
"learning_rate": 1.9852534562211983e-05,
"loss": 1.9173,
"step": 89
},
{
"epoch": 0.03770423125261835,
"grad_norm": 0.47204822301864624,
"learning_rate": 1.9850858818600757e-05,
"loss": 1.9266,
"step": 90
},
{
"epoch": 0.03812316715542522,
"grad_norm": 0.5421533584594727,
"learning_rate": 1.9849183074989527e-05,
"loss": 1.9796,
"step": 91
},
{
"epoch": 0.038542103058232094,
"grad_norm": 0.48972201347351074,
"learning_rate": 1.98475073313783e-05,
"loss": 1.91,
"step": 92
},
{
"epoch": 0.03896103896103896,
"grad_norm": 0.5566658973693848,
"learning_rate": 1.9845831587767072e-05,
"loss": 1.8992,
"step": 93
},
{
"epoch": 0.03937997486384583,
"grad_norm": 0.4685937464237213,
"learning_rate": 1.9844155844155846e-05,
"loss": 1.9231,
"step": 94
},
{
"epoch": 0.0397989107666527,
"grad_norm": 0.6744531393051147,
"learning_rate": 1.9842480100544617e-05,
"loss": 1.9109,
"step": 95
},
{
"epoch": 0.04021784666945957,
"grad_norm": 0.6984325051307678,
"learning_rate": 1.984080435693339e-05,
"loss": 1.9566,
"step": 96
},
{
"epoch": 0.040636782572266446,
"grad_norm": 0.6627328991889954,
"learning_rate": 1.9839128613322162e-05,
"loss": 1.9933,
"step": 97
},
{
"epoch": 0.04105571847507331,
"grad_norm": 0.4586343765258789,
"learning_rate": 1.9837452869710936e-05,
"loss": 1.7939,
"step": 98
},
{
"epoch": 0.041474654377880185,
"grad_norm": 0.6211162805557251,
"learning_rate": 1.983577712609971e-05,
"loss": 1.9164,
"step": 99
},
{
"epoch": 0.04189359028068706,
"grad_norm": 0.9397639632225037,
"learning_rate": 1.983410138248848e-05,
"loss": 2.0262,
"step": 100
},
{
"epoch": 0.042312526183493925,
"grad_norm": 0.7698065638542175,
"learning_rate": 1.9832425638877252e-05,
"loss": 1.8979,
"step": 101
},
{
"epoch": 0.0427314620863008,
"grad_norm": 0.5800043940544128,
"learning_rate": 1.9830749895266026e-05,
"loss": 1.9483,
"step": 102
},
{
"epoch": 0.043150397989107664,
"grad_norm": 0.7634892463684082,
"learning_rate": 1.98290741516548e-05,
"loss": 1.7777,
"step": 103
},
{
"epoch": 0.04356933389191454,
"grad_norm": 0.5963580012321472,
"learning_rate": 1.982739840804357e-05,
"loss": 1.9406,
"step": 104
},
{
"epoch": 0.04398826979472141,
"grad_norm": 0.6970496773719788,
"learning_rate": 1.982572266443234e-05,
"loss": 1.967,
"step": 105
},
{
"epoch": 0.044407205697528276,
"grad_norm": 0.5826534032821655,
"learning_rate": 1.9824046920821116e-05,
"loss": 1.8577,
"step": 106
},
{
"epoch": 0.04482614160033515,
"grad_norm": 0.60413658618927,
"learning_rate": 1.982237117720989e-05,
"loss": 1.9528,
"step": 107
},
{
"epoch": 0.04524507750314202,
"grad_norm": 0.7267922759056091,
"learning_rate": 1.982069543359866e-05,
"loss": 1.7067,
"step": 108
},
{
"epoch": 0.04566401340594889,
"grad_norm": 0.6376165747642517,
"learning_rate": 1.9819019689987435e-05,
"loss": 1.9888,
"step": 109
},
{
"epoch": 0.04608294930875576,
"grad_norm": 0.5887031555175781,
"learning_rate": 1.9817343946376205e-05,
"loss": 1.7777,
"step": 110
},
{
"epoch": 0.04650188521156263,
"grad_norm": 0.6548938155174255,
"learning_rate": 1.981566820276498e-05,
"loss": 1.9222,
"step": 111
},
{
"epoch": 0.0469208211143695,
"grad_norm": 0.5757064819335938,
"learning_rate": 1.981399245915375e-05,
"loss": 1.844,
"step": 112
},
{
"epoch": 0.047339757017176375,
"grad_norm": 0.7597166895866394,
"learning_rate": 1.9812316715542524e-05,
"loss": 1.8678,
"step": 113
},
{
"epoch": 0.04775869291998324,
"grad_norm": 0.5536984801292419,
"learning_rate": 1.9810640971931295e-05,
"loss": 1.8502,
"step": 114
},
{
"epoch": 0.048177628822790114,
"grad_norm": 0.5753149390220642,
"learning_rate": 1.980896522832007e-05,
"loss": 1.8613,
"step": 115
},
{
"epoch": 0.04859656472559699,
"grad_norm": 0.6214611530303955,
"learning_rate": 1.980728948470884e-05,
"loss": 1.8803,
"step": 116
},
{
"epoch": 0.04901550062840385,
"grad_norm": 0.5892764329910278,
"learning_rate": 1.9805613741097614e-05,
"loss": 1.8241,
"step": 117
},
{
"epoch": 0.049434436531210726,
"grad_norm": 0.5623623132705688,
"learning_rate": 1.9803937997486388e-05,
"loss": 1.8584,
"step": 118
},
{
"epoch": 0.04985337243401759,
"grad_norm": 0.5206480622291565,
"learning_rate": 1.980226225387516e-05,
"loss": 1.8058,
"step": 119
},
{
"epoch": 0.050272308336824466,
"grad_norm": 0.7416813373565674,
"learning_rate": 1.980058651026393e-05,
"loss": 1.7422,
"step": 120
},
{
"epoch": 0.05069124423963134,
"grad_norm": 0.6095878481864929,
"learning_rate": 1.9798910766652704e-05,
"loss": 1.7567,
"step": 121
},
{
"epoch": 0.051110180142438205,
"grad_norm": 0.5830249786376953,
"learning_rate": 1.9797235023041478e-05,
"loss": 1.8099,
"step": 122
},
{
"epoch": 0.05152911604524508,
"grad_norm": 0.7603867053985596,
"learning_rate": 1.979555927943025e-05,
"loss": 1.8074,
"step": 123
},
{
"epoch": 0.05194805194805195,
"grad_norm": 0.607905387878418,
"learning_rate": 1.979388353581902e-05,
"loss": 1.8359,
"step": 124
},
{
"epoch": 0.05236698785085882,
"grad_norm": 0.5446661710739136,
"learning_rate": 1.9792207792207794e-05,
"loss": 1.7291,
"step": 125
},
{
"epoch": 0.05278592375366569,
"grad_norm": 0.5527285933494568,
"learning_rate": 1.9790532048596568e-05,
"loss": 1.7841,
"step": 126
},
{
"epoch": 0.05320485965647256,
"grad_norm": 0.6565405130386353,
"learning_rate": 1.978885630498534e-05,
"loss": 1.9168,
"step": 127
},
{
"epoch": 0.05362379555927943,
"grad_norm": 0.5959405899047852,
"learning_rate": 1.978718056137411e-05,
"loss": 1.8735,
"step": 128
},
{
"epoch": 0.0540427314620863,
"grad_norm": 0.7685437202453613,
"learning_rate": 1.9785504817762883e-05,
"loss": 1.782,
"step": 129
},
{
"epoch": 0.05446166736489317,
"grad_norm": 0.5747430324554443,
"learning_rate": 1.9783829074151657e-05,
"loss": 1.7831,
"step": 130
},
{
"epoch": 0.05488060326770004,
"grad_norm": 0.7328975200653076,
"learning_rate": 1.9782153330540428e-05,
"loss": 1.7451,
"step": 131
},
{
"epoch": 0.055299539170506916,
"grad_norm": 0.5662095546722412,
"learning_rate": 1.9780477586929202e-05,
"loss": 1.6731,
"step": 132
},
{
"epoch": 0.05571847507331378,
"grad_norm": 0.6165090799331665,
"learning_rate": 1.9778801843317973e-05,
"loss": 1.7787,
"step": 133
},
{
"epoch": 0.056137410976120655,
"grad_norm": 0.6399924755096436,
"learning_rate": 1.9777126099706747e-05,
"loss": 1.8697,
"step": 134
},
{
"epoch": 0.05655634687892752,
"grad_norm": 0.6513495445251465,
"learning_rate": 1.9775450356095518e-05,
"loss": 1.7885,
"step": 135
},
{
"epoch": 0.056975282781734395,
"grad_norm": 0.652104914188385,
"learning_rate": 1.9773774612484292e-05,
"loss": 1.8462,
"step": 136
},
{
"epoch": 0.05739421868454127,
"grad_norm": 0.8418712615966797,
"learning_rate": 1.9772098868873063e-05,
"loss": 1.8771,
"step": 137
},
{
"epoch": 0.057813154587348134,
"grad_norm": 0.6133163571357727,
"learning_rate": 1.9770423125261837e-05,
"loss": 1.7998,
"step": 138
},
{
"epoch": 0.05823209049015501,
"grad_norm": 0.6718358993530273,
"learning_rate": 1.9768747381650608e-05,
"loss": 1.7633,
"step": 139
},
{
"epoch": 0.05865102639296188,
"grad_norm": 0.6728368997573853,
"learning_rate": 1.9767071638039382e-05,
"loss": 1.8313,
"step": 140
},
{
"epoch": 0.059069962295768746,
"grad_norm": 0.588307797908783,
"learning_rate": 1.9765395894428156e-05,
"loss": 1.7395,
"step": 141
},
{
"epoch": 0.05948889819857562,
"grad_norm": 0.889776885509491,
"learning_rate": 1.9763720150816927e-05,
"loss": 1.7591,
"step": 142
},
{
"epoch": 0.059907834101382486,
"grad_norm": 0.5996978878974915,
"learning_rate": 1.9762044407205697e-05,
"loss": 1.6881,
"step": 143
},
{
"epoch": 0.06032677000418936,
"grad_norm": 0.6324535012245178,
"learning_rate": 1.976036866359447e-05,
"loss": 1.7142,
"step": 144
},
{
"epoch": 0.06074570590699623,
"grad_norm": 0.6198902130126953,
"learning_rate": 1.9758692919983246e-05,
"loss": 1.6867,
"step": 145
},
{
"epoch": 0.0611646418098031,
"grad_norm": 0.6651074886322021,
"learning_rate": 1.9757017176372016e-05,
"loss": 1.7773,
"step": 146
},
{
"epoch": 0.06158357771260997,
"grad_norm": 0.6877864599227905,
"learning_rate": 1.9755341432760787e-05,
"loss": 1.792,
"step": 147
},
{
"epoch": 0.062002513615416845,
"grad_norm": 0.5880079865455627,
"learning_rate": 1.975366568914956e-05,
"loss": 1.7761,
"step": 148
},
{
"epoch": 0.06242144951822371,
"grad_norm": 0.6574519872665405,
"learning_rate": 1.9751989945538335e-05,
"loss": 1.7769,
"step": 149
},
{
"epoch": 0.06284038542103058,
"grad_norm": 0.5385615825653076,
"learning_rate": 1.9750314201927106e-05,
"loss": 1.7226,
"step": 150
},
{
"epoch": 0.06325932132383745,
"grad_norm": 0.6323086023330688,
"learning_rate": 1.9748638458315877e-05,
"loss": 1.7043,
"step": 151
},
{
"epoch": 0.06367825722664433,
"grad_norm": 0.5224570035934448,
"learning_rate": 1.974696271470465e-05,
"loss": 1.7103,
"step": 152
},
{
"epoch": 0.0640971931294512,
"grad_norm": 0.5448949933052063,
"learning_rate": 1.9745286971093425e-05,
"loss": 1.7691,
"step": 153
},
{
"epoch": 0.06451612903225806,
"grad_norm": 0.47703322768211365,
"learning_rate": 1.97436112274822e-05,
"loss": 1.7107,
"step": 154
},
{
"epoch": 0.06493506493506493,
"grad_norm": 0.555005669593811,
"learning_rate": 1.974193548387097e-05,
"loss": 1.7493,
"step": 155
},
{
"epoch": 0.06535400083787181,
"grad_norm": 0.5083591938018799,
"learning_rate": 1.974025974025974e-05,
"loss": 1.621,
"step": 156
},
{
"epoch": 0.06577293674067868,
"grad_norm": 0.5037218928337097,
"learning_rate": 1.9738583996648515e-05,
"loss": 1.6905,
"step": 157
},
{
"epoch": 0.06619187264348554,
"grad_norm": 0.48570749163627625,
"learning_rate": 1.973690825303729e-05,
"loss": 1.73,
"step": 158
},
{
"epoch": 0.06661080854629242,
"grad_norm": 0.5082443952560425,
"learning_rate": 1.973523250942606e-05,
"loss": 1.7107,
"step": 159
},
{
"epoch": 0.06702974444909929,
"grad_norm": 0.5330659747123718,
"learning_rate": 1.973355676581483e-05,
"loss": 1.7756,
"step": 160
},
{
"epoch": 0.06744868035190615,
"grad_norm": 0.6401243209838867,
"learning_rate": 1.9731881022203605e-05,
"loss": 1.7171,
"step": 161
},
{
"epoch": 0.06786761625471303,
"grad_norm": 0.5555040836334229,
"learning_rate": 1.9730205278592375e-05,
"loss": 1.7117,
"step": 162
},
{
"epoch": 0.0682865521575199,
"grad_norm": 0.5580366253852844,
"learning_rate": 1.972852953498115e-05,
"loss": 1.7173,
"step": 163
},
{
"epoch": 0.06870548806032677,
"grad_norm": 0.6296005249023438,
"learning_rate": 1.9726853791369924e-05,
"loss": 1.7316,
"step": 164
},
{
"epoch": 0.06912442396313365,
"grad_norm": 0.681302547454834,
"learning_rate": 1.9725178047758694e-05,
"loss": 1.7849,
"step": 165
},
{
"epoch": 0.06954335986594051,
"grad_norm": 0.6220213770866394,
"learning_rate": 1.9723502304147465e-05,
"loss": 1.656,
"step": 166
},
{
"epoch": 0.06996229576874738,
"grad_norm": 0.6640814542770386,
"learning_rate": 1.972182656053624e-05,
"loss": 1.8749,
"step": 167
},
{
"epoch": 0.07038123167155426,
"grad_norm": 0.5521919131278992,
"learning_rate": 1.9720150816925013e-05,
"loss": 1.763,
"step": 168
},
{
"epoch": 0.07080016757436113,
"grad_norm": 0.6188511848449707,
"learning_rate": 1.9718475073313784e-05,
"loss": 1.6431,
"step": 169
},
{
"epoch": 0.07121910347716799,
"grad_norm": 0.5388275980949402,
"learning_rate": 1.9716799329702555e-05,
"loss": 1.7444,
"step": 170
},
{
"epoch": 0.07163803937997486,
"grad_norm": 1.0150574445724487,
"learning_rate": 1.971512358609133e-05,
"loss": 1.6986,
"step": 171
},
{
"epoch": 0.07205697528278174,
"grad_norm": 0.6714919805526733,
"learning_rate": 1.9713447842480103e-05,
"loss": 1.7461,
"step": 172
},
{
"epoch": 0.0724759111855886,
"grad_norm": 0.6587640047073364,
"learning_rate": 1.9711772098868874e-05,
"loss": 1.6743,
"step": 173
},
{
"epoch": 0.07289484708839547,
"grad_norm": 0.6181256175041199,
"learning_rate": 1.9710096355257648e-05,
"loss": 1.8552,
"step": 174
},
{
"epoch": 0.07331378299120235,
"grad_norm": 0.5564039945602417,
"learning_rate": 1.970842061164642e-05,
"loss": 1.741,
"step": 175
},
{
"epoch": 0.07373271889400922,
"grad_norm": 0.5421382188796997,
"learning_rate": 1.9706744868035193e-05,
"loss": 1.6919,
"step": 176
},
{
"epoch": 0.07415165479681608,
"grad_norm": 0.6172086000442505,
"learning_rate": 1.9705069124423967e-05,
"loss": 1.6686,
"step": 177
},
{
"epoch": 0.07457059069962296,
"grad_norm": 0.5004185438156128,
"learning_rate": 1.9703393380812738e-05,
"loss": 1.6309,
"step": 178
},
{
"epoch": 0.07498952660242983,
"grad_norm": 0.5099078416824341,
"learning_rate": 1.970171763720151e-05,
"loss": 1.674,
"step": 179
},
{
"epoch": 0.0754084625052367,
"grad_norm": 0.5554249882698059,
"learning_rate": 1.9700041893590283e-05,
"loss": 1.6227,
"step": 180
},
{
"epoch": 0.07582739840804358,
"grad_norm": 0.6313985586166382,
"learning_rate": 1.9698366149979057e-05,
"loss": 1.7098,
"step": 181
},
{
"epoch": 0.07624633431085044,
"grad_norm": 0.8186052441596985,
"learning_rate": 1.9696690406367827e-05,
"loss": 1.7933,
"step": 182
},
{
"epoch": 0.07666527021365731,
"grad_norm": 0.5017969608306885,
"learning_rate": 1.9695014662756598e-05,
"loss": 1.6853,
"step": 183
},
{
"epoch": 0.07708420611646419,
"grad_norm": 0.6917697787284851,
"learning_rate": 1.9693338919145372e-05,
"loss": 1.7031,
"step": 184
},
{
"epoch": 0.07750314201927105,
"grad_norm": 0.5040557980537415,
"learning_rate": 1.9691663175534146e-05,
"loss": 1.6353,
"step": 185
},
{
"epoch": 0.07792207792207792,
"grad_norm": 0.5733162760734558,
"learning_rate": 1.9689987431922917e-05,
"loss": 1.7278,
"step": 186
},
{
"epoch": 0.07834101382488479,
"grad_norm": 0.7823026776313782,
"learning_rate": 1.968831168831169e-05,
"loss": 1.6954,
"step": 187
},
{
"epoch": 0.07875994972769167,
"grad_norm": 0.5559296607971191,
"learning_rate": 1.9686635944700462e-05,
"loss": 1.7417,
"step": 188
},
{
"epoch": 0.07917888563049853,
"grad_norm": 0.6399711966514587,
"learning_rate": 1.9684960201089236e-05,
"loss": 1.6529,
"step": 189
},
{
"epoch": 0.0795978215333054,
"grad_norm": 0.6075267195701599,
"learning_rate": 1.9683284457478007e-05,
"loss": 1.687,
"step": 190
},
{
"epoch": 0.08001675743611228,
"grad_norm": 0.5875303149223328,
"learning_rate": 1.968160871386678e-05,
"loss": 1.6569,
"step": 191
},
{
"epoch": 0.08043569333891915,
"grad_norm": 0.6546170711517334,
"learning_rate": 1.9679932970255552e-05,
"loss": 1.6844,
"step": 192
},
{
"epoch": 0.08085462924172601,
"grad_norm": 0.5778879523277283,
"learning_rate": 1.9678257226644322e-05,
"loss": 1.6705,
"step": 193
},
{
"epoch": 0.08127356514453289,
"grad_norm": 0.5718396306037903,
"learning_rate": 1.9676581483033097e-05,
"loss": 1.6519,
"step": 194
},
{
"epoch": 0.08169250104733976,
"grad_norm": 0.6596693992614746,
"learning_rate": 1.967490573942187e-05,
"loss": 1.6547,
"step": 195
},
{
"epoch": 0.08211143695014662,
"grad_norm": 0.5057825446128845,
"learning_rate": 1.9673229995810645e-05,
"loss": 1.6214,
"step": 196
},
{
"epoch": 0.0825303728529535,
"grad_norm": 0.6651629209518433,
"learning_rate": 1.9671554252199416e-05,
"loss": 1.6835,
"step": 197
},
{
"epoch": 0.08294930875576037,
"grad_norm": 0.5056618452072144,
"learning_rate": 1.9669878508588186e-05,
"loss": 1.6368,
"step": 198
},
{
"epoch": 0.08336824465856724,
"grad_norm": 0.4693203568458557,
"learning_rate": 1.966820276497696e-05,
"loss": 1.6322,
"step": 199
},
{
"epoch": 0.08378718056137412,
"grad_norm": 0.5992833375930786,
"learning_rate": 1.9666527021365735e-05,
"loss": 1.7568,
"step": 200
},
{
"epoch": 0.08420611646418098,
"grad_norm": 0.62791508436203,
"learning_rate": 1.9664851277754505e-05,
"loss": 1.6937,
"step": 201
},
{
"epoch": 0.08462505236698785,
"grad_norm": 0.5130066275596619,
"learning_rate": 1.9663175534143276e-05,
"loss": 1.6236,
"step": 202
},
{
"epoch": 0.08504398826979472,
"grad_norm": 0.5045161247253418,
"learning_rate": 1.966149979053205e-05,
"loss": 1.7386,
"step": 203
},
{
"epoch": 0.0854629241726016,
"grad_norm": 0.6568188667297363,
"learning_rate": 1.9659824046920824e-05,
"loss": 1.6639,
"step": 204
},
{
"epoch": 0.08588186007540846,
"grad_norm": 0.6545958518981934,
"learning_rate": 1.9658148303309595e-05,
"loss": 1.6593,
"step": 205
},
{
"epoch": 0.08630079597821533,
"grad_norm": 0.5190823078155518,
"learning_rate": 1.9656472559698366e-05,
"loss": 1.6398,
"step": 206
},
{
"epoch": 0.08671973188102221,
"grad_norm": 0.5478256344795227,
"learning_rate": 1.965479681608714e-05,
"loss": 1.5848,
"step": 207
},
{
"epoch": 0.08713866778382907,
"grad_norm": 0.5818894505500793,
"learning_rate": 1.9653121072475914e-05,
"loss": 1.6046,
"step": 208
},
{
"epoch": 0.08755760368663594,
"grad_norm": 0.6687189936637878,
"learning_rate": 1.9651445328864685e-05,
"loss": 1.6177,
"step": 209
},
{
"epoch": 0.08797653958944282,
"grad_norm": 0.5873174071311951,
"learning_rate": 1.964976958525346e-05,
"loss": 1.659,
"step": 210
},
{
"epoch": 0.08839547549224969,
"grad_norm": 0.5621105432510376,
"learning_rate": 1.964809384164223e-05,
"loss": 1.6739,
"step": 211
},
{
"epoch": 0.08881441139505655,
"grad_norm": 0.7059792876243591,
"learning_rate": 1.9646418098031004e-05,
"loss": 1.6738,
"step": 212
},
{
"epoch": 0.08923334729786343,
"grad_norm": 0.5294623970985413,
"learning_rate": 1.9644742354419775e-05,
"loss": 1.6419,
"step": 213
},
{
"epoch": 0.0896522832006703,
"grad_norm": 0.7764983773231506,
"learning_rate": 1.964306661080855e-05,
"loss": 1.6793,
"step": 214
},
{
"epoch": 0.09007121910347717,
"grad_norm": 0.628094494342804,
"learning_rate": 1.964139086719732e-05,
"loss": 1.676,
"step": 215
},
{
"epoch": 0.09049015500628405,
"grad_norm": 0.5593112111091614,
"learning_rate": 1.9639715123586094e-05,
"loss": 1.7119,
"step": 216
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.7385965585708618,
"learning_rate": 1.9638039379974864e-05,
"loss": 1.6039,
"step": 217
},
{
"epoch": 0.09132802681189778,
"grad_norm": 0.5662972927093506,
"learning_rate": 1.963636363636364e-05,
"loss": 1.6255,
"step": 218
},
{
"epoch": 0.09174696271470464,
"grad_norm": 0.7071956992149353,
"learning_rate": 1.9634687892752413e-05,
"loss": 1.6587,
"step": 219
},
{
"epoch": 0.09216589861751152,
"grad_norm": 0.6986474990844727,
"learning_rate": 1.9633012149141183e-05,
"loss": 1.6266,
"step": 220
},
{
"epoch": 0.09258483452031839,
"grad_norm": 0.6965208053588867,
"learning_rate": 1.9631336405529954e-05,
"loss": 1.6619,
"step": 221
},
{
"epoch": 0.09300377042312526,
"grad_norm": 0.7024741768836975,
"learning_rate": 1.9629660661918728e-05,
"loss": 1.7327,
"step": 222
},
{
"epoch": 0.09342270632593214,
"grad_norm": 0.5887707471847534,
"learning_rate": 1.9627984918307502e-05,
"loss": 1.6706,
"step": 223
},
{
"epoch": 0.093841642228739,
"grad_norm": 0.8550237417221069,
"learning_rate": 1.9626309174696273e-05,
"loss": 1.5773,
"step": 224
},
{
"epoch": 0.09426057813154587,
"grad_norm": 0.6820223331451416,
"learning_rate": 1.9624633431085044e-05,
"loss": 1.7496,
"step": 225
},
{
"epoch": 0.09467951403435275,
"grad_norm": 0.7763844728469849,
"learning_rate": 1.9622957687473818e-05,
"loss": 1.6204,
"step": 226
},
{
"epoch": 0.09509844993715962,
"grad_norm": 0.779120147228241,
"learning_rate": 1.9621281943862592e-05,
"loss": 1.617,
"step": 227
},
{
"epoch": 0.09551738583996648,
"grad_norm": 0.7589849233627319,
"learning_rate": 1.9619606200251363e-05,
"loss": 1.5478,
"step": 228
},
{
"epoch": 0.09593632174277336,
"grad_norm": 0.5152125954627991,
"learning_rate": 1.9617930456640137e-05,
"loss": 1.6749,
"step": 229
},
{
"epoch": 0.09635525764558023,
"grad_norm": 0.7013604640960693,
"learning_rate": 1.9616254713028908e-05,
"loss": 1.5864,
"step": 230
},
{
"epoch": 0.0967741935483871,
"grad_norm": 0.7294275760650635,
"learning_rate": 1.9614578969417682e-05,
"loss": 1.5845,
"step": 231
},
{
"epoch": 0.09719312945119397,
"grad_norm": 0.5346665382385254,
"learning_rate": 1.9612903225806452e-05,
"loss": 1.643,
"step": 232
},
{
"epoch": 0.09761206535400084,
"grad_norm": 0.953779935836792,
"learning_rate": 1.9611227482195227e-05,
"loss": 1.6152,
"step": 233
},
{
"epoch": 0.0980310012568077,
"grad_norm": 0.6668707132339478,
"learning_rate": 1.9609551738583997e-05,
"loss": 1.6066,
"step": 234
},
{
"epoch": 0.09844993715961457,
"grad_norm": 0.6693033576011658,
"learning_rate": 1.960787599497277e-05,
"loss": 1.6124,
"step": 235
},
{
"epoch": 0.09886887306242145,
"grad_norm": 0.8081066608428955,
"learning_rate": 1.9606200251361542e-05,
"loss": 1.6351,
"step": 236
},
{
"epoch": 0.09928780896522832,
"grad_norm": 0.7561270594596863,
"learning_rate": 1.9604524507750316e-05,
"loss": 1.6476,
"step": 237
},
{
"epoch": 0.09970674486803519,
"grad_norm": 0.8342212438583374,
"learning_rate": 1.9602848764139087e-05,
"loss": 1.5573,
"step": 238
},
{
"epoch": 0.10012568077084207,
"grad_norm": 0.8095865845680237,
"learning_rate": 1.960117302052786e-05,
"loss": 1.5594,
"step": 239
},
{
"epoch": 0.10054461667364893,
"grad_norm": 0.8448402881622314,
"learning_rate": 1.9599497276916632e-05,
"loss": 1.6646,
"step": 240
},
{
"epoch": 0.1009635525764558,
"grad_norm": 0.936273455619812,
"learning_rate": 1.9597821533305406e-05,
"loss": 1.5822,
"step": 241
},
{
"epoch": 0.10138248847926268,
"grad_norm": 0.5605466365814209,
"learning_rate": 1.959614578969418e-05,
"loss": 1.616,
"step": 242
},
{
"epoch": 0.10180142438206954,
"grad_norm": 1.0700498819351196,
"learning_rate": 1.959447004608295e-05,
"loss": 1.618,
"step": 243
},
{
"epoch": 0.10222036028487641,
"grad_norm": 0.6166669726371765,
"learning_rate": 1.959279430247172e-05,
"loss": 1.5989,
"step": 244
},
{
"epoch": 0.10263929618768329,
"grad_norm": 0.7001603841781616,
"learning_rate": 1.9591118558860496e-05,
"loss": 1.464,
"step": 245
},
{
"epoch": 0.10305823209049016,
"grad_norm": 0.5694488883018494,
"learning_rate": 1.958944281524927e-05,
"loss": 1.5641,
"step": 246
},
{
"epoch": 0.10347716799329702,
"grad_norm": 0.5658904314041138,
"learning_rate": 1.958776707163804e-05,
"loss": 1.5911,
"step": 247
},
{
"epoch": 0.1038961038961039,
"grad_norm": 0.6600093245506287,
"learning_rate": 1.958609132802681e-05,
"loss": 1.6812,
"step": 248
},
{
"epoch": 0.10431503979891077,
"grad_norm": 0.7548564672470093,
"learning_rate": 1.9584415584415586e-05,
"loss": 1.5563,
"step": 249
},
{
"epoch": 0.10473397570171764,
"grad_norm": 0.6965343952178955,
"learning_rate": 1.958273984080436e-05,
"loss": 1.6126,
"step": 250
},
{
"epoch": 0.1051529116045245,
"grad_norm": 0.57705157995224,
"learning_rate": 1.958106409719313e-05,
"loss": 1.6778,
"step": 251
},
{
"epoch": 0.10557184750733138,
"grad_norm": 1.2478373050689697,
"learning_rate": 1.9579388353581905e-05,
"loss": 1.6654,
"step": 252
},
{
"epoch": 0.10599078341013825,
"grad_norm": 0.586651623249054,
"learning_rate": 1.9577712609970675e-05,
"loss": 1.5895,
"step": 253
},
{
"epoch": 0.10640971931294511,
"grad_norm": 0.6501399874687195,
"learning_rate": 1.957603686635945e-05,
"loss": 1.5991,
"step": 254
},
{
"epoch": 0.106828655215752,
"grad_norm": 0.7814245223999023,
"learning_rate": 1.9574361122748224e-05,
"loss": 1.5765,
"step": 255
},
{
"epoch": 0.10724759111855886,
"grad_norm": 0.691818356513977,
"learning_rate": 1.9572685379136994e-05,
"loss": 1.6663,
"step": 256
},
{
"epoch": 0.10766652702136573,
"grad_norm": 0.9530359506607056,
"learning_rate": 1.9571009635525765e-05,
"loss": 1.6614,
"step": 257
},
{
"epoch": 0.1080854629241726,
"grad_norm": 1.280455231666565,
"learning_rate": 1.956933389191454e-05,
"loss": 1.5998,
"step": 258
},
{
"epoch": 0.10850439882697947,
"grad_norm": 0.6066200137138367,
"learning_rate": 1.956765814830331e-05,
"loss": 1.6352,
"step": 259
},
{
"epoch": 0.10892333472978634,
"grad_norm": 0.6973633766174316,
"learning_rate": 1.9565982404692084e-05,
"loss": 1.619,
"step": 260
},
{
"epoch": 0.10934227063259322,
"grad_norm": 0.857652485370636,
"learning_rate": 1.9564306661080855e-05,
"loss": 1.6507,
"step": 261
},
{
"epoch": 0.10976120653540009,
"grad_norm": 0.5751752853393555,
"learning_rate": 1.956263091746963e-05,
"loss": 1.6866,
"step": 262
},
{
"epoch": 0.11018014243820695,
"grad_norm": 0.899047315120697,
"learning_rate": 1.95609551738584e-05,
"loss": 1.6211,
"step": 263
},
{
"epoch": 0.11059907834101383,
"grad_norm": 0.6887657046318054,
"learning_rate": 1.9559279430247174e-05,
"loss": 1.5289,
"step": 264
},
{
"epoch": 0.1110180142438207,
"grad_norm": 0.6921897530555725,
"learning_rate": 1.9557603686635948e-05,
"loss": 1.6667,
"step": 265
},
{
"epoch": 0.11143695014662756,
"grad_norm": 0.5706766843795776,
"learning_rate": 1.955592794302472e-05,
"loss": 1.6057,
"step": 266
},
{
"epoch": 0.11185588604943443,
"grad_norm": 0.7291983962059021,
"learning_rate": 1.955425219941349e-05,
"loss": 1.5033,
"step": 267
},
{
"epoch": 0.11227482195224131,
"grad_norm": 0.5996133089065552,
"learning_rate": 1.9552576455802263e-05,
"loss": 1.5232,
"step": 268
},
{
"epoch": 0.11269375785504818,
"grad_norm": 0.6987999677658081,
"learning_rate": 1.9550900712191038e-05,
"loss": 1.5832,
"step": 269
},
{
"epoch": 0.11311269375785504,
"grad_norm": 0.7466180920600891,
"learning_rate": 1.954922496857981e-05,
"loss": 1.6155,
"step": 270
},
{
"epoch": 0.11353162966066192,
"grad_norm": 0.5365012884140015,
"learning_rate": 1.954754922496858e-05,
"loss": 1.4844,
"step": 271
},
{
"epoch": 0.11395056556346879,
"grad_norm": 0.6302610635757446,
"learning_rate": 1.9545873481357353e-05,
"loss": 1.6401,
"step": 272
},
{
"epoch": 0.11436950146627566,
"grad_norm": 0.7011299729347229,
"learning_rate": 1.9544197737746127e-05,
"loss": 1.5856,
"step": 273
},
{
"epoch": 0.11478843736908254,
"grad_norm": 0.5213186740875244,
"learning_rate": 1.95425219941349e-05,
"loss": 1.6877,
"step": 274
},
{
"epoch": 0.1152073732718894,
"grad_norm": 0.5973451137542725,
"learning_rate": 1.9540846250523672e-05,
"loss": 1.6255,
"step": 275
},
{
"epoch": 0.11562630917469627,
"grad_norm": 0.8726099729537964,
"learning_rate": 1.9539170506912443e-05,
"loss": 1.6345,
"step": 276
},
{
"epoch": 0.11604524507750315,
"grad_norm": 0.6559906005859375,
"learning_rate": 1.9537494763301217e-05,
"loss": 1.4426,
"step": 277
},
{
"epoch": 0.11646418098031001,
"grad_norm": 1.2165895700454712,
"learning_rate": 1.953581901968999e-05,
"loss": 1.597,
"step": 278
},
{
"epoch": 0.11688311688311688,
"grad_norm": 0.5331985354423523,
"learning_rate": 1.9534143276078762e-05,
"loss": 1.5959,
"step": 279
},
{
"epoch": 0.11730205278592376,
"grad_norm": 0.7331112027168274,
"learning_rate": 1.9532467532467533e-05,
"loss": 1.6,
"step": 280
},
{
"epoch": 0.11772098868873063,
"grad_norm": 0.6983991265296936,
"learning_rate": 1.9530791788856307e-05,
"loss": 1.6188,
"step": 281
},
{
"epoch": 0.11813992459153749,
"grad_norm": 0.6612614393234253,
"learning_rate": 1.952911604524508e-05,
"loss": 1.579,
"step": 282
},
{
"epoch": 0.11855886049434436,
"grad_norm": 0.6999834179878235,
"learning_rate": 1.952744030163385e-05,
"loss": 1.715,
"step": 283
},
{
"epoch": 0.11897779639715124,
"grad_norm": 0.6970024108886719,
"learning_rate": 1.9525764558022622e-05,
"loss": 1.6406,
"step": 284
},
{
"epoch": 0.1193967322999581,
"grad_norm": 0.5302523374557495,
"learning_rate": 1.9524088814411397e-05,
"loss": 1.6388,
"step": 285
},
{
"epoch": 0.11981566820276497,
"grad_norm": 0.7106460332870483,
"learning_rate": 1.952241307080017e-05,
"loss": 1.5679,
"step": 286
},
{
"epoch": 0.12023460410557185,
"grad_norm": 0.6428540945053101,
"learning_rate": 1.952073732718894e-05,
"loss": 1.5151,
"step": 287
},
{
"epoch": 0.12065354000837872,
"grad_norm": 0.6660862565040588,
"learning_rate": 1.9519061583577716e-05,
"loss": 1.6999,
"step": 288
},
{
"epoch": 0.12107247591118558,
"grad_norm": 0.6251623034477234,
"learning_rate": 1.9517385839966486e-05,
"loss": 1.5289,
"step": 289
},
{
"epoch": 0.12149141181399246,
"grad_norm": 0.5240997672080994,
"learning_rate": 1.9515710096355257e-05,
"loss": 1.5991,
"step": 290
},
{
"epoch": 0.12191034771679933,
"grad_norm": 0.47173094749450684,
"learning_rate": 1.951403435274403e-05,
"loss": 1.5734,
"step": 291
},
{
"epoch": 0.1223292836196062,
"grad_norm": 0.775086522102356,
"learning_rate": 1.9512358609132805e-05,
"loss": 1.4597,
"step": 292
},
{
"epoch": 0.12274821952241308,
"grad_norm": 0.622778594493866,
"learning_rate": 1.9510682865521576e-05,
"loss": 1.5872,
"step": 293
},
{
"epoch": 0.12316715542521994,
"grad_norm": 0.5721079111099243,
"learning_rate": 1.9509007121910347e-05,
"loss": 1.5226,
"step": 294
},
{
"epoch": 0.12358609132802681,
"grad_norm": 0.8591808676719666,
"learning_rate": 1.950733137829912e-05,
"loss": 1.6192,
"step": 295
},
{
"epoch": 0.12400502723083369,
"grad_norm": 0.5390528440475464,
"learning_rate": 1.9505655634687895e-05,
"loss": 1.5228,
"step": 296
},
{
"epoch": 0.12442396313364056,
"grad_norm": 0.6414217948913574,
"learning_rate": 1.950397989107667e-05,
"loss": 1.6464,
"step": 297
},
{
"epoch": 0.12484289903644742,
"grad_norm": 0.9504109621047974,
"learning_rate": 1.950230414746544e-05,
"loss": 1.4574,
"step": 298
},
{
"epoch": 0.1252618349392543,
"grad_norm": 0.7934843301773071,
"learning_rate": 1.950062840385421e-05,
"loss": 1.5825,
"step": 299
},
{
"epoch": 0.12568077084206117,
"grad_norm": 1.2911075353622437,
"learning_rate": 1.9498952660242985e-05,
"loss": 1.5519,
"step": 300
},
{
"epoch": 0.12609970674486803,
"grad_norm": 0.7778868079185486,
"learning_rate": 1.949727691663176e-05,
"loss": 1.569,
"step": 301
},
{
"epoch": 0.1265186426476749,
"grad_norm": 0.669278621673584,
"learning_rate": 1.949560117302053e-05,
"loss": 1.6144,
"step": 302
},
{
"epoch": 0.12693757855048177,
"grad_norm": 1.0039944648742676,
"learning_rate": 1.94939254294093e-05,
"loss": 1.5249,
"step": 303
},
{
"epoch": 0.12735651445328866,
"grad_norm": 0.7724584937095642,
"learning_rate": 1.9492249685798074e-05,
"loss": 1.6277,
"step": 304
},
{
"epoch": 0.12777545035609553,
"grad_norm": 0.759467601776123,
"learning_rate": 1.949057394218685e-05,
"loss": 1.5756,
"step": 305
},
{
"epoch": 0.1281943862589024,
"grad_norm": 1.3089075088500977,
"learning_rate": 1.948889819857562e-05,
"loss": 1.5371,
"step": 306
},
{
"epoch": 0.12861332216170926,
"grad_norm": 0.5424050688743591,
"learning_rate": 1.9487222454964393e-05,
"loss": 1.5824,
"step": 307
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.7417402267456055,
"learning_rate": 1.9485546711353164e-05,
"loss": 1.5458,
"step": 308
},
{
"epoch": 0.129451193967323,
"grad_norm": 0.6556446552276611,
"learning_rate": 1.948387096774194e-05,
"loss": 1.6494,
"step": 309
},
{
"epoch": 0.12987012987012986,
"grad_norm": 0.9575488567352295,
"learning_rate": 1.948219522413071e-05,
"loss": 1.5509,
"step": 310
},
{
"epoch": 0.13028906577293675,
"grad_norm": 0.8725043535232544,
"learning_rate": 1.9480519480519483e-05,
"loss": 1.5099,
"step": 311
},
{
"epoch": 0.13070800167574362,
"grad_norm": 1.1596598625183105,
"learning_rate": 1.9478843736908254e-05,
"loss": 1.6001,
"step": 312
},
{
"epoch": 0.13112693757855048,
"grad_norm": 0.986149251461029,
"learning_rate": 1.9477167993297028e-05,
"loss": 1.4582,
"step": 313
},
{
"epoch": 0.13154587348135735,
"grad_norm": 0.6939131021499634,
"learning_rate": 1.94754922496858e-05,
"loss": 1.6263,
"step": 314
},
{
"epoch": 0.13196480938416422,
"grad_norm": 0.664408802986145,
"learning_rate": 1.9473816506074573e-05,
"loss": 1.538,
"step": 315
},
{
"epoch": 0.13238374528697108,
"grad_norm": 0.6826114058494568,
"learning_rate": 1.9472140762463344e-05,
"loss": 1.5806,
"step": 316
},
{
"epoch": 0.13280268118977798,
"grad_norm": 0.6879693269729614,
"learning_rate": 1.9470465018852118e-05,
"loss": 1.5156,
"step": 317
},
{
"epoch": 0.13322161709258484,
"grad_norm": 1.2276116609573364,
"learning_rate": 1.946878927524089e-05,
"loss": 1.4466,
"step": 318
},
{
"epoch": 0.1336405529953917,
"grad_norm": 0.7346695065498352,
"learning_rate": 1.9467113531629663e-05,
"loss": 1.5769,
"step": 319
},
{
"epoch": 0.13405948889819858,
"grad_norm": 0.795690655708313,
"learning_rate": 1.9465437788018437e-05,
"loss": 1.5348,
"step": 320
},
{
"epoch": 0.13447842480100544,
"grad_norm": 0.8207523822784424,
"learning_rate": 1.9463762044407208e-05,
"loss": 1.5196,
"step": 321
},
{
"epoch": 0.1348973607038123,
"grad_norm": 0.6205607056617737,
"learning_rate": 1.9462086300795978e-05,
"loss": 1.5508,
"step": 322
},
{
"epoch": 0.13531629660661917,
"grad_norm": 0.7060804963111877,
"learning_rate": 1.9460410557184752e-05,
"loss": 1.547,
"step": 323
},
{
"epoch": 0.13573523250942607,
"grad_norm": 0.6053579449653625,
"learning_rate": 1.9458734813573527e-05,
"loss": 1.5521,
"step": 324
},
{
"epoch": 0.13615416841223293,
"grad_norm": 0.6387944221496582,
"learning_rate": 1.9457059069962297e-05,
"loss": 1.5785,
"step": 325
},
{
"epoch": 0.1365731043150398,
"grad_norm": 0.7160474061965942,
"learning_rate": 1.9455383326351068e-05,
"loss": 1.5527,
"step": 326
},
{
"epoch": 0.13699204021784667,
"grad_norm": 0.5747194290161133,
"learning_rate": 1.9453707582739842e-05,
"loss": 1.4228,
"step": 327
},
{
"epoch": 0.13741097612065353,
"grad_norm": 0.7289405465126038,
"learning_rate": 1.9452031839128616e-05,
"loss": 1.5758,
"step": 328
},
{
"epoch": 0.1378299120234604,
"grad_norm": 0.5292596817016602,
"learning_rate": 1.9450356095517387e-05,
"loss": 1.5287,
"step": 329
},
{
"epoch": 0.1382488479262673,
"grad_norm": 0.7831090092658997,
"learning_rate": 1.944868035190616e-05,
"loss": 1.5584,
"step": 330
},
{
"epoch": 0.13866778382907416,
"grad_norm": 0.6046620011329651,
"learning_rate": 1.9447004608294932e-05,
"loss": 1.5534,
"step": 331
},
{
"epoch": 0.13908671973188103,
"grad_norm": 0.7165292501449585,
"learning_rate": 1.9445328864683706e-05,
"loss": 1.621,
"step": 332
},
{
"epoch": 0.1395056556346879,
"grad_norm": 0.7406589388847351,
"learning_rate": 1.9443653121072477e-05,
"loss": 1.5954,
"step": 333
},
{
"epoch": 0.13992459153749476,
"grad_norm": 0.5955418348312378,
"learning_rate": 1.944197737746125e-05,
"loss": 1.5457,
"step": 334
},
{
"epoch": 0.14034352744030162,
"grad_norm": 0.5523016452789307,
"learning_rate": 1.944030163385002e-05,
"loss": 1.5131,
"step": 335
},
{
"epoch": 0.14076246334310852,
"grad_norm": 0.7677832841873169,
"learning_rate": 1.9438625890238796e-05,
"loss": 1.5521,
"step": 336
},
{
"epoch": 0.14118139924591538,
"grad_norm": 0.6301062107086182,
"learning_rate": 1.9436950146627566e-05,
"loss": 1.5836,
"step": 337
},
{
"epoch": 0.14160033514872225,
"grad_norm": 0.6077446341514587,
"learning_rate": 1.943527440301634e-05,
"loss": 1.5108,
"step": 338
},
{
"epoch": 0.14201927105152912,
"grad_norm": 0.6678585410118103,
"learning_rate": 1.943359865940511e-05,
"loss": 1.5488,
"step": 339
},
{
"epoch": 0.14243820695433598,
"grad_norm": 0.7583240270614624,
"learning_rate": 1.9431922915793885e-05,
"loss": 1.551,
"step": 340
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.5899871587753296,
"learning_rate": 1.9430247172182656e-05,
"loss": 1.5522,
"step": 341
},
{
"epoch": 0.14327607875994972,
"grad_norm": 0.6008652448654175,
"learning_rate": 1.942857142857143e-05,
"loss": 1.5742,
"step": 342
},
{
"epoch": 0.1436950146627566,
"grad_norm": 0.5442250967025757,
"learning_rate": 1.9426895684960204e-05,
"loss": 1.5352,
"step": 343
},
{
"epoch": 0.14411395056556348,
"grad_norm": 0.5791664123535156,
"learning_rate": 1.9425219941348975e-05,
"loss": 1.5924,
"step": 344
},
{
"epoch": 0.14453288646837034,
"grad_norm": 0.5690919160842896,
"learning_rate": 1.9423544197737746e-05,
"loss": 1.5219,
"step": 345
},
{
"epoch": 0.1449518223711772,
"grad_norm": 0.6443383693695068,
"learning_rate": 1.942186845412652e-05,
"loss": 1.5994,
"step": 346
},
{
"epoch": 0.14537075827398407,
"grad_norm": 0.5575938820838928,
"learning_rate": 1.9420192710515294e-05,
"loss": 1.5579,
"step": 347
},
{
"epoch": 0.14578969417679094,
"grad_norm": 0.612278401851654,
"learning_rate": 1.9418516966904065e-05,
"loss": 1.6067,
"step": 348
},
{
"epoch": 0.14620863007959783,
"grad_norm": 0.6119928956031799,
"learning_rate": 1.9416841223292836e-05,
"loss": 1.4636,
"step": 349
},
{
"epoch": 0.1466275659824047,
"grad_norm": 0.568268895149231,
"learning_rate": 1.941516547968161e-05,
"loss": 1.4923,
"step": 350
},
{
"epoch": 0.14704650188521157,
"grad_norm": 0.6106241345405579,
"learning_rate": 1.9413489736070384e-05,
"loss": 1.5735,
"step": 351
},
{
"epoch": 0.14746543778801843,
"grad_norm": 0.5713450312614441,
"learning_rate": 1.9411813992459158e-05,
"loss": 1.4594,
"step": 352
},
{
"epoch": 0.1478843736908253,
"grad_norm": 0.5222604274749756,
"learning_rate": 1.941013824884793e-05,
"loss": 1.4957,
"step": 353
},
{
"epoch": 0.14830330959363217,
"grad_norm": 0.6617407202720642,
"learning_rate": 1.94084625052367e-05,
"loss": 1.3623,
"step": 354
},
{
"epoch": 0.14872224549643903,
"grad_norm": 0.6270737648010254,
"learning_rate": 1.9406786761625474e-05,
"loss": 1.494,
"step": 355
},
{
"epoch": 0.14914118139924593,
"grad_norm": 0.6663143634796143,
"learning_rate": 1.9405111018014244e-05,
"loss": 1.5093,
"step": 356
},
{
"epoch": 0.1495601173020528,
"grad_norm": 0.5815137624740601,
"learning_rate": 1.940343527440302e-05,
"loss": 1.5392,
"step": 357
},
{
"epoch": 0.14997905320485966,
"grad_norm": 0.5811892747879028,
"learning_rate": 1.940175953079179e-05,
"loss": 1.5553,
"step": 358
},
{
"epoch": 0.15039798910766652,
"grad_norm": 0.5487301349639893,
"learning_rate": 1.9400083787180563e-05,
"loss": 1.6208,
"step": 359
},
{
"epoch": 0.1508169250104734,
"grad_norm": 0.5299431681632996,
"learning_rate": 1.9398408043569334e-05,
"loss": 1.6097,
"step": 360
},
{
"epoch": 0.15123586091328026,
"grad_norm": 0.6918801665306091,
"learning_rate": 1.9396732299958108e-05,
"loss": 1.5027,
"step": 361
},
{
"epoch": 0.15165479681608715,
"grad_norm": 0.5945777893066406,
"learning_rate": 1.939505655634688e-05,
"loss": 1.6155,
"step": 362
},
{
"epoch": 0.15207373271889402,
"grad_norm": 0.700545072555542,
"learning_rate": 1.9393380812735653e-05,
"loss": 1.6345,
"step": 363
},
{
"epoch": 0.15249266862170088,
"grad_norm": 0.5498125553131104,
"learning_rate": 1.9391705069124424e-05,
"loss": 1.5087,
"step": 364
},
{
"epoch": 0.15291160452450775,
"grad_norm": 0.5619140267372131,
"learning_rate": 1.9390029325513198e-05,
"loss": 1.4977,
"step": 365
},
{
"epoch": 0.15333054042731462,
"grad_norm": 0.5968044400215149,
"learning_rate": 1.9388353581901972e-05,
"loss": 1.5017,
"step": 366
},
{
"epoch": 0.15374947633012148,
"grad_norm": 0.6269423365592957,
"learning_rate": 1.9386677838290743e-05,
"loss": 1.5811,
"step": 367
},
{
"epoch": 0.15416841223292838,
"grad_norm": 0.5672966241836548,
"learning_rate": 1.9385002094679514e-05,
"loss": 1.4781,
"step": 368
},
{
"epoch": 0.15458734813573524,
"grad_norm": 0.8752624988555908,
"learning_rate": 1.9383326351068288e-05,
"loss": 1.4952,
"step": 369
},
{
"epoch": 0.1550062840385421,
"grad_norm": 0.5117892622947693,
"learning_rate": 1.9381650607457062e-05,
"loss": 1.5436,
"step": 370
},
{
"epoch": 0.15542521994134897,
"grad_norm": 0.7978183031082153,
"learning_rate": 1.9379974863845833e-05,
"loss": 1.4284,
"step": 371
},
{
"epoch": 0.15584415584415584,
"grad_norm": 0.5909569263458252,
"learning_rate": 1.9378299120234603e-05,
"loss": 1.5057,
"step": 372
},
{
"epoch": 0.1562630917469627,
"grad_norm": 0.6655667424201965,
"learning_rate": 1.9376623376623377e-05,
"loss": 1.4267,
"step": 373
},
{
"epoch": 0.15668202764976957,
"grad_norm": 0.6063106656074524,
"learning_rate": 1.937494763301215e-05,
"loss": 1.5867,
"step": 374
},
{
"epoch": 0.15710096355257647,
"grad_norm": 0.9726372361183167,
"learning_rate": 1.9373271889400926e-05,
"loss": 1.4887,
"step": 375
},
{
"epoch": 0.15751989945538333,
"grad_norm": 0.711313784122467,
"learning_rate": 1.9371596145789696e-05,
"loss": 1.5464,
"step": 376
},
{
"epoch": 0.1579388353581902,
"grad_norm": 0.6071950197219849,
"learning_rate": 1.9369920402178467e-05,
"loss": 1.4917,
"step": 377
},
{
"epoch": 0.15835777126099707,
"grad_norm": 0.7539801597595215,
"learning_rate": 1.936824465856724e-05,
"loss": 1.4267,
"step": 378
},
{
"epoch": 0.15877670716380393,
"grad_norm": 0.5340871810913086,
"learning_rate": 1.9366568914956015e-05,
"loss": 1.4227,
"step": 379
},
{
"epoch": 0.1591956430666108,
"grad_norm": 0.7538002133369446,
"learning_rate": 1.9364893171344786e-05,
"loss": 1.5565,
"step": 380
},
{
"epoch": 0.1596145789694177,
"grad_norm": 0.6404510736465454,
"learning_rate": 1.9363217427733557e-05,
"loss": 1.5314,
"step": 381
},
{
"epoch": 0.16003351487222456,
"grad_norm": 0.6506287455558777,
"learning_rate": 1.936154168412233e-05,
"loss": 1.5931,
"step": 382
},
{
"epoch": 0.16045245077503142,
"grad_norm": 0.6741127967834473,
"learning_rate": 1.9359865940511105e-05,
"loss": 1.5172,
"step": 383
},
{
"epoch": 0.1608713866778383,
"grad_norm": 0.6476618647575378,
"learning_rate": 1.9358190196899876e-05,
"loss": 1.5038,
"step": 384
},
{
"epoch": 0.16129032258064516,
"grad_norm": 0.7657164931297302,
"learning_rate": 1.935651445328865e-05,
"loss": 1.461,
"step": 385
},
{
"epoch": 0.16170925848345202,
"grad_norm": 0.49177902936935425,
"learning_rate": 1.935483870967742e-05,
"loss": 1.4751,
"step": 386
},
{
"epoch": 0.1621281943862589,
"grad_norm": 1.3233708143234253,
"learning_rate": 1.935316296606619e-05,
"loss": 1.4353,
"step": 387
},
{
"epoch": 0.16254713028906578,
"grad_norm": 0.9667218923568726,
"learning_rate": 1.9351487222454966e-05,
"loss": 1.4472,
"step": 388
},
{
"epoch": 0.16296606619187265,
"grad_norm": 0.6916482448577881,
"learning_rate": 1.934981147884374e-05,
"loss": 1.5265,
"step": 389
},
{
"epoch": 0.16338500209467952,
"grad_norm": 0.870297372341156,
"learning_rate": 1.934813573523251e-05,
"loss": 1.5272,
"step": 390
},
{
"epoch": 0.16380393799748638,
"grad_norm": 0.48826926946640015,
"learning_rate": 1.934645999162128e-05,
"loss": 1.4762,
"step": 391
},
{
"epoch": 0.16422287390029325,
"grad_norm": 0.8552863597869873,
"learning_rate": 1.9344784248010055e-05,
"loss": 1.4508,
"step": 392
},
{
"epoch": 0.16464180980310011,
"grad_norm": 0.626533567905426,
"learning_rate": 1.934310850439883e-05,
"loss": 1.4639,
"step": 393
},
{
"epoch": 0.165060745705907,
"grad_norm": 0.6926416158676147,
"learning_rate": 1.93414327607876e-05,
"loss": 1.5507,
"step": 394
},
{
"epoch": 0.16547968160871387,
"grad_norm": 0.613665759563446,
"learning_rate": 1.933975701717637e-05,
"loss": 1.5496,
"step": 395
},
{
"epoch": 0.16589861751152074,
"grad_norm": 0.7060792446136475,
"learning_rate": 1.9338081273565145e-05,
"loss": 1.4823,
"step": 396
},
{
"epoch": 0.1663175534143276,
"grad_norm": 0.6209156513214111,
"learning_rate": 1.933640552995392e-05,
"loss": 1.5661,
"step": 397
},
{
"epoch": 0.16673648931713447,
"grad_norm": 0.7241356372833252,
"learning_rate": 1.9334729786342693e-05,
"loss": 1.5442,
"step": 398
},
{
"epoch": 0.16715542521994134,
"grad_norm": 0.5998069643974304,
"learning_rate": 1.9333054042731464e-05,
"loss": 1.5247,
"step": 399
},
{
"epoch": 0.16757436112274823,
"grad_norm": 0.5730302929878235,
"learning_rate": 1.9331378299120235e-05,
"loss": 1.5973,
"step": 400
},
{
"epoch": 0.1679932970255551,
"grad_norm": 0.649454653263092,
"learning_rate": 1.932970255550901e-05,
"loss": 1.5839,
"step": 401
},
{
"epoch": 0.16841223292836197,
"grad_norm": 0.5908178687095642,
"learning_rate": 1.9328026811897783e-05,
"loss": 1.4055,
"step": 402
},
{
"epoch": 0.16883116883116883,
"grad_norm": 0.4675125181674957,
"learning_rate": 1.9326351068286554e-05,
"loss": 1.5132,
"step": 403
},
{
"epoch": 0.1692501047339757,
"grad_norm": 0.738477885723114,
"learning_rate": 1.9324675324675325e-05,
"loss": 1.3856,
"step": 404
},
{
"epoch": 0.16966904063678256,
"grad_norm": 0.5623185634613037,
"learning_rate": 1.93229995810641e-05,
"loss": 1.5222,
"step": 405
},
{
"epoch": 0.17008797653958943,
"grad_norm": 0.6933448314666748,
"learning_rate": 1.9321323837452873e-05,
"loss": 1.6141,
"step": 406
},
{
"epoch": 0.17050691244239632,
"grad_norm": 0.6374297142028809,
"learning_rate": 1.9319648093841644e-05,
"loss": 1.5482,
"step": 407
},
{
"epoch": 0.1709258483452032,
"grad_norm": 0.692150354385376,
"learning_rate": 1.9317972350230418e-05,
"loss": 1.4039,
"step": 408
},
{
"epoch": 0.17134478424801006,
"grad_norm": 0.5226042866706848,
"learning_rate": 1.931629660661919e-05,
"loss": 1.4807,
"step": 409
},
{
"epoch": 0.17176372015081692,
"grad_norm": 0.602898895740509,
"learning_rate": 1.9314620863007963e-05,
"loss": 1.4713,
"step": 410
},
{
"epoch": 0.1721826560536238,
"grad_norm": 0.6459433436393738,
"learning_rate": 1.9312945119396733e-05,
"loss": 1.4061,
"step": 411
},
{
"epoch": 0.17260159195643066,
"grad_norm": 0.6216537356376648,
"learning_rate": 1.9311269375785507e-05,
"loss": 1.4756,
"step": 412
},
{
"epoch": 0.17302052785923755,
"grad_norm": 0.5865094065666199,
"learning_rate": 1.9309593632174278e-05,
"loss": 1.5048,
"step": 413
},
{
"epoch": 0.17343946376204442,
"grad_norm": 0.689116895198822,
"learning_rate": 1.9307917888563052e-05,
"loss": 1.5203,
"step": 414
},
{
"epoch": 0.17385839966485128,
"grad_norm": 0.5802826285362244,
"learning_rate": 1.9306242144951823e-05,
"loss": 1.4162,
"step": 415
},
{
"epoch": 0.17427733556765815,
"grad_norm": 0.6490867733955383,
"learning_rate": 1.9304566401340597e-05,
"loss": 1.4837,
"step": 416
},
{
"epoch": 0.17469627147046501,
"grad_norm": 0.538263738155365,
"learning_rate": 1.9302890657729368e-05,
"loss": 1.4829,
"step": 417
},
{
"epoch": 0.17511520737327188,
"grad_norm": 0.5698223114013672,
"learning_rate": 1.9301214914118142e-05,
"loss": 1.5414,
"step": 418
},
{
"epoch": 0.17553414327607875,
"grad_norm": 0.5576086640357971,
"learning_rate": 1.9299539170506913e-05,
"loss": 1.5554,
"step": 419
},
{
"epoch": 0.17595307917888564,
"grad_norm": 0.5333095192909241,
"learning_rate": 1.9297863426895687e-05,
"loss": 1.5389,
"step": 420
},
{
"epoch": 0.1763720150816925,
"grad_norm": 0.6559884548187256,
"learning_rate": 1.929618768328446e-05,
"loss": 1.4234,
"step": 421
},
{
"epoch": 0.17679095098449937,
"grad_norm": 0.5460498332977295,
"learning_rate": 1.9294511939673232e-05,
"loss": 1.39,
"step": 422
},
{
"epoch": 0.17720988688730624,
"grad_norm": 0.636465311050415,
"learning_rate": 1.9292836196062003e-05,
"loss": 1.5061,
"step": 423
},
{
"epoch": 0.1776288227901131,
"grad_norm": 1.0990898609161377,
"learning_rate": 1.9291160452450777e-05,
"loss": 1.5252,
"step": 424
},
{
"epoch": 0.17804775869291997,
"grad_norm": 0.7239777445793152,
"learning_rate": 1.928948470883955e-05,
"loss": 1.6089,
"step": 425
},
{
"epoch": 0.17846669459572687,
"grad_norm": 0.5494760870933533,
"learning_rate": 1.928780896522832e-05,
"loss": 1.5254,
"step": 426
},
{
"epoch": 0.17888563049853373,
"grad_norm": 0.8444346189498901,
"learning_rate": 1.9286133221617092e-05,
"loss": 1.5971,
"step": 427
},
{
"epoch": 0.1793045664013406,
"grad_norm": 0.6326781511306763,
"learning_rate": 1.9284457478005866e-05,
"loss": 1.6235,
"step": 428
},
{
"epoch": 0.17972350230414746,
"grad_norm": 1.274570107460022,
"learning_rate": 1.928278173439464e-05,
"loss": 1.3773,
"step": 429
},
{
"epoch": 0.18014243820695433,
"grad_norm": 0.8181143999099731,
"learning_rate": 1.928110599078341e-05,
"loss": 1.5252,
"step": 430
},
{
"epoch": 0.1805613741097612,
"grad_norm": 1.0652360916137695,
"learning_rate": 1.9279430247172185e-05,
"loss": 1.4825,
"step": 431
},
{
"epoch": 0.1809803100125681,
"grad_norm": 0.8766518831253052,
"learning_rate": 1.9277754503560956e-05,
"loss": 1.5517,
"step": 432
},
{
"epoch": 0.18139924591537496,
"grad_norm": 0.730766236782074,
"learning_rate": 1.927607875994973e-05,
"loss": 1.4122,
"step": 433
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.8081750869750977,
"learning_rate": 1.92744030163385e-05,
"loss": 1.4633,
"step": 434
},
{
"epoch": 0.1822371177209887,
"grad_norm": 0.5756118297576904,
"learning_rate": 1.9272727272727275e-05,
"loss": 1.5168,
"step": 435
},
{
"epoch": 0.18265605362379556,
"grad_norm": 0.7646397948265076,
"learning_rate": 1.9271051529116046e-05,
"loss": 1.3793,
"step": 436
},
{
"epoch": 0.18307498952660242,
"grad_norm": 0.5722802877426147,
"learning_rate": 1.926937578550482e-05,
"loss": 1.536,
"step": 437
},
{
"epoch": 0.1834939254294093,
"grad_norm": 0.6689785718917847,
"learning_rate": 1.926770004189359e-05,
"loss": 1.5591,
"step": 438
},
{
"epoch": 0.18391286133221618,
"grad_norm": 0.6856290102005005,
"learning_rate": 1.9266024298282365e-05,
"loss": 1.4544,
"step": 439
},
{
"epoch": 0.18433179723502305,
"grad_norm": 0.6789899468421936,
"learning_rate": 1.9264348554671136e-05,
"loss": 1.4653,
"step": 440
},
{
"epoch": 0.18475073313782991,
"grad_norm": 0.7284293174743652,
"learning_rate": 1.926267281105991e-05,
"loss": 1.5527,
"step": 441
},
{
"epoch": 0.18516966904063678,
"grad_norm": 0.7645947337150574,
"learning_rate": 1.926099706744868e-05,
"loss": 1.5051,
"step": 442
},
{
"epoch": 0.18558860494344365,
"grad_norm": 0.719478964805603,
"learning_rate": 1.9259321323837455e-05,
"loss": 1.5189,
"step": 443
},
{
"epoch": 0.1860075408462505,
"grad_norm": 0.695095419883728,
"learning_rate": 1.925764558022623e-05,
"loss": 1.5547,
"step": 444
},
{
"epoch": 0.1864264767490574,
"grad_norm": 0.8511472344398499,
"learning_rate": 1.9255969836615e-05,
"loss": 1.5407,
"step": 445
},
{
"epoch": 0.18684541265186427,
"grad_norm": 0.8481591939926147,
"learning_rate": 1.925429409300377e-05,
"loss": 1.4753,
"step": 446
},
{
"epoch": 0.18726434855467114,
"grad_norm": 0.5671373009681702,
"learning_rate": 1.9252618349392544e-05,
"loss": 1.6334,
"step": 447
},
{
"epoch": 0.187683284457478,
"grad_norm": 0.9575358629226685,
"learning_rate": 1.925094260578132e-05,
"loss": 1.4334,
"step": 448
},
{
"epoch": 0.18810222036028487,
"grad_norm": 0.7267637252807617,
"learning_rate": 1.924926686217009e-05,
"loss": 1.369,
"step": 449
},
{
"epoch": 0.18852115626309174,
"grad_norm": 0.9400660991668701,
"learning_rate": 1.924759111855886e-05,
"loss": 1.4424,
"step": 450
},
{
"epoch": 0.1889400921658986,
"grad_norm": 0.6154212951660156,
"learning_rate": 1.9245915374947634e-05,
"loss": 1.5379,
"step": 451
},
{
"epoch": 0.1893590280687055,
"grad_norm": 0.992761492729187,
"learning_rate": 1.9244239631336408e-05,
"loss": 1.4291,
"step": 452
},
{
"epoch": 0.18977796397151236,
"grad_norm": 0.8219314217567444,
"learning_rate": 1.924256388772518e-05,
"loss": 1.4851,
"step": 453
},
{
"epoch": 0.19019689987431923,
"grad_norm": 0.6872371435165405,
"learning_rate": 1.9240888144113953e-05,
"loss": 1.4687,
"step": 454
},
{
"epoch": 0.1906158357771261,
"grad_norm": 0.7827102541923523,
"learning_rate": 1.9239212400502724e-05,
"loss": 1.4223,
"step": 455
},
{
"epoch": 0.19103477167993296,
"grad_norm": 0.6476536989212036,
"learning_rate": 1.9237536656891498e-05,
"loss": 1.4416,
"step": 456
},
{
"epoch": 0.19145370758273983,
"grad_norm": 0.8523492813110352,
"learning_rate": 1.923586091328027e-05,
"loss": 1.4081,
"step": 457
},
{
"epoch": 0.19187264348554672,
"grad_norm": 0.7393168807029724,
"learning_rate": 1.9234185169669043e-05,
"loss": 1.4165,
"step": 458
},
{
"epoch": 0.1922915793883536,
"grad_norm": 0.5783189535140991,
"learning_rate": 1.9232509426057814e-05,
"loss": 1.5124,
"step": 459
},
{
"epoch": 0.19271051529116046,
"grad_norm": 0.5648137927055359,
"learning_rate": 1.9230833682446588e-05,
"loss": 1.4118,
"step": 460
},
{
"epoch": 0.19312945119396732,
"grad_norm": 0.6089081168174744,
"learning_rate": 1.922915793883536e-05,
"loss": 1.5284,
"step": 461
},
{
"epoch": 0.1935483870967742,
"grad_norm": 0.4999605417251587,
"learning_rate": 1.9227482195224133e-05,
"loss": 1.5311,
"step": 462
},
{
"epoch": 0.19396732299958105,
"grad_norm": 0.5938348174095154,
"learning_rate": 1.9225806451612907e-05,
"loss": 1.4788,
"step": 463
},
{
"epoch": 0.19438625890238795,
"grad_norm": 0.547335684299469,
"learning_rate": 1.9224130708001677e-05,
"loss": 1.4408,
"step": 464
},
{
"epoch": 0.19480519480519481,
"grad_norm": 0.6379075050354004,
"learning_rate": 1.9222454964390448e-05,
"loss": 1.5705,
"step": 465
},
{
"epoch": 0.19522413070800168,
"grad_norm": 0.5942078828811646,
"learning_rate": 1.9220779220779222e-05,
"loss": 1.4643,
"step": 466
},
{
"epoch": 0.19564306661080855,
"grad_norm": 0.5671261548995972,
"learning_rate": 1.9219103477167996e-05,
"loss": 1.5048,
"step": 467
},
{
"epoch": 0.1960620025136154,
"grad_norm": 0.5360516905784607,
"learning_rate": 1.9217427733556767e-05,
"loss": 1.4651,
"step": 468
},
{
"epoch": 0.19648093841642228,
"grad_norm": 0.6140695810317993,
"learning_rate": 1.9215751989945538e-05,
"loss": 1.4735,
"step": 469
},
{
"epoch": 0.19689987431922915,
"grad_norm": 0.573509931564331,
"learning_rate": 1.9214076246334312e-05,
"loss": 1.459,
"step": 470
},
{
"epoch": 0.19731881022203604,
"grad_norm": 0.6246591210365295,
"learning_rate": 1.9212400502723086e-05,
"loss": 1.4701,
"step": 471
},
{
"epoch": 0.1977377461248429,
"grad_norm": 0.5583751797676086,
"learning_rate": 1.9210724759111857e-05,
"loss": 1.6073,
"step": 472
},
{
"epoch": 0.19815668202764977,
"grad_norm": 0.7271811962127686,
"learning_rate": 1.9209049015500628e-05,
"loss": 1.4531,
"step": 473
},
{
"epoch": 0.19857561793045664,
"grad_norm": 0.7514092922210693,
"learning_rate": 1.9207373271889402e-05,
"loss": 1.4609,
"step": 474
},
{
"epoch": 0.1989945538332635,
"grad_norm": 0.6135318875312805,
"learning_rate": 1.9205697528278176e-05,
"loss": 1.46,
"step": 475
},
{
"epoch": 0.19941348973607037,
"grad_norm": 0.8104654550552368,
"learning_rate": 1.920402178466695e-05,
"loss": 1.3627,
"step": 476
},
{
"epoch": 0.19983242563887726,
"grad_norm": 0.6633170247077942,
"learning_rate": 1.920234604105572e-05,
"loss": 1.52,
"step": 477
},
{
"epoch": 0.20025136154168413,
"grad_norm": 0.7291746735572815,
"learning_rate": 1.920067029744449e-05,
"loss": 1.547,
"step": 478
},
{
"epoch": 0.200670297444491,
"grad_norm": 0.6674147248268127,
"learning_rate": 1.9198994553833266e-05,
"loss": 1.4795,
"step": 479
},
{
"epoch": 0.20108923334729786,
"grad_norm": 0.9536328911781311,
"learning_rate": 1.919731881022204e-05,
"loss": 1.4581,
"step": 480
},
{
"epoch": 0.20150816925010473,
"grad_norm": 0.8882992267608643,
"learning_rate": 1.919564306661081e-05,
"loss": 1.4351,
"step": 481
},
{
"epoch": 0.2019271051529116,
"grad_norm": 0.5723230838775635,
"learning_rate": 1.919396732299958e-05,
"loss": 1.5095,
"step": 482
},
{
"epoch": 0.20234604105571846,
"grad_norm": 1.1361421346664429,
"learning_rate": 1.9192291579388355e-05,
"loss": 1.4452,
"step": 483
},
{
"epoch": 0.20276497695852536,
"grad_norm": 0.5762605667114258,
"learning_rate": 1.9190615835777126e-05,
"loss": 1.5695,
"step": 484
},
{
"epoch": 0.20318391286133222,
"grad_norm": 0.9020100831985474,
"learning_rate": 1.91889400921659e-05,
"loss": 1.5157,
"step": 485
},
{
"epoch": 0.2036028487641391,
"grad_norm": 0.6302100419998169,
"learning_rate": 1.9187264348554674e-05,
"loss": 1.5346,
"step": 486
},
{
"epoch": 0.20402178466694595,
"grad_norm": 1.1093989610671997,
"learning_rate": 1.9185588604943445e-05,
"loss": 1.5348,
"step": 487
},
{
"epoch": 0.20444072056975282,
"grad_norm": 0.637134850025177,
"learning_rate": 1.9183912861332216e-05,
"loss": 1.4478,
"step": 488
},
{
"epoch": 0.2048596564725597,
"grad_norm": 0.9395270943641663,
"learning_rate": 1.918223711772099e-05,
"loss": 1.4659,
"step": 489
},
{
"epoch": 0.20527859237536658,
"grad_norm": 0.7030125260353088,
"learning_rate": 1.9180561374109764e-05,
"loss": 1.3621,
"step": 490
},
{
"epoch": 0.20569752827817345,
"grad_norm": 0.5922579169273376,
"learning_rate": 1.9178885630498535e-05,
"loss": 1.644,
"step": 491
},
{
"epoch": 0.2061164641809803,
"grad_norm": 1.0161982774734497,
"learning_rate": 1.9177209886887306e-05,
"loss": 1.3895,
"step": 492
},
{
"epoch": 0.20653540008378718,
"grad_norm": 1.179316759109497,
"learning_rate": 1.917553414327608e-05,
"loss": 1.4576,
"step": 493
},
{
"epoch": 0.20695433598659405,
"grad_norm": 0.8130103945732117,
"learning_rate": 1.9173858399664854e-05,
"loss": 1.4637,
"step": 494
},
{
"epoch": 0.2073732718894009,
"grad_norm": 0.7000212669372559,
"learning_rate": 1.9172182656053625e-05,
"loss": 1.5152,
"step": 495
},
{
"epoch": 0.2077922077922078,
"grad_norm": 0.8932341933250427,
"learning_rate": 1.91705069124424e-05,
"loss": 1.4741,
"step": 496
},
{
"epoch": 0.20821114369501467,
"grad_norm": 1.3901516199111938,
"learning_rate": 1.916883116883117e-05,
"loss": 1.4953,
"step": 497
},
{
"epoch": 0.20863007959782154,
"grad_norm": 1.0053791999816895,
"learning_rate": 1.9167155425219944e-05,
"loss": 1.488,
"step": 498
},
{
"epoch": 0.2090490155006284,
"grad_norm": 0.5231297612190247,
"learning_rate": 1.9165479681608718e-05,
"loss": 1.4963,
"step": 499
},
{
"epoch": 0.20946795140343527,
"grad_norm": 0.8381982445716858,
"learning_rate": 1.916380393799749e-05,
"loss": 1.5261,
"step": 500
},
{
"epoch": 0.20988688730624214,
"grad_norm": 0.9278205633163452,
"learning_rate": 1.916212819438626e-05,
"loss": 1.3259,
"step": 501
},
{
"epoch": 0.210305823209049,
"grad_norm": 0.9388899803161621,
"learning_rate": 1.9160452450775033e-05,
"loss": 1.4091,
"step": 502
},
{
"epoch": 0.2107247591118559,
"grad_norm": 1.0538358688354492,
"learning_rate": 1.9158776707163807e-05,
"loss": 1.4633,
"step": 503
},
{
"epoch": 0.21114369501466276,
"grad_norm": 0.6670867800712585,
"learning_rate": 1.9157100963552578e-05,
"loss": 1.5639,
"step": 504
},
{
"epoch": 0.21156263091746963,
"grad_norm": 1.7608953714370728,
"learning_rate": 1.915542521994135e-05,
"loss": 1.4433,
"step": 505
},
{
"epoch": 0.2119815668202765,
"grad_norm": 1.1406441926956177,
"learning_rate": 1.9153749476330123e-05,
"loss": 1.3938,
"step": 506
},
{
"epoch": 0.21240050272308336,
"grad_norm": 1.0751662254333496,
"learning_rate": 1.9152073732718897e-05,
"loss": 1.5059,
"step": 507
},
{
"epoch": 0.21281943862589023,
"grad_norm": 1.7127149105072021,
"learning_rate": 1.9150397989107668e-05,
"loss": 1.4072,
"step": 508
},
{
"epoch": 0.21323837452869712,
"grad_norm": 0.7607043385505676,
"learning_rate": 1.9148722245496442e-05,
"loss": 1.43,
"step": 509
},
{
"epoch": 0.213657310431504,
"grad_norm": 1.0876507759094238,
"learning_rate": 1.9147046501885213e-05,
"loss": 1.4219,
"step": 510
},
{
"epoch": 0.21407624633431085,
"grad_norm": 1.704974889755249,
"learning_rate": 1.9145370758273987e-05,
"loss": 1.5058,
"step": 511
},
{
"epoch": 0.21449518223711772,
"grad_norm": 1.3107315301895142,
"learning_rate": 1.9143695014662758e-05,
"loss": 1.3701,
"step": 512
},
{
"epoch": 0.2149141181399246,
"grad_norm": 0.72137850522995,
"learning_rate": 1.9142019271051532e-05,
"loss": 1.4608,
"step": 513
},
{
"epoch": 0.21533305404273145,
"grad_norm": 0.9491326808929443,
"learning_rate": 1.9140343527440303e-05,
"loss": 1.3998,
"step": 514
},
{
"epoch": 0.21575198994553832,
"grad_norm": 0.9301955103874207,
"learning_rate": 1.9138667783829073e-05,
"loss": 1.5215,
"step": 515
},
{
"epoch": 0.2161709258483452,
"grad_norm": 0.8181395530700684,
"learning_rate": 1.9136992040217847e-05,
"loss": 1.4322,
"step": 516
},
{
"epoch": 0.21658986175115208,
"grad_norm": 0.7477272748947144,
"learning_rate": 1.913531629660662e-05,
"loss": 1.4069,
"step": 517
},
{
"epoch": 0.21700879765395895,
"grad_norm": 0.8078567385673523,
"learning_rate": 1.9133640552995392e-05,
"loss": 1.4061,
"step": 518
},
{
"epoch": 0.2174277335567658,
"grad_norm": 0.6401208639144897,
"learning_rate": 1.9131964809384166e-05,
"loss": 1.4146,
"step": 519
},
{
"epoch": 0.21784666945957268,
"grad_norm": 1.1645647287368774,
"learning_rate": 1.9130289065772937e-05,
"loss": 1.4205,
"step": 520
},
{
"epoch": 0.21826560536237954,
"grad_norm": 0.5319536328315735,
"learning_rate": 1.912861332216171e-05,
"loss": 1.4255,
"step": 521
},
{
"epoch": 0.21868454126518644,
"grad_norm": 0.7974638938903809,
"learning_rate": 1.9126937578550485e-05,
"loss": 1.4991,
"step": 522
},
{
"epoch": 0.2191034771679933,
"grad_norm": 0.7385554313659668,
"learning_rate": 1.9125261834939256e-05,
"loss": 1.4917,
"step": 523
},
{
"epoch": 0.21952241307080017,
"grad_norm": 0.6328215003013611,
"learning_rate": 1.9123586091328027e-05,
"loss": 1.4466,
"step": 524
},
{
"epoch": 0.21994134897360704,
"grad_norm": 0.9712186455726624,
"learning_rate": 1.91219103477168e-05,
"loss": 1.3888,
"step": 525
},
{
"epoch": 0.2203602848764139,
"grad_norm": 0.8162040710449219,
"learning_rate": 1.9120234604105575e-05,
"loss": 1.4592,
"step": 526
},
{
"epoch": 0.22077922077922077,
"grad_norm": 0.6039196252822876,
"learning_rate": 1.9118558860494346e-05,
"loss": 1.4641,
"step": 527
},
{
"epoch": 0.22119815668202766,
"grad_norm": 0.677533745765686,
"learning_rate": 1.9116883116883117e-05,
"loss": 1.4318,
"step": 528
},
{
"epoch": 0.22161709258483453,
"grad_norm": 0.5307941436767578,
"learning_rate": 1.911520737327189e-05,
"loss": 1.4567,
"step": 529
},
{
"epoch": 0.2220360284876414,
"grad_norm": 0.6290689706802368,
"learning_rate": 1.9113531629660665e-05,
"loss": 1.5327,
"step": 530
},
{
"epoch": 0.22245496439044826,
"grad_norm": 0.668403685092926,
"learning_rate": 1.9111855886049436e-05,
"loss": 1.5186,
"step": 531
},
{
"epoch": 0.22287390029325513,
"grad_norm": 0.8590381145477295,
"learning_rate": 1.911018014243821e-05,
"loss": 1.4103,
"step": 532
},
{
"epoch": 0.223292836196062,
"grad_norm": 0.8472411036491394,
"learning_rate": 1.910850439882698e-05,
"loss": 1.4171,
"step": 533
},
{
"epoch": 0.22371177209886886,
"grad_norm": 0.8556522130966187,
"learning_rate": 1.9106828655215755e-05,
"loss": 1.4334,
"step": 534
},
{
"epoch": 0.22413070800167575,
"grad_norm": 0.6294506192207336,
"learning_rate": 1.9105152911604525e-05,
"loss": 1.4425,
"step": 535
},
{
"epoch": 0.22454964390448262,
"grad_norm": 0.8159162402153015,
"learning_rate": 1.91034771679933e-05,
"loss": 1.5275,
"step": 536
},
{
"epoch": 0.2249685798072895,
"grad_norm": 0.7663982510566711,
"learning_rate": 1.910180142438207e-05,
"loss": 1.4252,
"step": 537
},
{
"epoch": 0.22538751571009635,
"grad_norm": 0.6042879223823547,
"learning_rate": 1.9100125680770844e-05,
"loss": 1.4374,
"step": 538
},
{
"epoch": 0.22580645161290322,
"grad_norm": 0.5414412617683411,
"learning_rate": 1.9098449937159615e-05,
"loss": 1.4921,
"step": 539
},
{
"epoch": 0.22622538751571009,
"grad_norm": 0.7135291695594788,
"learning_rate": 1.909677419354839e-05,
"loss": 1.5495,
"step": 540
},
{
"epoch": 0.22664432341851698,
"grad_norm": 0.5732749104499817,
"learning_rate": 1.9095098449937163e-05,
"loss": 1.3577,
"step": 541
},
{
"epoch": 0.22706325932132385,
"grad_norm": 0.6375070810317993,
"learning_rate": 1.9093422706325934e-05,
"loss": 1.4254,
"step": 542
},
{
"epoch": 0.2274821952241307,
"grad_norm": 0.6467480063438416,
"learning_rate": 1.9091746962714705e-05,
"loss": 1.4277,
"step": 543
},
{
"epoch": 0.22790113112693758,
"grad_norm": 0.6565422415733337,
"learning_rate": 1.909007121910348e-05,
"loss": 1.55,
"step": 544
},
{
"epoch": 0.22832006702974444,
"grad_norm": 0.6464892625808716,
"learning_rate": 1.9088395475492253e-05,
"loss": 1.4626,
"step": 545
},
{
"epoch": 0.2287390029325513,
"grad_norm": 0.6357992887496948,
"learning_rate": 1.9086719731881024e-05,
"loss": 1.4343,
"step": 546
},
{
"epoch": 0.22915793883535818,
"grad_norm": 0.6153067946434021,
"learning_rate": 1.9085043988269795e-05,
"loss": 1.4146,
"step": 547
},
{
"epoch": 0.22957687473816507,
"grad_norm": 0.7078065276145935,
"learning_rate": 1.908336824465857e-05,
"loss": 1.4147,
"step": 548
},
{
"epoch": 0.22999581064097194,
"grad_norm": 0.6610170602798462,
"learning_rate": 1.9081692501047343e-05,
"loss": 1.5137,
"step": 549
},
{
"epoch": 0.2304147465437788,
"grad_norm": 0.6916443705558777,
"learning_rate": 1.9080016757436114e-05,
"loss": 1.4726,
"step": 550
},
{
"epoch": 0.23083368244658567,
"grad_norm": 0.668258547782898,
"learning_rate": 1.9078341013824884e-05,
"loss": 1.4796,
"step": 551
},
{
"epoch": 0.23125261834939254,
"grad_norm": 0.6234722137451172,
"learning_rate": 1.907666527021366e-05,
"loss": 1.4962,
"step": 552
},
{
"epoch": 0.2316715542521994,
"grad_norm": 0.7003756761550903,
"learning_rate": 1.9074989526602433e-05,
"loss": 1.4681,
"step": 553
},
{
"epoch": 0.2320904901550063,
"grad_norm": 0.6396431922912598,
"learning_rate": 1.9073313782991203e-05,
"loss": 1.3989,
"step": 554
},
{
"epoch": 0.23250942605781316,
"grad_norm": 0.6359259486198425,
"learning_rate": 1.9071638039379977e-05,
"loss": 1.4533,
"step": 555
},
{
"epoch": 0.23292836196062003,
"grad_norm": 0.6927059292793274,
"learning_rate": 1.9069962295768748e-05,
"loss": 1.4107,
"step": 556
},
{
"epoch": 0.2333472978634269,
"grad_norm": 0.633521318435669,
"learning_rate": 1.9068286552157522e-05,
"loss": 1.4099,
"step": 557
},
{
"epoch": 0.23376623376623376,
"grad_norm": 0.6133984923362732,
"learning_rate": 1.9066610808546293e-05,
"loss": 1.425,
"step": 558
},
{
"epoch": 0.23418516966904063,
"grad_norm": 0.661888062953949,
"learning_rate": 1.9064935064935067e-05,
"loss": 1.5631,
"step": 559
},
{
"epoch": 0.23460410557184752,
"grad_norm": 0.645517110824585,
"learning_rate": 1.9063259321323838e-05,
"loss": 1.4026,
"step": 560
},
{
"epoch": 0.2350230414746544,
"grad_norm": 0.5653700828552246,
"learning_rate": 1.9061583577712612e-05,
"loss": 1.413,
"step": 561
},
{
"epoch": 0.23544197737746125,
"grad_norm": 0.7444579601287842,
"learning_rate": 1.9059907834101383e-05,
"loss": 1.4424,
"step": 562
},
{
"epoch": 0.23586091328026812,
"grad_norm": 0.618651270866394,
"learning_rate": 1.9058232090490157e-05,
"loss": 1.6035,
"step": 563
},
{
"epoch": 0.23627984918307499,
"grad_norm": 0.6566857695579529,
"learning_rate": 1.905655634687893e-05,
"loss": 1.3807,
"step": 564
},
{
"epoch": 0.23669878508588185,
"grad_norm": 0.5518476963043213,
"learning_rate": 1.9054880603267702e-05,
"loss": 1.5754,
"step": 565
},
{
"epoch": 0.23711772098868872,
"grad_norm": 0.5864123106002808,
"learning_rate": 1.9053204859656472e-05,
"loss": 1.4076,
"step": 566
},
{
"epoch": 0.2375366568914956,
"grad_norm": 0.7522138953208923,
"learning_rate": 1.9051529116045247e-05,
"loss": 1.439,
"step": 567
},
{
"epoch": 0.23795559279430248,
"grad_norm": 0.616507351398468,
"learning_rate": 1.904985337243402e-05,
"loss": 1.463,
"step": 568
},
{
"epoch": 0.23837452869710934,
"grad_norm": 0.5815699100494385,
"learning_rate": 1.904817762882279e-05,
"loss": 1.4541,
"step": 569
},
{
"epoch": 0.2387934645999162,
"grad_norm": 0.6003520488739014,
"learning_rate": 1.9046501885211562e-05,
"loss": 1.4897,
"step": 570
},
{
"epoch": 0.23921240050272308,
"grad_norm": 0.6311242580413818,
"learning_rate": 1.9044826141600336e-05,
"loss": 1.4004,
"step": 571
},
{
"epoch": 0.23963133640552994,
"grad_norm": 0.607297956943512,
"learning_rate": 1.904315039798911e-05,
"loss": 1.5698,
"step": 572
},
{
"epoch": 0.24005027230833684,
"grad_norm": 0.7055544257164001,
"learning_rate": 1.904147465437788e-05,
"loss": 1.4389,
"step": 573
},
{
"epoch": 0.2404692082111437,
"grad_norm": 0.7283361554145813,
"learning_rate": 1.9039798910766655e-05,
"loss": 1.4967,
"step": 574
},
{
"epoch": 0.24088814411395057,
"grad_norm": 0.688834547996521,
"learning_rate": 1.9038123167155426e-05,
"loss": 1.4883,
"step": 575
},
{
"epoch": 0.24130708001675744,
"grad_norm": 0.47785529494285583,
"learning_rate": 1.90364474235442e-05,
"loss": 1.5188,
"step": 576
},
{
"epoch": 0.2417260159195643,
"grad_norm": 0.9133197069168091,
"learning_rate": 1.903477167993297e-05,
"loss": 1.3748,
"step": 577
},
{
"epoch": 0.24214495182237117,
"grad_norm": 0.6180113554000854,
"learning_rate": 1.9033095936321745e-05,
"loss": 1.4213,
"step": 578
},
{
"epoch": 0.24256388772517803,
"grad_norm": 0.6890195608139038,
"learning_rate": 1.9031420192710516e-05,
"loss": 1.3621,
"step": 579
},
{
"epoch": 0.24298282362798493,
"grad_norm": 0.6566532850265503,
"learning_rate": 1.902974444909929e-05,
"loss": 1.3666,
"step": 580
},
{
"epoch": 0.2434017595307918,
"grad_norm": 0.572672963142395,
"learning_rate": 1.902806870548806e-05,
"loss": 1.3654,
"step": 581
},
{
"epoch": 0.24382069543359866,
"grad_norm": 0.698391318321228,
"learning_rate": 1.9026392961876835e-05,
"loss": 1.4244,
"step": 582
},
{
"epoch": 0.24423963133640553,
"grad_norm": 0.5516411066055298,
"learning_rate": 1.9024717218265606e-05,
"loss": 1.5141,
"step": 583
},
{
"epoch": 0.2446585672392124,
"grad_norm": 0.6122570037841797,
"learning_rate": 1.902304147465438e-05,
"loss": 1.4333,
"step": 584
},
{
"epoch": 0.24507750314201926,
"grad_norm": 0.7202199697494507,
"learning_rate": 1.902136573104315e-05,
"loss": 1.4562,
"step": 585
},
{
"epoch": 0.24549643904482615,
"grad_norm": 0.938453733921051,
"learning_rate": 1.9019689987431925e-05,
"loss": 1.4605,
"step": 586
},
{
"epoch": 0.24591537494763302,
"grad_norm": 0.5673055648803711,
"learning_rate": 1.90180142438207e-05,
"loss": 1.4827,
"step": 587
},
{
"epoch": 0.24633431085043989,
"grad_norm": 0.677293062210083,
"learning_rate": 1.901633850020947e-05,
"loss": 1.375,
"step": 588
},
{
"epoch": 0.24675324675324675,
"grad_norm": 0.7138928771018982,
"learning_rate": 1.901466275659824e-05,
"loss": 1.541,
"step": 589
},
{
"epoch": 0.24717218265605362,
"grad_norm": 0.7003957033157349,
"learning_rate": 1.9012987012987014e-05,
"loss": 1.4234,
"step": 590
},
{
"epoch": 0.24759111855886048,
"grad_norm": 0.8985845446586609,
"learning_rate": 1.901131126937579e-05,
"loss": 1.2981,
"step": 591
},
{
"epoch": 0.24801005446166738,
"grad_norm": 0.6559215784072876,
"learning_rate": 1.900963552576456e-05,
"loss": 1.4871,
"step": 592
},
{
"epoch": 0.24842899036447424,
"grad_norm": 1.0038827657699585,
"learning_rate": 1.900795978215333e-05,
"loss": 1.3604,
"step": 593
},
{
"epoch": 0.2488479262672811,
"grad_norm": 0.670703649520874,
"learning_rate": 1.9006284038542104e-05,
"loss": 1.4142,
"step": 594
},
{
"epoch": 0.24926686217008798,
"grad_norm": 0.6743998527526855,
"learning_rate": 1.9004608294930878e-05,
"loss": 1.421,
"step": 595
},
{
"epoch": 0.24968579807289484,
"grad_norm": 0.90179044008255,
"learning_rate": 1.900293255131965e-05,
"loss": 1.3545,
"step": 596
},
{
"epoch": 0.2501047339757017,
"grad_norm": 0.7004808187484741,
"learning_rate": 1.9001256807708423e-05,
"loss": 1.4334,
"step": 597
},
{
"epoch": 0.2505236698785086,
"grad_norm": 0.811750590801239,
"learning_rate": 1.8999581064097194e-05,
"loss": 1.5068,
"step": 598
},
{
"epoch": 0.25094260578131544,
"grad_norm": 0.5660247206687927,
"learning_rate": 1.8997905320485968e-05,
"loss": 1.4119,
"step": 599
},
{
"epoch": 0.25136154168412234,
"grad_norm": 0.7421470880508423,
"learning_rate": 1.8996229576874742e-05,
"loss": 1.4193,
"step": 600
},
{
"epoch": 0.2517804775869292,
"grad_norm": 0.7964827418327332,
"learning_rate": 1.8994553833263513e-05,
"loss": 1.3664,
"step": 601
},
{
"epoch": 0.25219941348973607,
"grad_norm": 0.587858259677887,
"learning_rate": 1.8992878089652283e-05,
"loss": 1.4094,
"step": 602
},
{
"epoch": 0.25261834939254296,
"grad_norm": 0.8728364109992981,
"learning_rate": 1.8991202346041058e-05,
"loss": 1.2841,
"step": 603
},
{
"epoch": 0.2530372852953498,
"grad_norm": 0.7561309933662415,
"learning_rate": 1.8989526602429832e-05,
"loss": 1.351,
"step": 604
},
{
"epoch": 0.2534562211981567,
"grad_norm": 0.6205732822418213,
"learning_rate": 1.8987850858818602e-05,
"loss": 1.4043,
"step": 605
},
{
"epoch": 0.25387515710096353,
"grad_norm": 0.7390834093093872,
"learning_rate": 1.8986175115207373e-05,
"loss": 1.4274,
"step": 606
},
{
"epoch": 0.2542940930037704,
"grad_norm": 0.7050930261611938,
"learning_rate": 1.8984499371596147e-05,
"loss": 1.4875,
"step": 607
},
{
"epoch": 0.2547130289065773,
"grad_norm": 0.5341117978096008,
"learning_rate": 1.898282362798492e-05,
"loss": 1.4482,
"step": 608
},
{
"epoch": 0.25513196480938416,
"grad_norm": 0.8297886848449707,
"learning_rate": 1.8981147884373692e-05,
"loss": 1.3514,
"step": 609
},
{
"epoch": 0.25555090071219105,
"grad_norm": 0.6667957901954651,
"learning_rate": 1.8979472140762466e-05,
"loss": 1.3185,
"step": 610
},
{
"epoch": 0.2559698366149979,
"grad_norm": 0.6411688923835754,
"learning_rate": 1.8977796397151237e-05,
"loss": 1.4442,
"step": 611
},
{
"epoch": 0.2563887725178048,
"grad_norm": 0.6379684805870056,
"learning_rate": 1.8976120653540008e-05,
"loss": 1.4377,
"step": 612
},
{
"epoch": 0.2568077084206116,
"grad_norm": 0.6216014623641968,
"learning_rate": 1.8974444909928782e-05,
"loss": 1.4474,
"step": 613
},
{
"epoch": 0.2572266443234185,
"grad_norm": 0.5671195983886719,
"learning_rate": 1.8972769166317556e-05,
"loss": 1.4434,
"step": 614
},
{
"epoch": 0.2576455802262254,
"grad_norm": 0.8159009218215942,
"learning_rate": 1.8971093422706327e-05,
"loss": 1.3781,
"step": 615
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.6750363111495972,
"learning_rate": 1.8969417679095098e-05,
"loss": 1.5244,
"step": 616
},
{
"epoch": 0.25848345203183914,
"grad_norm": 0.5345395803451538,
"learning_rate": 1.896774193548387e-05,
"loss": 1.4944,
"step": 617
},
{
"epoch": 0.258902387934646,
"grad_norm": 1.0422483682632446,
"learning_rate": 1.8966066191872646e-05,
"loss": 1.4257,
"step": 618
},
{
"epoch": 0.2593213238374529,
"grad_norm": 0.6644807457923889,
"learning_rate": 1.896439044826142e-05,
"loss": 1.4396,
"step": 619
},
{
"epoch": 0.2597402597402597,
"grad_norm": 0.6991039514541626,
"learning_rate": 1.896271470465019e-05,
"loss": 1.4456,
"step": 620
},
{
"epoch": 0.2601591956430666,
"grad_norm": 0.6287118792533875,
"learning_rate": 1.896103896103896e-05,
"loss": 1.3556,
"step": 621
},
{
"epoch": 0.2605781315458735,
"grad_norm": 0.7149505615234375,
"learning_rate": 1.8959363217427736e-05,
"loss": 1.3708,
"step": 622
},
{
"epoch": 0.26099706744868034,
"grad_norm": 0.6485461592674255,
"learning_rate": 1.895768747381651e-05,
"loss": 1.3813,
"step": 623
},
{
"epoch": 0.26141600335148724,
"grad_norm": 0.796234130859375,
"learning_rate": 1.895601173020528e-05,
"loss": 1.4557,
"step": 624
},
{
"epoch": 0.2618349392542941,
"grad_norm": 0.6059120297431946,
"learning_rate": 1.895433598659405e-05,
"loss": 1.4393,
"step": 625
},
{
"epoch": 0.26225387515710097,
"grad_norm": 0.7328737378120422,
"learning_rate": 1.8952660242982825e-05,
"loss": 1.4259,
"step": 626
},
{
"epoch": 0.2626728110599078,
"grad_norm": 0.5874695181846619,
"learning_rate": 1.89509844993716e-05,
"loss": 1.51,
"step": 627
},
{
"epoch": 0.2630917469627147,
"grad_norm": 0.7244255542755127,
"learning_rate": 1.894930875576037e-05,
"loss": 1.4277,
"step": 628
},
{
"epoch": 0.2635106828655216,
"grad_norm": 0.5406452417373657,
"learning_rate": 1.894763301214914e-05,
"loss": 1.4015,
"step": 629
},
{
"epoch": 0.26392961876832843,
"grad_norm": 0.7187069058418274,
"learning_rate": 1.8945957268537915e-05,
"loss": 1.417,
"step": 630
},
{
"epoch": 0.2643485546711353,
"grad_norm": 0.6457657217979431,
"learning_rate": 1.894428152492669e-05,
"loss": 1.4711,
"step": 631
},
{
"epoch": 0.26476749057394217,
"grad_norm": 0.5307900309562683,
"learning_rate": 1.894260578131546e-05,
"loss": 1.4237,
"step": 632
},
{
"epoch": 0.26518642647674906,
"grad_norm": 0.9521127343177795,
"learning_rate": 1.8940930037704234e-05,
"loss": 1.3934,
"step": 633
},
{
"epoch": 0.26560536237955595,
"grad_norm": 0.596931517124176,
"learning_rate": 1.8939254294093005e-05,
"loss": 1.4957,
"step": 634
},
{
"epoch": 0.2660242982823628,
"grad_norm": 0.8038269281387329,
"learning_rate": 1.893757855048178e-05,
"loss": 1.4551,
"step": 635
},
{
"epoch": 0.2664432341851697,
"grad_norm": 0.7987629771232605,
"learning_rate": 1.893590280687055e-05,
"loss": 1.4973,
"step": 636
},
{
"epoch": 0.2668621700879765,
"grad_norm": 0.7971741557121277,
"learning_rate": 1.8934227063259324e-05,
"loss": 1.331,
"step": 637
},
{
"epoch": 0.2672811059907834,
"grad_norm": 0.6776058077812195,
"learning_rate": 1.8932551319648094e-05,
"loss": 1.3397,
"step": 638
},
{
"epoch": 0.26770004189359026,
"grad_norm": 0.7527434825897217,
"learning_rate": 1.893087557603687e-05,
"loss": 1.3912,
"step": 639
},
{
"epoch": 0.26811897779639715,
"grad_norm": 0.6726639866828918,
"learning_rate": 1.892919983242564e-05,
"loss": 1.3428,
"step": 640
},
{
"epoch": 0.26853791369920404,
"grad_norm": 0.7143589854240417,
"learning_rate": 1.8927524088814413e-05,
"loss": 1.4311,
"step": 641
},
{
"epoch": 0.2689568496020109,
"grad_norm": 0.6544737815856934,
"learning_rate": 1.8925848345203188e-05,
"loss": 1.4774,
"step": 642
},
{
"epoch": 0.2693757855048178,
"grad_norm": 0.707067608833313,
"learning_rate": 1.892417260159196e-05,
"loss": 1.3982,
"step": 643
},
{
"epoch": 0.2697947214076246,
"grad_norm": 0.7015464901924133,
"learning_rate": 1.892249685798073e-05,
"loss": 1.3921,
"step": 644
},
{
"epoch": 0.2702136573104315,
"grad_norm": 0.7379519939422607,
"learning_rate": 1.8920821114369503e-05,
"loss": 1.4429,
"step": 645
},
{
"epoch": 0.27063259321323835,
"grad_norm": 0.6990654468536377,
"learning_rate": 1.8919145370758277e-05,
"loss": 1.3888,
"step": 646
},
{
"epoch": 0.27105152911604524,
"grad_norm": 0.8418135643005371,
"learning_rate": 1.8917469627147048e-05,
"loss": 1.3989,
"step": 647
},
{
"epoch": 0.27147046501885214,
"grad_norm": 0.7323577404022217,
"learning_rate": 1.891579388353582e-05,
"loss": 1.4053,
"step": 648
},
{
"epoch": 0.271889400921659,
"grad_norm": 0.5955424904823303,
"learning_rate": 1.8914118139924593e-05,
"loss": 1.4684,
"step": 649
},
{
"epoch": 0.27230833682446587,
"grad_norm": 0.7422502040863037,
"learning_rate": 1.8912442396313367e-05,
"loss": 1.394,
"step": 650
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.6807714700698853,
"learning_rate": 1.8910766652702138e-05,
"loss": 1.315,
"step": 651
},
{
"epoch": 0.2731462086300796,
"grad_norm": 0.6865885257720947,
"learning_rate": 1.8909090909090912e-05,
"loss": 1.3815,
"step": 652
},
{
"epoch": 0.2735651445328865,
"grad_norm": 0.7024874091148376,
"learning_rate": 1.8907415165479683e-05,
"loss": 1.4215,
"step": 653
},
{
"epoch": 0.27398408043569333,
"grad_norm": 0.7904298305511475,
"learning_rate": 1.8905739421868457e-05,
"loss": 1.3908,
"step": 654
},
{
"epoch": 0.2744030163385002,
"grad_norm": 0.6477106213569641,
"learning_rate": 1.8904063678257228e-05,
"loss": 1.4599,
"step": 655
},
{
"epoch": 0.27482195224130707,
"grad_norm": 0.8508483171463013,
"learning_rate": 1.8902387934646e-05,
"loss": 1.4838,
"step": 656
},
{
"epoch": 0.27524088814411396,
"grad_norm": 0.5711894631385803,
"learning_rate": 1.8900712191034772e-05,
"loss": 1.4615,
"step": 657
},
{
"epoch": 0.2756598240469208,
"grad_norm": 0.6993704438209534,
"learning_rate": 1.8899036447423547e-05,
"loss": 1.4493,
"step": 658
},
{
"epoch": 0.2760787599497277,
"grad_norm": 1.0374524593353271,
"learning_rate": 1.8897360703812317e-05,
"loss": 1.482,
"step": 659
},
{
"epoch": 0.2764976958525346,
"grad_norm": 0.9369934797286987,
"learning_rate": 1.889568496020109e-05,
"loss": 1.3472,
"step": 660
},
{
"epoch": 0.2769166317553414,
"grad_norm": 0.8173794746398926,
"learning_rate": 1.8894009216589862e-05,
"loss": 1.4428,
"step": 661
},
{
"epoch": 0.2773355676581483,
"grad_norm": 1.059898018836975,
"learning_rate": 1.8892333472978636e-05,
"loss": 1.395,
"step": 662
},
{
"epoch": 0.27775450356095516,
"grad_norm": 0.5998404026031494,
"learning_rate": 1.8890657729367407e-05,
"loss": 1.3945,
"step": 663
},
{
"epoch": 0.27817343946376205,
"grad_norm": 0.6833025217056274,
"learning_rate": 1.888898198575618e-05,
"loss": 1.4655,
"step": 664
},
{
"epoch": 0.2785923753665689,
"grad_norm": 0.7907499670982361,
"learning_rate": 1.8887306242144955e-05,
"loss": 1.3141,
"step": 665
},
{
"epoch": 0.2790113112693758,
"grad_norm": 0.6411296129226685,
"learning_rate": 1.8885630498533726e-05,
"loss": 1.507,
"step": 666
},
{
"epoch": 0.2794302471721827,
"grad_norm": 0.8076937794685364,
"learning_rate": 1.8883954754922497e-05,
"loss": 1.3578,
"step": 667
},
{
"epoch": 0.2798491830749895,
"grad_norm": 0.6930973529815674,
"learning_rate": 1.888227901131127e-05,
"loss": 1.3186,
"step": 668
},
{
"epoch": 0.2802681189777964,
"grad_norm": 0.7560698390007019,
"learning_rate": 1.8880603267700045e-05,
"loss": 1.3772,
"step": 669
},
{
"epoch": 0.28068705488060325,
"grad_norm": 0.5842419266700745,
"learning_rate": 1.8878927524088816e-05,
"loss": 1.4772,
"step": 670
},
{
"epoch": 0.28110599078341014,
"grad_norm": 0.9055956602096558,
"learning_rate": 1.8877251780477586e-05,
"loss": 1.4802,
"step": 671
},
{
"epoch": 0.28152492668621704,
"grad_norm": 0.5749624371528625,
"learning_rate": 1.887557603686636e-05,
"loss": 1.345,
"step": 672
},
{
"epoch": 0.2819438625890239,
"grad_norm": 0.6721547245979309,
"learning_rate": 1.8873900293255135e-05,
"loss": 1.3641,
"step": 673
},
{
"epoch": 0.28236279849183077,
"grad_norm": 0.580252468585968,
"learning_rate": 1.8872224549643905e-05,
"loss": 1.4225,
"step": 674
},
{
"epoch": 0.2827817343946376,
"grad_norm": 0.8793032765388489,
"learning_rate": 1.887054880603268e-05,
"loss": 1.3721,
"step": 675
},
{
"epoch": 0.2832006702974445,
"grad_norm": 0.5774890184402466,
"learning_rate": 1.886887306242145e-05,
"loss": 1.4358,
"step": 676
},
{
"epoch": 0.28361960620025134,
"grad_norm": 0.6694848537445068,
"learning_rate": 1.8867197318810224e-05,
"loss": 1.3601,
"step": 677
},
{
"epoch": 0.28403854210305823,
"grad_norm": 0.5719591975212097,
"learning_rate": 1.8865521575198995e-05,
"loss": 1.526,
"step": 678
},
{
"epoch": 0.2844574780058651,
"grad_norm": 0.6238727569580078,
"learning_rate": 1.886384583158777e-05,
"loss": 1.3982,
"step": 679
},
{
"epoch": 0.28487641390867197,
"grad_norm": 0.6695346236228943,
"learning_rate": 1.886217008797654e-05,
"loss": 1.384,
"step": 680
},
{
"epoch": 0.28529534981147886,
"grad_norm": 0.7487378120422363,
"learning_rate": 1.8860494344365314e-05,
"loss": 1.364,
"step": 681
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.6982246041297913,
"learning_rate": 1.8858818600754085e-05,
"loss": 1.4175,
"step": 682
},
{
"epoch": 0.2861332216170926,
"grad_norm": 0.6678481698036194,
"learning_rate": 1.885714285714286e-05,
"loss": 1.3014,
"step": 683
},
{
"epoch": 0.28655215751989943,
"grad_norm": 0.5933229923248291,
"learning_rate": 1.885546711353163e-05,
"loss": 1.4171,
"step": 684
},
{
"epoch": 0.2869710934227063,
"grad_norm": 0.6992403864860535,
"learning_rate": 1.8853791369920404e-05,
"loss": 1.3231,
"step": 685
},
{
"epoch": 0.2873900293255132,
"grad_norm": 0.4942586421966553,
"learning_rate": 1.8852115626309175e-05,
"loss": 1.4685,
"step": 686
},
{
"epoch": 0.28780896522832006,
"grad_norm": 0.6683540940284729,
"learning_rate": 1.885043988269795e-05,
"loss": 1.3826,
"step": 687
},
{
"epoch": 0.28822790113112695,
"grad_norm": 0.4600916802883148,
"learning_rate": 1.8848764139086723e-05,
"loss": 1.4668,
"step": 688
},
{
"epoch": 0.2886468370339338,
"grad_norm": 0.6156571507453918,
"learning_rate": 1.8847088395475494e-05,
"loss": 1.4465,
"step": 689
},
{
"epoch": 0.2890657729367407,
"grad_norm": 0.5449684262275696,
"learning_rate": 1.8845412651864264e-05,
"loss": 1.2964,
"step": 690
},
{
"epoch": 0.2894847088395475,
"grad_norm": 0.5813759565353394,
"learning_rate": 1.884373690825304e-05,
"loss": 1.4211,
"step": 691
},
{
"epoch": 0.2899036447423544,
"grad_norm": 0.618743360042572,
"learning_rate": 1.8842061164641813e-05,
"loss": 1.3976,
"step": 692
},
{
"epoch": 0.2903225806451613,
"grad_norm": 0.5242084264755249,
"learning_rate": 1.8840385421030583e-05,
"loss": 1.4076,
"step": 693
},
{
"epoch": 0.29074151654796815,
"grad_norm": 0.676110029220581,
"learning_rate": 1.8838709677419354e-05,
"loss": 1.3143,
"step": 694
},
{
"epoch": 0.29116045245077504,
"grad_norm": 0.5773074626922607,
"learning_rate": 1.8837033933808128e-05,
"loss": 1.4443,
"step": 695
},
{
"epoch": 0.2915793883535819,
"grad_norm": 0.8076342940330505,
"learning_rate": 1.8835358190196902e-05,
"loss": 1.469,
"step": 696
},
{
"epoch": 0.2919983242563888,
"grad_norm": 0.6035589575767517,
"learning_rate": 1.8833682446585677e-05,
"loss": 1.349,
"step": 697
},
{
"epoch": 0.29241726015919567,
"grad_norm": 1.1530474424362183,
"learning_rate": 1.8832006702974447e-05,
"loss": 1.4056,
"step": 698
},
{
"epoch": 0.2928361960620025,
"grad_norm": 0.6804980635643005,
"learning_rate": 1.8830330959363218e-05,
"loss": 1.45,
"step": 699
},
{
"epoch": 0.2932551319648094,
"grad_norm": 0.8612604141235352,
"learning_rate": 1.8828655215751992e-05,
"loss": 1.4082,
"step": 700
},
{
"epoch": 0.29367406786761624,
"grad_norm": 0.7837015390396118,
"learning_rate": 1.8826979472140766e-05,
"loss": 1.3432,
"step": 701
},
{
"epoch": 0.29409300377042313,
"grad_norm": 1.0070463418960571,
"learning_rate": 1.8825303728529537e-05,
"loss": 1.3149,
"step": 702
},
{
"epoch": 0.29451193967322997,
"grad_norm": 0.8743970990180969,
"learning_rate": 1.8823627984918308e-05,
"loss": 1.2927,
"step": 703
},
{
"epoch": 0.29493087557603687,
"grad_norm": 0.7475900053977966,
"learning_rate": 1.8821952241307082e-05,
"loss": 1.4228,
"step": 704
},
{
"epoch": 0.29534981147884376,
"grad_norm": 0.5288072824478149,
"learning_rate": 1.8820276497695853e-05,
"loss": 1.3062,
"step": 705
},
{
"epoch": 0.2957687473816506,
"grad_norm": 0.718515932559967,
"learning_rate": 1.8818600754084627e-05,
"loss": 1.4571,
"step": 706
},
{
"epoch": 0.2961876832844575,
"grad_norm": 0.6150030493736267,
"learning_rate": 1.8816925010473397e-05,
"loss": 1.4581,
"step": 707
},
{
"epoch": 0.29660661918726433,
"grad_norm": 0.6937044858932495,
"learning_rate": 1.881524926686217e-05,
"loss": 1.3419,
"step": 708
},
{
"epoch": 0.2970255550900712,
"grad_norm": 0.8995934128761292,
"learning_rate": 1.8813573523250942e-05,
"loss": 1.5374,
"step": 709
},
{
"epoch": 0.29744449099287806,
"grad_norm": 0.5294831991195679,
"learning_rate": 1.8811897779639716e-05,
"loss": 1.4163,
"step": 710
},
{
"epoch": 0.29786342689568496,
"grad_norm": 1.1763445138931274,
"learning_rate": 1.881022203602849e-05,
"loss": 1.3632,
"step": 711
},
{
"epoch": 0.29828236279849185,
"grad_norm": 0.6113332509994507,
"learning_rate": 1.880854629241726e-05,
"loss": 1.417,
"step": 712
},
{
"epoch": 0.2987012987012987,
"grad_norm": 0.7323436737060547,
"learning_rate": 1.8806870548806032e-05,
"loss": 1.4475,
"step": 713
},
{
"epoch": 0.2991202346041056,
"grad_norm": 0.6122335195541382,
"learning_rate": 1.8805194805194806e-05,
"loss": 1.4296,
"step": 714
},
{
"epoch": 0.2995391705069124,
"grad_norm": 0.8511880040168762,
"learning_rate": 1.880351906158358e-05,
"loss": 1.2773,
"step": 715
},
{
"epoch": 0.2999581064097193,
"grad_norm": 0.540932297706604,
"learning_rate": 1.880184331797235e-05,
"loss": 1.4429,
"step": 716
},
{
"epoch": 0.3003770423125262,
"grad_norm": 0.7005630731582642,
"learning_rate": 1.8800167574361122e-05,
"loss": 1.3484,
"step": 717
},
{
"epoch": 0.30079597821533305,
"grad_norm": 0.4778623878955841,
"learning_rate": 1.8798491830749896e-05,
"loss": 1.4279,
"step": 718
},
{
"epoch": 0.30121491411813994,
"grad_norm": 0.6638504266738892,
"learning_rate": 1.879681608713867e-05,
"loss": 1.3713,
"step": 719
},
{
"epoch": 0.3016338500209468,
"grad_norm": 0.6170998811721802,
"learning_rate": 1.8795140343527444e-05,
"loss": 1.4593,
"step": 720
},
{
"epoch": 0.3020527859237537,
"grad_norm": 0.8390569686889648,
"learning_rate": 1.8793464599916215e-05,
"loss": 1.3165,
"step": 721
},
{
"epoch": 0.3024717218265605,
"grad_norm": 0.5174708962440491,
"learning_rate": 1.8791788856304986e-05,
"loss": 1.3133,
"step": 722
},
{
"epoch": 0.3028906577293674,
"grad_norm": 0.6793212294578552,
"learning_rate": 1.879011311269376e-05,
"loss": 1.4321,
"step": 723
},
{
"epoch": 0.3033095936321743,
"grad_norm": 0.6248990893363953,
"learning_rate": 1.8788437369082534e-05,
"loss": 1.3265,
"step": 724
},
{
"epoch": 0.30372852953498114,
"grad_norm": 0.6905636191368103,
"learning_rate": 1.8786761625471305e-05,
"loss": 1.4365,
"step": 725
},
{
"epoch": 0.30414746543778803,
"grad_norm": 0.5848572850227356,
"learning_rate": 1.8785085881860075e-05,
"loss": 1.3977,
"step": 726
},
{
"epoch": 0.30456640134059487,
"grad_norm": 0.715691089630127,
"learning_rate": 1.878341013824885e-05,
"loss": 1.43,
"step": 727
},
{
"epoch": 0.30498533724340177,
"grad_norm": 0.5724084377288818,
"learning_rate": 1.8781734394637624e-05,
"loss": 1.4189,
"step": 728
},
{
"epoch": 0.3054042731462086,
"grad_norm": 0.6158417463302612,
"learning_rate": 1.8780058651026394e-05,
"loss": 1.4262,
"step": 729
},
{
"epoch": 0.3058232090490155,
"grad_norm": 0.5914372801780701,
"learning_rate": 1.877838290741517e-05,
"loss": 1.3335,
"step": 730
},
{
"epoch": 0.3062421449518224,
"grad_norm": 0.6226646900177002,
"learning_rate": 1.877670716380394e-05,
"loss": 1.366,
"step": 731
},
{
"epoch": 0.30666108085462923,
"grad_norm": 0.5656031370162964,
"learning_rate": 1.8775031420192713e-05,
"loss": 1.2984,
"step": 732
},
{
"epoch": 0.3070800167574361,
"grad_norm": 0.7473766207695007,
"learning_rate": 1.8773355676581484e-05,
"loss": 1.3788,
"step": 733
},
{
"epoch": 0.30749895266024296,
"grad_norm": 0.686081051826477,
"learning_rate": 1.8771679932970258e-05,
"loss": 1.2504,
"step": 734
},
{
"epoch": 0.30791788856304986,
"grad_norm": 0.6029173135757446,
"learning_rate": 1.877000418935903e-05,
"loss": 1.3931,
"step": 735
},
{
"epoch": 0.30833682446585675,
"grad_norm": 0.761979341506958,
"learning_rate": 1.8768328445747803e-05,
"loss": 1.3497,
"step": 736
},
{
"epoch": 0.3087557603686636,
"grad_norm": 0.6504870057106018,
"learning_rate": 1.8766652702136574e-05,
"loss": 1.4025,
"step": 737
},
{
"epoch": 0.3091746962714705,
"grad_norm": 0.7684826254844666,
"learning_rate": 1.8764976958525348e-05,
"loss": 1.506,
"step": 738
},
{
"epoch": 0.3095936321742773,
"grad_norm": 0.8104509115219116,
"learning_rate": 1.876330121491412e-05,
"loss": 1.4321,
"step": 739
},
{
"epoch": 0.3100125680770842,
"grad_norm": 0.5708764791488647,
"learning_rate": 1.876162547130289e-05,
"loss": 1.3636,
"step": 740
},
{
"epoch": 0.31043150397989105,
"grad_norm": 0.5689902901649475,
"learning_rate": 1.8759949727691664e-05,
"loss": 1.3956,
"step": 741
},
{
"epoch": 0.31085043988269795,
"grad_norm": 0.6868107914924622,
"learning_rate": 1.8758273984080438e-05,
"loss": 1.3395,
"step": 742
},
{
"epoch": 0.31126937578550484,
"grad_norm": 0.6060523390769958,
"learning_rate": 1.8756598240469212e-05,
"loss": 1.369,
"step": 743
},
{
"epoch": 0.3116883116883117,
"grad_norm": 0.8488364219665527,
"learning_rate": 1.8754922496857983e-05,
"loss": 1.3584,
"step": 744
},
{
"epoch": 0.3121072475911186,
"grad_norm": 0.5008958578109741,
"learning_rate": 1.8753246753246753e-05,
"loss": 1.4263,
"step": 745
},
{
"epoch": 0.3125261834939254,
"grad_norm": 0.5795634388923645,
"learning_rate": 1.8751571009635527e-05,
"loss": 1.4323,
"step": 746
},
{
"epoch": 0.3129451193967323,
"grad_norm": 0.5958297848701477,
"learning_rate": 1.87498952660243e-05,
"loss": 1.435,
"step": 747
},
{
"epoch": 0.31336405529953915,
"grad_norm": 0.610854983329773,
"learning_rate": 1.8748219522413072e-05,
"loss": 1.3164,
"step": 748
},
{
"epoch": 0.31378299120234604,
"grad_norm": 0.6233651638031006,
"learning_rate": 1.8746543778801843e-05,
"loss": 1.3757,
"step": 749
},
{
"epoch": 0.31420192710515293,
"grad_norm": 0.6277685165405273,
"learning_rate": 1.8744868035190617e-05,
"loss": 1.3958,
"step": 750
},
{
"epoch": 0.3146208630079598,
"grad_norm": 0.6157351732254028,
"learning_rate": 1.874319229157939e-05,
"loss": 1.3395,
"step": 751
},
{
"epoch": 0.31503979891076667,
"grad_norm": 0.5872085690498352,
"learning_rate": 1.8741516547968162e-05,
"loss": 1.4568,
"step": 752
},
{
"epoch": 0.3154587348135735,
"grad_norm": 0.6481055021286011,
"learning_rate": 1.8739840804356936e-05,
"loss": 1.466,
"step": 753
},
{
"epoch": 0.3158776707163804,
"grad_norm": 0.5024817585945129,
"learning_rate": 1.8738165060745707e-05,
"loss": 1.4321,
"step": 754
},
{
"epoch": 0.31629660661918724,
"grad_norm": 0.817737340927124,
"learning_rate": 1.873648931713448e-05,
"loss": 1.3119,
"step": 755
},
{
"epoch": 0.31671554252199413,
"grad_norm": 0.5730132460594177,
"learning_rate": 1.8734813573523252e-05,
"loss": 1.4508,
"step": 756
},
{
"epoch": 0.317134478424801,
"grad_norm": 0.7577594518661499,
"learning_rate": 1.8733137829912026e-05,
"loss": 1.3521,
"step": 757
},
{
"epoch": 0.31755341432760786,
"grad_norm": 0.6062861680984497,
"learning_rate": 1.8731462086300797e-05,
"loss": 1.384,
"step": 758
},
{
"epoch": 0.31797235023041476,
"grad_norm": 0.7353479266166687,
"learning_rate": 1.872978634268957e-05,
"loss": 1.2669,
"step": 759
},
{
"epoch": 0.3183912861332216,
"grad_norm": 0.7415457963943481,
"learning_rate": 1.872811059907834e-05,
"loss": 1.3057,
"step": 760
},
{
"epoch": 0.3188102220360285,
"grad_norm": 0.7072665691375732,
"learning_rate": 1.8726434855467116e-05,
"loss": 1.3939,
"step": 761
},
{
"epoch": 0.3192291579388354,
"grad_norm": 0.6156973838806152,
"learning_rate": 1.8724759111855886e-05,
"loss": 1.444,
"step": 762
},
{
"epoch": 0.3196480938416422,
"grad_norm": 0.8771636486053467,
"learning_rate": 1.872308336824466e-05,
"loss": 1.3528,
"step": 763
},
{
"epoch": 0.3200670297444491,
"grad_norm": 0.7313105463981628,
"learning_rate": 1.872140762463343e-05,
"loss": 1.403,
"step": 764
},
{
"epoch": 0.32048596564725595,
"grad_norm": 0.8612370491027832,
"learning_rate": 1.8719731881022205e-05,
"loss": 1.2753,
"step": 765
},
{
"epoch": 0.32090490155006285,
"grad_norm": 0.7663282155990601,
"learning_rate": 1.871805613741098e-05,
"loss": 1.2537,
"step": 766
},
{
"epoch": 0.3213238374528697,
"grad_norm": 0.6050071716308594,
"learning_rate": 1.871638039379975e-05,
"loss": 1.2666,
"step": 767
},
{
"epoch": 0.3217427733556766,
"grad_norm": 0.878713846206665,
"learning_rate": 1.871470465018852e-05,
"loss": 1.2472,
"step": 768
},
{
"epoch": 0.3221617092584835,
"grad_norm": 0.5855206251144409,
"learning_rate": 1.8713028906577295e-05,
"loss": 1.3284,
"step": 769
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.6722334623336792,
"learning_rate": 1.871135316296607e-05,
"loss": 1.3594,
"step": 770
},
{
"epoch": 0.3229995810640972,
"grad_norm": 0.5793231725692749,
"learning_rate": 1.870967741935484e-05,
"loss": 1.3275,
"step": 771
},
{
"epoch": 0.32341851696690405,
"grad_norm": 0.604360818862915,
"learning_rate": 1.870800167574361e-05,
"loss": 1.385,
"step": 772
},
{
"epoch": 0.32383745286971094,
"grad_norm": 0.6285070776939392,
"learning_rate": 1.8706325932132385e-05,
"loss": 1.3477,
"step": 773
},
{
"epoch": 0.3242563887725178,
"grad_norm": 0.637465238571167,
"learning_rate": 1.870465018852116e-05,
"loss": 1.3904,
"step": 774
},
{
"epoch": 0.3246753246753247,
"grad_norm": 0.6604148149490356,
"learning_rate": 1.870297444490993e-05,
"loss": 1.4409,
"step": 775
},
{
"epoch": 0.32509426057813157,
"grad_norm": 0.5733173489570618,
"learning_rate": 1.8701298701298704e-05,
"loss": 1.4067,
"step": 776
},
{
"epoch": 0.3255131964809384,
"grad_norm": 0.6333217620849609,
"learning_rate": 1.8699622957687475e-05,
"loss": 1.3755,
"step": 777
},
{
"epoch": 0.3259321323837453,
"grad_norm": 0.5488452315330505,
"learning_rate": 1.869794721407625e-05,
"loss": 1.3702,
"step": 778
},
{
"epoch": 0.32635106828655214,
"grad_norm": 0.501512885093689,
"learning_rate": 1.869627147046502e-05,
"loss": 1.354,
"step": 779
},
{
"epoch": 0.32677000418935903,
"grad_norm": 0.6487094163894653,
"learning_rate": 1.8694595726853794e-05,
"loss": 1.436,
"step": 780
},
{
"epoch": 0.3271889400921659,
"grad_norm": 0.6414726972579956,
"learning_rate": 1.8692919983242564e-05,
"loss": 1.3531,
"step": 781
},
{
"epoch": 0.32760787599497276,
"grad_norm": 0.6292533278465271,
"learning_rate": 1.869124423963134e-05,
"loss": 1.3462,
"step": 782
},
{
"epoch": 0.32802681189777966,
"grad_norm": 0.593009889125824,
"learning_rate": 1.868956849602011e-05,
"loss": 1.3921,
"step": 783
},
{
"epoch": 0.3284457478005865,
"grad_norm": 0.8031853437423706,
"learning_rate": 1.8687892752408883e-05,
"loss": 1.3397,
"step": 784
},
{
"epoch": 0.3288646837033934,
"grad_norm": 0.6368310451507568,
"learning_rate": 1.8686217008797654e-05,
"loss": 1.3801,
"step": 785
},
{
"epoch": 0.32928361960620023,
"grad_norm": 0.540608286857605,
"learning_rate": 1.8684541265186428e-05,
"loss": 1.4328,
"step": 786
},
{
"epoch": 0.3297025555090071,
"grad_norm": 0.6946301460266113,
"learning_rate": 1.86828655215752e-05,
"loss": 1.3398,
"step": 787
},
{
"epoch": 0.330121491411814,
"grad_norm": 0.57708340883255,
"learning_rate": 1.8681189777963973e-05,
"loss": 1.3537,
"step": 788
},
{
"epoch": 0.33054042731462085,
"grad_norm": 0.647994875907898,
"learning_rate": 1.8679514034352747e-05,
"loss": 1.4367,
"step": 789
},
{
"epoch": 0.33095936321742775,
"grad_norm": 0.7245671153068542,
"learning_rate": 1.8677838290741518e-05,
"loss": 1.4373,
"step": 790
},
{
"epoch": 0.3313782991202346,
"grad_norm": 0.5863420963287354,
"learning_rate": 1.867616254713029e-05,
"loss": 1.3749,
"step": 791
},
{
"epoch": 0.3317972350230415,
"grad_norm": 0.5827834010124207,
"learning_rate": 1.8674486803519063e-05,
"loss": 1.4547,
"step": 792
},
{
"epoch": 0.3322161709258483,
"grad_norm": 0.61444091796875,
"learning_rate": 1.8672811059907837e-05,
"loss": 1.4826,
"step": 793
},
{
"epoch": 0.3326351068286552,
"grad_norm": 0.7494825124740601,
"learning_rate": 1.8671135316296608e-05,
"loss": 1.3186,
"step": 794
},
{
"epoch": 0.3330540427314621,
"grad_norm": 0.708984375,
"learning_rate": 1.866945957268538e-05,
"loss": 1.3216,
"step": 795
},
{
"epoch": 0.33347297863426895,
"grad_norm": 0.7570633888244629,
"learning_rate": 1.8667783829074153e-05,
"loss": 1.2755,
"step": 796
},
{
"epoch": 0.33389191453707584,
"grad_norm": 0.6864930987358093,
"learning_rate": 1.8666108085462927e-05,
"loss": 1.5026,
"step": 797
},
{
"epoch": 0.3343108504398827,
"grad_norm": 0.7167800068855286,
"learning_rate": 1.86644323418517e-05,
"loss": 1.417,
"step": 798
},
{
"epoch": 0.3347297863426896,
"grad_norm": 0.6132422089576721,
"learning_rate": 1.866275659824047e-05,
"loss": 1.3885,
"step": 799
},
{
"epoch": 0.33514872224549647,
"grad_norm": 0.496698796749115,
"learning_rate": 1.8661080854629242e-05,
"loss": 1.3729,
"step": 800
},
{
"epoch": 0.3355676581483033,
"grad_norm": 0.718532383441925,
"learning_rate": 1.8659405111018016e-05,
"loss": 1.391,
"step": 801
},
{
"epoch": 0.3359865940511102,
"grad_norm": 0.7083394527435303,
"learning_rate": 1.8657729367406787e-05,
"loss": 1.3375,
"step": 802
},
{
"epoch": 0.33640552995391704,
"grad_norm": 0.6132792830467224,
"learning_rate": 1.865605362379556e-05,
"loss": 1.4201,
"step": 803
},
{
"epoch": 0.33682446585672393,
"grad_norm": 0.7242376804351807,
"learning_rate": 1.8654377880184332e-05,
"loss": 1.4025,
"step": 804
},
{
"epoch": 0.33724340175953077,
"grad_norm": 0.6561216711997986,
"learning_rate": 1.8652702136573106e-05,
"loss": 1.2498,
"step": 805
},
{
"epoch": 0.33766233766233766,
"grad_norm": 0.6856805086135864,
"learning_rate": 1.8651026392961877e-05,
"loss": 1.3054,
"step": 806
},
{
"epoch": 0.33808127356514456,
"grad_norm": 0.7083545923233032,
"learning_rate": 1.864935064935065e-05,
"loss": 1.4073,
"step": 807
},
{
"epoch": 0.3385002094679514,
"grad_norm": 0.6021366119384766,
"learning_rate": 1.8647674905739425e-05,
"loss": 1.4361,
"step": 808
},
{
"epoch": 0.3389191453707583,
"grad_norm": 0.5841922760009766,
"learning_rate": 1.8645999162128196e-05,
"loss": 1.4091,
"step": 809
},
{
"epoch": 0.33933808127356513,
"grad_norm": 0.7605058550834656,
"learning_rate": 1.8644323418516967e-05,
"loss": 1.4554,
"step": 810
},
{
"epoch": 0.339757017176372,
"grad_norm": 0.6625474095344543,
"learning_rate": 1.864264767490574e-05,
"loss": 1.3343,
"step": 811
},
{
"epoch": 0.34017595307917886,
"grad_norm": 0.5613833665847778,
"learning_rate": 1.8640971931294515e-05,
"loss": 1.3195,
"step": 812
},
{
"epoch": 0.34059488898198576,
"grad_norm": 0.7579832673072815,
"learning_rate": 1.8639296187683286e-05,
"loss": 1.2687,
"step": 813
},
{
"epoch": 0.34101382488479265,
"grad_norm": 0.6240825653076172,
"learning_rate": 1.8637620444072056e-05,
"loss": 1.3975,
"step": 814
},
{
"epoch": 0.3414327607875995,
"grad_norm": 0.669330894947052,
"learning_rate": 1.863594470046083e-05,
"loss": 1.3588,
"step": 815
},
{
"epoch": 0.3418516966904064,
"grad_norm": 1.4403901100158691,
"learning_rate": 1.8634268956849605e-05,
"loss": 1.3911,
"step": 816
},
{
"epoch": 0.3422706325932132,
"grad_norm": 0.5918375253677368,
"learning_rate": 1.8632593213238375e-05,
"loss": 1.4409,
"step": 817
},
{
"epoch": 0.3426895684960201,
"grad_norm": 0.8068619966506958,
"learning_rate": 1.8630917469627146e-05,
"loss": 1.373,
"step": 818
},
{
"epoch": 0.34310850439882695,
"grad_norm": 0.6585489511489868,
"learning_rate": 1.862924172601592e-05,
"loss": 1.3883,
"step": 819
},
{
"epoch": 0.34352744030163385,
"grad_norm": 0.7592496275901794,
"learning_rate": 1.8627565982404694e-05,
"loss": 1.274,
"step": 820
},
{
"epoch": 0.34394637620444074,
"grad_norm": 0.7368906736373901,
"learning_rate": 1.862589023879347e-05,
"loss": 1.3069,
"step": 821
},
{
"epoch": 0.3443653121072476,
"grad_norm": 1.1015337705612183,
"learning_rate": 1.862421449518224e-05,
"loss": 1.359,
"step": 822
},
{
"epoch": 0.3447842480100545,
"grad_norm": 1.0174856185913086,
"learning_rate": 1.862253875157101e-05,
"loss": 1.3323,
"step": 823
},
{
"epoch": 0.3452031839128613,
"grad_norm": 0.5378293991088867,
"learning_rate": 1.8620863007959784e-05,
"loss": 1.363,
"step": 824
},
{
"epoch": 0.3456221198156682,
"grad_norm": 0.5914802551269531,
"learning_rate": 1.8619187264348558e-05,
"loss": 1.4108,
"step": 825
},
{
"epoch": 0.3460410557184751,
"grad_norm": 0.7762312889099121,
"learning_rate": 1.861751152073733e-05,
"loss": 1.5178,
"step": 826
},
{
"epoch": 0.34645999162128194,
"grad_norm": 0.5453881025314331,
"learning_rate": 1.86158357771261e-05,
"loss": 1.2591,
"step": 827
},
{
"epoch": 0.34687892752408883,
"grad_norm": 0.8273909091949463,
"learning_rate": 1.8614160033514874e-05,
"loss": 1.4152,
"step": 828
},
{
"epoch": 0.34729786342689567,
"grad_norm": 0.619732677936554,
"learning_rate": 1.8612484289903648e-05,
"loss": 1.3843,
"step": 829
},
{
"epoch": 0.34771679932970256,
"grad_norm": 0.8301693797111511,
"learning_rate": 1.861080854629242e-05,
"loss": 1.3134,
"step": 830
},
{
"epoch": 0.3481357352325094,
"grad_norm": 0.5382381081581116,
"learning_rate": 1.8609132802681193e-05,
"loss": 1.3369,
"step": 831
},
{
"epoch": 0.3485546711353163,
"grad_norm": 0.6819462776184082,
"learning_rate": 1.8607457059069964e-05,
"loss": 1.3153,
"step": 832
},
{
"epoch": 0.3489736070381232,
"grad_norm": 0.5813642144203186,
"learning_rate": 1.8605781315458734e-05,
"loss": 1.3567,
"step": 833
},
{
"epoch": 0.34939254294093003,
"grad_norm": 0.5303306579589844,
"learning_rate": 1.860410557184751e-05,
"loss": 1.3297,
"step": 834
},
{
"epoch": 0.3498114788437369,
"grad_norm": 0.7988172769546509,
"learning_rate": 1.8602429828236283e-05,
"loss": 1.3171,
"step": 835
},
{
"epoch": 0.35023041474654376,
"grad_norm": 0.6038556694984436,
"learning_rate": 1.8600754084625053e-05,
"loss": 1.4674,
"step": 836
},
{
"epoch": 0.35064935064935066,
"grad_norm": 0.7679703235626221,
"learning_rate": 1.8599078341013824e-05,
"loss": 1.3359,
"step": 837
},
{
"epoch": 0.3510682865521575,
"grad_norm": 0.5809141993522644,
"learning_rate": 1.8597402597402598e-05,
"loss": 1.3567,
"step": 838
},
{
"epoch": 0.3514872224549644,
"grad_norm": 0.7988621592521667,
"learning_rate": 1.8595726853791372e-05,
"loss": 1.3453,
"step": 839
},
{
"epoch": 0.3519061583577713,
"grad_norm": 0.5147368907928467,
"learning_rate": 1.8594051110180143e-05,
"loss": 1.4187,
"step": 840
},
{
"epoch": 0.3523250942605781,
"grad_norm": 0.8276616334915161,
"learning_rate": 1.8592375366568917e-05,
"loss": 1.3438,
"step": 841
},
{
"epoch": 0.352744030163385,
"grad_norm": 0.630969762802124,
"learning_rate": 1.8590699622957688e-05,
"loss": 1.3271,
"step": 842
},
{
"epoch": 0.35316296606619185,
"grad_norm": 0.823090136051178,
"learning_rate": 1.8589023879346462e-05,
"loss": 1.3266,
"step": 843
},
{
"epoch": 0.35358190196899875,
"grad_norm": 0.6593233346939087,
"learning_rate": 1.8587348135735236e-05,
"loss": 1.2582,
"step": 844
},
{
"epoch": 0.35400083787180564,
"grad_norm": 0.5821310877799988,
"learning_rate": 1.8585672392124007e-05,
"loss": 1.3739,
"step": 845
},
{
"epoch": 0.3544197737746125,
"grad_norm": 0.6901243329048157,
"learning_rate": 1.8583996648512778e-05,
"loss": 1.4388,
"step": 846
},
{
"epoch": 0.3548387096774194,
"grad_norm": 0.521477460861206,
"learning_rate": 1.8582320904901552e-05,
"loss": 1.4263,
"step": 847
},
{
"epoch": 0.3552576455802262,
"grad_norm": 0.6898438930511475,
"learning_rate": 1.8580645161290326e-05,
"loss": 1.3508,
"step": 848
},
{
"epoch": 0.3556765814830331,
"grad_norm": 0.7993562817573547,
"learning_rate": 1.8578969417679097e-05,
"loss": 1.2651,
"step": 849
},
{
"epoch": 0.35609551738583994,
"grad_norm": 0.5602364540100098,
"learning_rate": 1.8577293674067867e-05,
"loss": 1.3954,
"step": 850
},
{
"epoch": 0.35651445328864684,
"grad_norm": 1.204590082168579,
"learning_rate": 1.857561793045664e-05,
"loss": 1.2695,
"step": 851
},
{
"epoch": 0.35693338919145373,
"grad_norm": 0.7635987997055054,
"learning_rate": 1.8573942186845416e-05,
"loss": 1.5585,
"step": 852
},
{
"epoch": 0.35735232509426057,
"grad_norm": 0.9088107943534851,
"learning_rate": 1.8572266443234186e-05,
"loss": 1.4667,
"step": 853
},
{
"epoch": 0.35777126099706746,
"grad_norm": 0.9393168687820435,
"learning_rate": 1.857059069962296e-05,
"loss": 1.4196,
"step": 854
},
{
"epoch": 0.3581901968998743,
"grad_norm": 0.7522462606430054,
"learning_rate": 1.856891495601173e-05,
"loss": 1.3734,
"step": 855
},
{
"epoch": 0.3586091328026812,
"grad_norm": 0.868783712387085,
"learning_rate": 1.8567239212400505e-05,
"loss": 1.3904,
"step": 856
},
{
"epoch": 0.35902806870548803,
"grad_norm": 0.721829354763031,
"learning_rate": 1.8565563468789276e-05,
"loss": 1.3377,
"step": 857
},
{
"epoch": 0.35944700460829493,
"grad_norm": 0.6955874562263489,
"learning_rate": 1.856388772517805e-05,
"loss": 1.3995,
"step": 858
},
{
"epoch": 0.3598659405111018,
"grad_norm": 0.7859539985656738,
"learning_rate": 1.856221198156682e-05,
"loss": 1.4676,
"step": 859
},
{
"epoch": 0.36028487641390866,
"grad_norm": 0.8676955699920654,
"learning_rate": 1.8560536237955595e-05,
"loss": 1.3937,
"step": 860
},
{
"epoch": 0.36070381231671556,
"grad_norm": 0.8304063081741333,
"learning_rate": 1.8558860494344366e-05,
"loss": 1.3601,
"step": 861
},
{
"epoch": 0.3611227482195224,
"grad_norm": 0.6992982029914856,
"learning_rate": 1.855718475073314e-05,
"loss": 1.3896,
"step": 862
},
{
"epoch": 0.3615416841223293,
"grad_norm": 0.7726747989654541,
"learning_rate": 1.855550900712191e-05,
"loss": 1.3639,
"step": 863
},
{
"epoch": 0.3619606200251362,
"grad_norm": 0.6171780228614807,
"learning_rate": 1.8553833263510685e-05,
"loss": 1.447,
"step": 864
},
{
"epoch": 0.362379555927943,
"grad_norm": 0.6462947130203247,
"learning_rate": 1.8552157519899456e-05,
"loss": 1.4144,
"step": 865
},
{
"epoch": 0.3627984918307499,
"grad_norm": 0.6577037572860718,
"learning_rate": 1.855048177628823e-05,
"loss": 1.3563,
"step": 866
},
{
"epoch": 0.36321742773355675,
"grad_norm": 0.633479118347168,
"learning_rate": 1.8548806032677004e-05,
"loss": 1.4013,
"step": 867
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.5869176387786865,
"learning_rate": 1.8547130289065775e-05,
"loss": 1.2304,
"step": 868
},
{
"epoch": 0.3640552995391705,
"grad_norm": 0.9644981026649475,
"learning_rate": 1.8545454545454545e-05,
"loss": 1.4055,
"step": 869
},
{
"epoch": 0.3644742354419774,
"grad_norm": 0.4988062381744385,
"learning_rate": 1.854377880184332e-05,
"loss": 1.3402,
"step": 870
},
{
"epoch": 0.3648931713447843,
"grad_norm": 0.6213245391845703,
"learning_rate": 1.8542103058232094e-05,
"loss": 1.3781,
"step": 871
},
{
"epoch": 0.3653121072475911,
"grad_norm": 1.2338489294052124,
"learning_rate": 1.8540427314620864e-05,
"loss": 1.3705,
"step": 872
},
{
"epoch": 0.365731043150398,
"grad_norm": 0.642489492893219,
"learning_rate": 1.8538751571009635e-05,
"loss": 1.302,
"step": 873
},
{
"epoch": 0.36614997905320484,
"grad_norm": 0.5393372178077698,
"learning_rate": 1.853707582739841e-05,
"loss": 1.3932,
"step": 874
},
{
"epoch": 0.36656891495601174,
"grad_norm": 0.702558696269989,
"learning_rate": 1.8535400083787183e-05,
"loss": 1.3118,
"step": 875
},
{
"epoch": 0.3669878508588186,
"grad_norm": 0.5150250792503357,
"learning_rate": 1.8533724340175954e-05,
"loss": 1.3945,
"step": 876
},
{
"epoch": 0.36740678676162547,
"grad_norm": 0.5648869872093201,
"learning_rate": 1.8532048596564728e-05,
"loss": 1.2966,
"step": 877
},
{
"epoch": 0.36782572266443236,
"grad_norm": 0.507340133190155,
"learning_rate": 1.85303728529535e-05,
"loss": 1.3102,
"step": 878
},
{
"epoch": 0.3682446585672392,
"grad_norm": 0.4831449091434479,
"learning_rate": 1.8528697109342273e-05,
"loss": 1.4016,
"step": 879
},
{
"epoch": 0.3686635944700461,
"grad_norm": 0.5660369396209717,
"learning_rate": 1.8527021365731044e-05,
"loss": 1.3923,
"step": 880
},
{
"epoch": 0.36908253037285293,
"grad_norm": 0.5805203318595886,
"learning_rate": 1.8525345622119818e-05,
"loss": 1.3403,
"step": 881
},
{
"epoch": 0.36950146627565983,
"grad_norm": 0.5939772129058838,
"learning_rate": 1.852366987850859e-05,
"loss": 1.433,
"step": 882
},
{
"epoch": 0.36992040217846667,
"grad_norm": 0.5323980450630188,
"learning_rate": 1.8521994134897363e-05,
"loss": 1.3378,
"step": 883
},
{
"epoch": 0.37033933808127356,
"grad_norm": 0.6497474312782288,
"learning_rate": 1.8520318391286134e-05,
"loss": 1.3351,
"step": 884
},
{
"epoch": 0.37075827398408046,
"grad_norm": 0.564187228679657,
"learning_rate": 1.8518642647674908e-05,
"loss": 1.3354,
"step": 885
},
{
"epoch": 0.3711772098868873,
"grad_norm": 0.5568170547485352,
"learning_rate": 1.8516966904063682e-05,
"loss": 1.3902,
"step": 886
},
{
"epoch": 0.3715961457896942,
"grad_norm": 0.635718584060669,
"learning_rate": 1.8515291160452453e-05,
"loss": 1.2659,
"step": 887
},
{
"epoch": 0.372015081692501,
"grad_norm": 0.5702401399612427,
"learning_rate": 1.8513615416841223e-05,
"loss": 1.3953,
"step": 888
},
{
"epoch": 0.3724340175953079,
"grad_norm": 0.9754594564437866,
"learning_rate": 1.8511939673229997e-05,
"loss": 1.523,
"step": 889
},
{
"epoch": 0.3728529534981148,
"grad_norm": 0.6205545663833618,
"learning_rate": 1.851026392961877e-05,
"loss": 1.4438,
"step": 890
},
{
"epoch": 0.37327188940092165,
"grad_norm": 0.6615620255470276,
"learning_rate": 1.8508588186007542e-05,
"loss": 1.3249,
"step": 891
},
{
"epoch": 0.37369082530372855,
"grad_norm": 1.1198056936264038,
"learning_rate": 1.8506912442396313e-05,
"loss": 1.3237,
"step": 892
},
{
"epoch": 0.3741097612065354,
"grad_norm": 0.4639185070991516,
"learning_rate": 1.8505236698785087e-05,
"loss": 1.3922,
"step": 893
},
{
"epoch": 0.3745286971093423,
"grad_norm": 0.9922068119049072,
"learning_rate": 1.850356095517386e-05,
"loss": 1.4176,
"step": 894
},
{
"epoch": 0.3749476330121491,
"grad_norm": 0.6543740034103394,
"learning_rate": 1.8501885211562632e-05,
"loss": 1.3416,
"step": 895
},
{
"epoch": 0.375366568914956,
"grad_norm": 0.6506433486938477,
"learning_rate": 1.8500209467951403e-05,
"loss": 1.3429,
"step": 896
},
{
"epoch": 0.3757855048177629,
"grad_norm": 0.6528995633125305,
"learning_rate": 1.8498533724340177e-05,
"loss": 1.3,
"step": 897
},
{
"epoch": 0.37620444072056974,
"grad_norm": 0.5251516103744507,
"learning_rate": 1.849685798072895e-05,
"loss": 1.3076,
"step": 898
},
{
"epoch": 0.37662337662337664,
"grad_norm": 0.6646214723587036,
"learning_rate": 1.8495182237117722e-05,
"loss": 1.3381,
"step": 899
},
{
"epoch": 0.3770423125261835,
"grad_norm": 0.7112960815429688,
"learning_rate": 1.8493506493506496e-05,
"loss": 1.342,
"step": 900
},
{
"epoch": 0.37746124842899037,
"grad_norm": 0.6115846037864685,
"learning_rate": 1.8491830749895267e-05,
"loss": 1.3231,
"step": 901
},
{
"epoch": 0.3778801843317972,
"grad_norm": 0.5398994088172913,
"learning_rate": 1.849015500628404e-05,
"loss": 1.3294,
"step": 902
},
{
"epoch": 0.3782991202346041,
"grad_norm": 0.6577754616737366,
"learning_rate": 1.848847926267281e-05,
"loss": 1.4425,
"step": 903
},
{
"epoch": 0.378718056137411,
"grad_norm": 0.6344985365867615,
"learning_rate": 1.8486803519061586e-05,
"loss": 1.4265,
"step": 904
},
{
"epoch": 0.37913699204021784,
"grad_norm": 0.6111234426498413,
"learning_rate": 1.8485127775450356e-05,
"loss": 1.358,
"step": 905
},
{
"epoch": 0.37955592794302473,
"grad_norm": 0.7081183195114136,
"learning_rate": 1.848345203183913e-05,
"loss": 1.3682,
"step": 906
},
{
"epoch": 0.37997486384583157,
"grad_norm": 0.5750409364700317,
"learning_rate": 1.84817762882279e-05,
"loss": 1.2761,
"step": 907
},
{
"epoch": 0.38039379974863846,
"grad_norm": 0.5604552030563354,
"learning_rate": 1.8480100544616675e-05,
"loss": 1.3865,
"step": 908
},
{
"epoch": 0.38081273565144536,
"grad_norm": 0.5597730875015259,
"learning_rate": 1.847842480100545e-05,
"loss": 1.4277,
"step": 909
},
{
"epoch": 0.3812316715542522,
"grad_norm": 0.5526629686355591,
"learning_rate": 1.847674905739422e-05,
"loss": 1.2744,
"step": 910
},
{
"epoch": 0.3816506074570591,
"grad_norm": 0.5272534489631653,
"learning_rate": 1.847507331378299e-05,
"loss": 1.3739,
"step": 911
},
{
"epoch": 0.3820695433598659,
"grad_norm": 0.7306801676750183,
"learning_rate": 1.8473397570171765e-05,
"loss": 1.2646,
"step": 912
},
{
"epoch": 0.3824884792626728,
"grad_norm": 0.8041632771492004,
"learning_rate": 1.847172182656054e-05,
"loss": 1.3597,
"step": 913
},
{
"epoch": 0.38290741516547966,
"grad_norm": 0.6282241344451904,
"learning_rate": 1.847004608294931e-05,
"loss": 1.2337,
"step": 914
},
{
"epoch": 0.38332635106828655,
"grad_norm": 0.6238358020782471,
"learning_rate": 1.846837033933808e-05,
"loss": 1.4054,
"step": 915
},
{
"epoch": 0.38374528697109345,
"grad_norm": 0.701452910900116,
"learning_rate": 1.8466694595726855e-05,
"loss": 1.2824,
"step": 916
},
{
"epoch": 0.3841642228739003,
"grad_norm": 0.7733535170555115,
"learning_rate": 1.846501885211563e-05,
"loss": 1.251,
"step": 917
},
{
"epoch": 0.3845831587767072,
"grad_norm": 0.5265078544616699,
"learning_rate": 1.84633431085044e-05,
"loss": 1.4453,
"step": 918
},
{
"epoch": 0.385002094679514,
"grad_norm": 0.452823668718338,
"learning_rate": 1.8461667364893174e-05,
"loss": 1.3404,
"step": 919
},
{
"epoch": 0.3854210305823209,
"grad_norm": 0.8962199091911316,
"learning_rate": 1.8459991621281945e-05,
"loss": 1.323,
"step": 920
},
{
"epoch": 0.38583996648512775,
"grad_norm": 0.6026387810707092,
"learning_rate": 1.845831587767072e-05,
"loss": 1.3102,
"step": 921
},
{
"epoch": 0.38625890238793464,
"grad_norm": 0.8394970297813416,
"learning_rate": 1.8456640134059493e-05,
"loss": 1.3595,
"step": 922
},
{
"epoch": 0.38667783829074154,
"grad_norm": 0.5857053399085999,
"learning_rate": 1.8454964390448264e-05,
"loss": 1.4405,
"step": 923
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.987432599067688,
"learning_rate": 1.8453288646837034e-05,
"loss": 1.4044,
"step": 924
},
{
"epoch": 0.38751571009635527,
"grad_norm": 0.5757207870483398,
"learning_rate": 1.845161290322581e-05,
"loss": 1.5337,
"step": 925
},
{
"epoch": 0.3879346459991621,
"grad_norm": 1.0578022003173828,
"learning_rate": 1.8449937159614583e-05,
"loss": 1.3209,
"step": 926
},
{
"epoch": 0.388353581901969,
"grad_norm": 0.5220828652381897,
"learning_rate": 1.8448261416003353e-05,
"loss": 1.3895,
"step": 927
},
{
"epoch": 0.3887725178047759,
"grad_norm": 1.179112434387207,
"learning_rate": 1.8446585672392124e-05,
"loss": 1.3342,
"step": 928
},
{
"epoch": 0.38919145370758274,
"grad_norm": 0.646702229976654,
"learning_rate": 1.8444909928780898e-05,
"loss": 1.3178,
"step": 929
},
{
"epoch": 0.38961038961038963,
"grad_norm": 0.8807336091995239,
"learning_rate": 1.844323418516967e-05,
"loss": 1.2941,
"step": 930
},
{
"epoch": 0.39002932551319647,
"grad_norm": 0.5306640267372131,
"learning_rate": 1.8441558441558443e-05,
"loss": 1.3575,
"step": 931
},
{
"epoch": 0.39044826141600336,
"grad_norm": 0.99175626039505,
"learning_rate": 1.8439882697947217e-05,
"loss": 1.3536,
"step": 932
},
{
"epoch": 0.3908671973188102,
"grad_norm": 0.5181053280830383,
"learning_rate": 1.8438206954335988e-05,
"loss": 1.3834,
"step": 933
},
{
"epoch": 0.3912861332216171,
"grad_norm": 0.5490570664405823,
"learning_rate": 1.843653121072476e-05,
"loss": 1.3659,
"step": 934
},
{
"epoch": 0.391705069124424,
"grad_norm": 0.9038254022598267,
"learning_rate": 1.8434855467113533e-05,
"loss": 1.4163,
"step": 935
},
{
"epoch": 0.3921240050272308,
"grad_norm": 0.8316247463226318,
"learning_rate": 1.8433179723502307e-05,
"loss": 1.2176,
"step": 936
},
{
"epoch": 0.3925429409300377,
"grad_norm": 0.9821082353591919,
"learning_rate": 1.8431503979891078e-05,
"loss": 1.2935,
"step": 937
},
{
"epoch": 0.39296187683284456,
"grad_norm": 0.8063921332359314,
"learning_rate": 1.842982823627985e-05,
"loss": 1.4265,
"step": 938
},
{
"epoch": 0.39338081273565145,
"grad_norm": 0.9902571439743042,
"learning_rate": 1.8428152492668622e-05,
"loss": 1.3794,
"step": 939
},
{
"epoch": 0.3937997486384583,
"grad_norm": 0.5868860483169556,
"learning_rate": 1.8426476749057397e-05,
"loss": 1.3488,
"step": 940
},
{
"epoch": 0.3942186845412652,
"grad_norm": 1.5364601612091064,
"learning_rate": 1.8424801005446167e-05,
"loss": 1.2805,
"step": 941
},
{
"epoch": 0.3946376204440721,
"grad_norm": 0.5772972106933594,
"learning_rate": 1.842312526183494e-05,
"loss": 1.3529,
"step": 942
},
{
"epoch": 0.3950565563468789,
"grad_norm": 0.7915850877761841,
"learning_rate": 1.8421449518223712e-05,
"loss": 1.4225,
"step": 943
},
{
"epoch": 0.3954754922496858,
"grad_norm": 0.5602070689201355,
"learning_rate": 1.8419773774612486e-05,
"loss": 1.3962,
"step": 944
},
{
"epoch": 0.39589442815249265,
"grad_norm": 0.6039633750915527,
"learning_rate": 1.841809803100126e-05,
"loss": 1.4176,
"step": 945
},
{
"epoch": 0.39631336405529954,
"grad_norm": 0.7987331748008728,
"learning_rate": 1.841642228739003e-05,
"loss": 1.3436,
"step": 946
},
{
"epoch": 0.3967322999581064,
"grad_norm": 0.572047233581543,
"learning_rate": 1.8414746543778802e-05,
"loss": 1.31,
"step": 947
},
{
"epoch": 0.3971512358609133,
"grad_norm": 0.5866878032684326,
"learning_rate": 1.8413070800167576e-05,
"loss": 1.3156,
"step": 948
},
{
"epoch": 0.39757017176372017,
"grad_norm": 0.6000043749809265,
"learning_rate": 1.841139505655635e-05,
"loss": 1.3694,
"step": 949
},
{
"epoch": 0.397989107666527,
"grad_norm": 0.5844922661781311,
"learning_rate": 1.840971931294512e-05,
"loss": 1.3165,
"step": 950
},
{
"epoch": 0.3984080435693339,
"grad_norm": 0.5939188599586487,
"learning_rate": 1.840804356933389e-05,
"loss": 1.4149,
"step": 951
},
{
"epoch": 0.39882697947214074,
"grad_norm": 0.7252548933029175,
"learning_rate": 1.8406367825722666e-05,
"loss": 1.3719,
"step": 952
},
{
"epoch": 0.39924591537494764,
"grad_norm": 0.5265613794326782,
"learning_rate": 1.840469208211144e-05,
"loss": 1.4457,
"step": 953
},
{
"epoch": 0.39966485127775453,
"grad_norm": 0.8471786379814148,
"learning_rate": 1.840301633850021e-05,
"loss": 1.3068,
"step": 954
},
{
"epoch": 0.40008378718056137,
"grad_norm": 0.5706002116203308,
"learning_rate": 1.8401340594888985e-05,
"loss": 1.3501,
"step": 955
},
{
"epoch": 0.40050272308336826,
"grad_norm": 0.9383030533790588,
"learning_rate": 1.8399664851277756e-05,
"loss": 1.2473,
"step": 956
},
{
"epoch": 0.4009216589861751,
"grad_norm": 0.5859097838401794,
"learning_rate": 1.839798910766653e-05,
"loss": 1.3109,
"step": 957
},
{
"epoch": 0.401340594888982,
"grad_norm": 0.8028270602226257,
"learning_rate": 1.83963133640553e-05,
"loss": 1.4197,
"step": 958
},
{
"epoch": 0.40175953079178883,
"grad_norm": 0.5209934115409851,
"learning_rate": 1.8394637620444075e-05,
"loss": 1.3811,
"step": 959
},
{
"epoch": 0.4021784666945957,
"grad_norm": 0.6815239191055298,
"learning_rate": 1.8392961876832845e-05,
"loss": 1.3674,
"step": 960
},
{
"epoch": 0.4025974025974026,
"grad_norm": 0.5413818359375,
"learning_rate": 1.8391286133221616e-05,
"loss": 1.3582,
"step": 961
},
{
"epoch": 0.40301633850020946,
"grad_norm": 0.6124288439750671,
"learning_rate": 1.838961038961039e-05,
"loss": 1.2951,
"step": 962
},
{
"epoch": 0.40343527440301635,
"grad_norm": 0.6622788906097412,
"learning_rate": 1.8387934645999164e-05,
"loss": 1.2459,
"step": 963
},
{
"epoch": 0.4038542103058232,
"grad_norm": 0.6401671767234802,
"learning_rate": 1.838625890238794e-05,
"loss": 1.3094,
"step": 964
},
{
"epoch": 0.4042731462086301,
"grad_norm": 0.5091043710708618,
"learning_rate": 1.838458315877671e-05,
"loss": 1.3991,
"step": 965
},
{
"epoch": 0.4046920821114369,
"grad_norm": 0.6116538643836975,
"learning_rate": 1.838290741516548e-05,
"loss": 1.3644,
"step": 966
},
{
"epoch": 0.4051110180142438,
"grad_norm": 0.5258737206459045,
"learning_rate": 1.8381231671554254e-05,
"loss": 1.3097,
"step": 967
},
{
"epoch": 0.4055299539170507,
"grad_norm": 0.5490679740905762,
"learning_rate": 1.8379555927943028e-05,
"loss": 1.2985,
"step": 968
},
{
"epoch": 0.40594888981985755,
"grad_norm": 0.5685926079750061,
"learning_rate": 1.83778801843318e-05,
"loss": 1.3255,
"step": 969
},
{
"epoch": 0.40636782572266444,
"grad_norm": 0.5925977230072021,
"learning_rate": 1.837620444072057e-05,
"loss": 1.3693,
"step": 970
},
{
"epoch": 0.4067867616254713,
"grad_norm": 0.5333212614059448,
"learning_rate": 1.8374528697109344e-05,
"loss": 1.3521,
"step": 971
},
{
"epoch": 0.4072056975282782,
"grad_norm": 0.5770822763442993,
"learning_rate": 1.8372852953498118e-05,
"loss": 1.263,
"step": 972
},
{
"epoch": 0.40762463343108507,
"grad_norm": 0.6561490893363953,
"learning_rate": 1.837117720988689e-05,
"loss": 1.3494,
"step": 973
},
{
"epoch": 0.4080435693338919,
"grad_norm": 0.5791774392127991,
"learning_rate": 1.836950146627566e-05,
"loss": 1.422,
"step": 974
},
{
"epoch": 0.4084625052366988,
"grad_norm": 0.6861572861671448,
"learning_rate": 1.8367825722664433e-05,
"loss": 1.2504,
"step": 975
},
{
"epoch": 0.40888144113950564,
"grad_norm": 0.5480408072471619,
"learning_rate": 1.8366149979053208e-05,
"loss": 1.3194,
"step": 976
},
{
"epoch": 0.40930037704231254,
"grad_norm": 0.6359973549842834,
"learning_rate": 1.836447423544198e-05,
"loss": 1.2548,
"step": 977
},
{
"epoch": 0.4097193129451194,
"grad_norm": 0.5910494327545166,
"learning_rate": 1.8362798491830752e-05,
"loss": 1.2948,
"step": 978
},
{
"epoch": 0.41013824884792627,
"grad_norm": 0.5293029546737671,
"learning_rate": 1.8361122748219523e-05,
"loss": 1.3814,
"step": 979
},
{
"epoch": 0.41055718475073316,
"grad_norm": 0.5520710349082947,
"learning_rate": 1.8359447004608297e-05,
"loss": 1.4673,
"step": 980
},
{
"epoch": 0.41097612065354,
"grad_norm": 0.7098038196563721,
"learning_rate": 1.8357771260997068e-05,
"loss": 1.3521,
"step": 981
},
{
"epoch": 0.4113950565563469,
"grad_norm": 0.6136484146118164,
"learning_rate": 1.8356095517385842e-05,
"loss": 1.3526,
"step": 982
},
{
"epoch": 0.41181399245915373,
"grad_norm": 0.6027216911315918,
"learning_rate": 1.8354419773774613e-05,
"loss": 1.2975,
"step": 983
},
{
"epoch": 0.4122329283619606,
"grad_norm": 0.5980489253997803,
"learning_rate": 1.8352744030163387e-05,
"loss": 1.3214,
"step": 984
},
{
"epoch": 0.41265186426476747,
"grad_norm": 0.6312512755393982,
"learning_rate": 1.8351068286552158e-05,
"loss": 1.3157,
"step": 985
},
{
"epoch": 0.41307080016757436,
"grad_norm": 0.9795253872871399,
"learning_rate": 1.8349392542940932e-05,
"loss": 1.3644,
"step": 986
},
{
"epoch": 0.41348973607038125,
"grad_norm": 0.5261685252189636,
"learning_rate": 1.8347716799329706e-05,
"loss": 1.3392,
"step": 987
},
{
"epoch": 0.4139086719731881,
"grad_norm": 0.6193227767944336,
"learning_rate": 1.8346041055718477e-05,
"loss": 1.3444,
"step": 988
},
{
"epoch": 0.414327607875995,
"grad_norm": 0.4504217505455017,
"learning_rate": 1.8344365312107248e-05,
"loss": 1.2957,
"step": 989
},
{
"epoch": 0.4147465437788018,
"grad_norm": 0.6061900854110718,
"learning_rate": 1.834268956849602e-05,
"loss": 1.3048,
"step": 990
},
{
"epoch": 0.4151654796816087,
"grad_norm": 0.5700473189353943,
"learning_rate": 1.8341013824884796e-05,
"loss": 1.2925,
"step": 991
},
{
"epoch": 0.4155844155844156,
"grad_norm": 0.5876666307449341,
"learning_rate": 1.8339338081273567e-05,
"loss": 1.3248,
"step": 992
},
{
"epoch": 0.41600335148722245,
"grad_norm": 0.5364834666252136,
"learning_rate": 1.8337662337662337e-05,
"loss": 1.302,
"step": 993
},
{
"epoch": 0.41642228739002934,
"grad_norm": 0.7233411073684692,
"learning_rate": 1.833598659405111e-05,
"loss": 1.2268,
"step": 994
},
{
"epoch": 0.4168412232928362,
"grad_norm": 0.5373887419700623,
"learning_rate": 1.8334310850439886e-05,
"loss": 1.2544,
"step": 995
},
{
"epoch": 0.4172601591956431,
"grad_norm": 0.6809514164924622,
"learning_rate": 1.8332635106828656e-05,
"loss": 1.3745,
"step": 996
},
{
"epoch": 0.4176790950984499,
"grad_norm": 0.4963758587837219,
"learning_rate": 1.833095936321743e-05,
"loss": 1.3757,
"step": 997
},
{
"epoch": 0.4180980310012568,
"grad_norm": 0.6156362295150757,
"learning_rate": 1.83292836196062e-05,
"loss": 1.3654,
"step": 998
},
{
"epoch": 0.4185169669040637,
"grad_norm": 0.6025580167770386,
"learning_rate": 1.8327607875994975e-05,
"loss": 1.4092,
"step": 999
},
{
"epoch": 0.41893590280687054,
"grad_norm": 0.4706215262413025,
"learning_rate": 1.8325932132383746e-05,
"loss": 1.2553,
"step": 1000
},
{
"epoch": 0.41935483870967744,
"grad_norm": 0.6168606281280518,
"learning_rate": 1.832425638877252e-05,
"loss": 1.263,
"step": 1001
},
{
"epoch": 0.4197737746124843,
"grad_norm": 0.5337375402450562,
"learning_rate": 1.832258064516129e-05,
"loss": 1.2473,
"step": 1002
},
{
"epoch": 0.42019271051529117,
"grad_norm": 0.5635408759117126,
"learning_rate": 1.8320904901550065e-05,
"loss": 1.1962,
"step": 1003
},
{
"epoch": 0.420611646418098,
"grad_norm": 0.5376746654510498,
"learning_rate": 1.8319229157938836e-05,
"loss": 1.3885,
"step": 1004
},
{
"epoch": 0.4210305823209049,
"grad_norm": 0.5582394599914551,
"learning_rate": 1.831755341432761e-05,
"loss": 1.4008,
"step": 1005
},
{
"epoch": 0.4214495182237118,
"grad_norm": 0.5136025547981262,
"learning_rate": 1.831587767071638e-05,
"loss": 1.4049,
"step": 1006
},
{
"epoch": 0.42186845412651863,
"grad_norm": 0.6063160300254822,
"learning_rate": 1.8314201927105155e-05,
"loss": 1.3763,
"step": 1007
},
{
"epoch": 0.4222873900293255,
"grad_norm": 0.5643933415412903,
"learning_rate": 1.8312526183493925e-05,
"loss": 1.2941,
"step": 1008
},
{
"epoch": 0.42270632593213237,
"grad_norm": 0.5687242150306702,
"learning_rate": 1.83108504398827e-05,
"loss": 1.274,
"step": 1009
},
{
"epoch": 0.42312526183493926,
"grad_norm": 0.5480166077613831,
"learning_rate": 1.8309174696271474e-05,
"loss": 1.4593,
"step": 1010
},
{
"epoch": 0.4235441977377461,
"grad_norm": 0.5176064968109131,
"learning_rate": 1.8307498952660244e-05,
"loss": 1.3344,
"step": 1011
},
{
"epoch": 0.423963133640553,
"grad_norm": 0.7011755108833313,
"learning_rate": 1.8305823209049015e-05,
"loss": 1.3585,
"step": 1012
},
{
"epoch": 0.4243820695433599,
"grad_norm": 0.5866506695747375,
"learning_rate": 1.830414746543779e-05,
"loss": 1.4471,
"step": 1013
},
{
"epoch": 0.4248010054461667,
"grad_norm": 0.7366320490837097,
"learning_rate": 1.8302471721826563e-05,
"loss": 1.2817,
"step": 1014
},
{
"epoch": 0.4252199413489736,
"grad_norm": 0.47257664799690247,
"learning_rate": 1.8300795978215334e-05,
"loss": 1.4617,
"step": 1015
},
{
"epoch": 0.42563887725178046,
"grad_norm": 0.8454380035400391,
"learning_rate": 1.8299120234604105e-05,
"loss": 1.2917,
"step": 1016
},
{
"epoch": 0.42605781315458735,
"grad_norm": 0.7998992800712585,
"learning_rate": 1.829744449099288e-05,
"loss": 1.2522,
"step": 1017
},
{
"epoch": 0.42647674905739424,
"grad_norm": 0.6851061582565308,
"learning_rate": 1.8295768747381653e-05,
"loss": 1.2284,
"step": 1018
},
{
"epoch": 0.4268956849602011,
"grad_norm": 0.9322745203971863,
"learning_rate": 1.8294093003770427e-05,
"loss": 1.2665,
"step": 1019
},
{
"epoch": 0.427314620863008,
"grad_norm": 0.5133146047592163,
"learning_rate": 1.8292417260159198e-05,
"loss": 1.3387,
"step": 1020
},
{
"epoch": 0.4277335567658148,
"grad_norm": 0.6651824712753296,
"learning_rate": 1.829074151654797e-05,
"loss": 1.3604,
"step": 1021
},
{
"epoch": 0.4281524926686217,
"grad_norm": 0.6780356764793396,
"learning_rate": 1.8289065772936743e-05,
"loss": 1.3237,
"step": 1022
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.6781860589981079,
"learning_rate": 1.8287390029325517e-05,
"loss": 1.2915,
"step": 1023
},
{
"epoch": 0.42899036447423544,
"grad_norm": 0.7052691578865051,
"learning_rate": 1.8285714285714288e-05,
"loss": 1.3323,
"step": 1024
},
{
"epoch": 0.42940930037704234,
"grad_norm": 0.5380198955535889,
"learning_rate": 1.828403854210306e-05,
"loss": 1.2701,
"step": 1025
},
{
"epoch": 0.4298282362798492,
"grad_norm": 1.0681602954864502,
"learning_rate": 1.8282362798491833e-05,
"loss": 1.2479,
"step": 1026
},
{
"epoch": 0.43024717218265607,
"grad_norm": 0.6280472874641418,
"learning_rate": 1.8280687054880603e-05,
"loss": 1.3451,
"step": 1027
},
{
"epoch": 0.4306661080854629,
"grad_norm": 0.6782708764076233,
"learning_rate": 1.8279011311269378e-05,
"loss": 1.2888,
"step": 1028
},
{
"epoch": 0.4310850439882698,
"grad_norm": 0.5504584908485413,
"learning_rate": 1.8277335567658148e-05,
"loss": 1.3863,
"step": 1029
},
{
"epoch": 0.43150397989107664,
"grad_norm": 0.557868242263794,
"learning_rate": 1.8275659824046922e-05,
"loss": 1.3399,
"step": 1030
},
{
"epoch": 0.43192291579388353,
"grad_norm": 1.2210026979446411,
"learning_rate": 1.8273984080435693e-05,
"loss": 1.2104,
"step": 1031
},
{
"epoch": 0.4323418516966904,
"grad_norm": 0.6031039357185364,
"learning_rate": 1.8272308336824467e-05,
"loss": 1.4665,
"step": 1032
},
{
"epoch": 0.43276078759949727,
"grad_norm": 0.9053773283958435,
"learning_rate": 1.827063259321324e-05,
"loss": 1.3219,
"step": 1033
},
{
"epoch": 0.43317972350230416,
"grad_norm": 0.7819105386734009,
"learning_rate": 1.8268956849602012e-05,
"loss": 1.3573,
"step": 1034
},
{
"epoch": 0.433598659405111,
"grad_norm": 0.967070996761322,
"learning_rate": 1.8267281105990783e-05,
"loss": 1.4162,
"step": 1035
},
{
"epoch": 0.4340175953079179,
"grad_norm": 1.1320526599884033,
"learning_rate": 1.8265605362379557e-05,
"loss": 1.4091,
"step": 1036
},
{
"epoch": 0.4344365312107248,
"grad_norm": 0.7298519611358643,
"learning_rate": 1.826392961876833e-05,
"loss": 1.2992,
"step": 1037
},
{
"epoch": 0.4348554671135316,
"grad_norm": 1.2868174314498901,
"learning_rate": 1.8262253875157102e-05,
"loss": 1.2175,
"step": 1038
},
{
"epoch": 0.4352744030163385,
"grad_norm": 0.5379632711410522,
"learning_rate": 1.8260578131545873e-05,
"loss": 1.4073,
"step": 1039
},
{
"epoch": 0.43569333891914536,
"grad_norm": 1.5946475267410278,
"learning_rate": 1.8258902387934647e-05,
"loss": 1.2777,
"step": 1040
},
{
"epoch": 0.43611227482195225,
"grad_norm": 0.7467783093452454,
"learning_rate": 1.825722664432342e-05,
"loss": 1.3326,
"step": 1041
},
{
"epoch": 0.4365312107247591,
"grad_norm": 0.8855487108230591,
"learning_rate": 1.8255550900712195e-05,
"loss": 1.2844,
"step": 1042
},
{
"epoch": 0.436950146627566,
"grad_norm": 0.9139724969863892,
"learning_rate": 1.8253875157100966e-05,
"loss": 1.2526,
"step": 1043
},
{
"epoch": 0.4373690825303729,
"grad_norm": 0.6316019892692566,
"learning_rate": 1.8252199413489736e-05,
"loss": 1.2152,
"step": 1044
},
{
"epoch": 0.4377880184331797,
"grad_norm": 1.1355504989624023,
"learning_rate": 1.825052366987851e-05,
"loss": 1.3436,
"step": 1045
},
{
"epoch": 0.4382069543359866,
"grad_norm": 0.5658217668533325,
"learning_rate": 1.8248847926267285e-05,
"loss": 1.2746,
"step": 1046
},
{
"epoch": 0.43862589023879345,
"grad_norm": 1.0458660125732422,
"learning_rate": 1.8247172182656055e-05,
"loss": 1.2238,
"step": 1047
},
{
"epoch": 0.43904482614160034,
"grad_norm": 0.6728708744049072,
"learning_rate": 1.8245496439044826e-05,
"loss": 1.2646,
"step": 1048
},
{
"epoch": 0.4394637620444072,
"grad_norm": 0.6456686854362488,
"learning_rate": 1.82438206954336e-05,
"loss": 1.4044,
"step": 1049
},
{
"epoch": 0.4398826979472141,
"grad_norm": 0.7611178755760193,
"learning_rate": 1.8242144951822374e-05,
"loss": 1.3145,
"step": 1050
},
{
"epoch": 0.44030163385002097,
"grad_norm": 0.5446016192436218,
"learning_rate": 1.8240469208211145e-05,
"loss": 1.2648,
"step": 1051
},
{
"epoch": 0.4407205697528278,
"grad_norm": 0.6662243604660034,
"learning_rate": 1.8238793464599916e-05,
"loss": 1.3399,
"step": 1052
},
{
"epoch": 0.4411395056556347,
"grad_norm": 0.6562479138374329,
"learning_rate": 1.823711772098869e-05,
"loss": 1.3618,
"step": 1053
},
{
"epoch": 0.44155844155844154,
"grad_norm": 0.5235145688056946,
"learning_rate": 1.8235441977377464e-05,
"loss": 1.386,
"step": 1054
},
{
"epoch": 0.44197737746124843,
"grad_norm": 0.6828937530517578,
"learning_rate": 1.8233766233766235e-05,
"loss": 1.3648,
"step": 1055
},
{
"epoch": 0.4423963133640553,
"grad_norm": 0.6108437776565552,
"learning_rate": 1.823209049015501e-05,
"loss": 1.3764,
"step": 1056
},
{
"epoch": 0.44281524926686217,
"grad_norm": 0.5251436233520508,
"learning_rate": 1.823041474654378e-05,
"loss": 1.3468,
"step": 1057
},
{
"epoch": 0.44323418516966906,
"grad_norm": 0.6740531921386719,
"learning_rate": 1.822873900293255e-05,
"loss": 1.3186,
"step": 1058
},
{
"epoch": 0.4436531210724759,
"grad_norm": 0.5247359275817871,
"learning_rate": 1.8227063259321325e-05,
"loss": 1.3343,
"step": 1059
},
{
"epoch": 0.4440720569752828,
"grad_norm": 0.821925938129425,
"learning_rate": 1.82253875157101e-05,
"loss": 1.224,
"step": 1060
},
{
"epoch": 0.44449099287808963,
"grad_norm": 0.5397830009460449,
"learning_rate": 1.822371177209887e-05,
"loss": 1.3741,
"step": 1061
},
{
"epoch": 0.4449099287808965,
"grad_norm": 0.5710780024528503,
"learning_rate": 1.822203602848764e-05,
"loss": 1.2652,
"step": 1062
},
{
"epoch": 0.4453288646837034,
"grad_norm": 0.5801980495452881,
"learning_rate": 1.8220360284876414e-05,
"loss": 1.3644,
"step": 1063
},
{
"epoch": 0.44574780058651026,
"grad_norm": 0.5152921080589294,
"learning_rate": 1.821868454126519e-05,
"loss": 1.3002,
"step": 1064
},
{
"epoch": 0.44616673648931715,
"grad_norm": 0.6770956516265869,
"learning_rate": 1.8217008797653963e-05,
"loss": 1.3419,
"step": 1065
},
{
"epoch": 0.446585672392124,
"grad_norm": 0.5643904209136963,
"learning_rate": 1.8215333054042733e-05,
"loss": 1.2847,
"step": 1066
},
{
"epoch": 0.4470046082949309,
"grad_norm": 0.4762922525405884,
"learning_rate": 1.8213657310431504e-05,
"loss": 1.4463,
"step": 1067
},
{
"epoch": 0.4474235441977377,
"grad_norm": 0.7292268872261047,
"learning_rate": 1.8211981566820278e-05,
"loss": 1.3816,
"step": 1068
},
{
"epoch": 0.4478424801005446,
"grad_norm": 0.5174683928489685,
"learning_rate": 1.8210305823209052e-05,
"loss": 1.3471,
"step": 1069
},
{
"epoch": 0.4482614160033515,
"grad_norm": 0.6858307719230652,
"learning_rate": 1.8208630079597823e-05,
"loss": 1.2831,
"step": 1070
},
{
"epoch": 0.44868035190615835,
"grad_norm": 0.5826430320739746,
"learning_rate": 1.8206954335986594e-05,
"loss": 1.2377,
"step": 1071
},
{
"epoch": 0.44909928780896524,
"grad_norm": 0.8610800504684448,
"learning_rate": 1.8205278592375368e-05,
"loss": 1.3182,
"step": 1072
},
{
"epoch": 0.4495182237117721,
"grad_norm": 0.5191478133201599,
"learning_rate": 1.8203602848764142e-05,
"loss": 1.3731,
"step": 1073
},
{
"epoch": 0.449937159614579,
"grad_norm": 0.625361979007721,
"learning_rate": 1.8201927105152913e-05,
"loss": 1.3104,
"step": 1074
},
{
"epoch": 0.4503560955173858,
"grad_norm": 0.5182881951332092,
"learning_rate": 1.8200251361541687e-05,
"loss": 1.2699,
"step": 1075
},
{
"epoch": 0.4507750314201927,
"grad_norm": 0.5343324542045593,
"learning_rate": 1.8198575617930458e-05,
"loss": 1.2777,
"step": 1076
},
{
"epoch": 0.4511939673229996,
"grad_norm": 0.5854357481002808,
"learning_rate": 1.8196899874319232e-05,
"loss": 1.37,
"step": 1077
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.6107653379440308,
"learning_rate": 1.8195224130708003e-05,
"loss": 1.2805,
"step": 1078
},
{
"epoch": 0.45203183912861333,
"grad_norm": 0.5591204762458801,
"learning_rate": 1.8193548387096777e-05,
"loss": 1.3273,
"step": 1079
},
{
"epoch": 0.45245077503142017,
"grad_norm": 0.5599054098129272,
"learning_rate": 1.8191872643485547e-05,
"loss": 1.383,
"step": 1080
},
{
"epoch": 0.45286971093422707,
"grad_norm": 0.7507011294364929,
"learning_rate": 1.819019689987432e-05,
"loss": 1.3572,
"step": 1081
},
{
"epoch": 0.45328864683703396,
"grad_norm": 0.5637994408607483,
"learning_rate": 1.8188521156263092e-05,
"loss": 1.3475,
"step": 1082
},
{
"epoch": 0.4537075827398408,
"grad_norm": 0.5319277048110962,
"learning_rate": 1.8186845412651866e-05,
"loss": 1.3858,
"step": 1083
},
{
"epoch": 0.4541265186426477,
"grad_norm": 0.48302850127220154,
"learning_rate": 1.8185169669040637e-05,
"loss": 1.4242,
"step": 1084
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.7405598759651184,
"learning_rate": 1.818349392542941e-05,
"loss": 1.2827,
"step": 1085
},
{
"epoch": 0.4549643904482614,
"grad_norm": 0.6391623020172119,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.2835,
"step": 1086
},
{
"epoch": 0.45538332635106826,
"grad_norm": 0.667966365814209,
"learning_rate": 1.8180142438206956e-05,
"loss": 1.3476,
"step": 1087
},
{
"epoch": 0.45580226225387516,
"grad_norm": 0.5612936019897461,
"learning_rate": 1.817846669459573e-05,
"loss": 1.4179,
"step": 1088
},
{
"epoch": 0.45622119815668205,
"grad_norm": 0.6100994944572449,
"learning_rate": 1.81767909509845e-05,
"loss": 1.3792,
"step": 1089
},
{
"epoch": 0.4566401340594889,
"grad_norm": 0.5509599447250366,
"learning_rate": 1.8175115207373272e-05,
"loss": 1.2592,
"step": 1090
},
{
"epoch": 0.4570590699622958,
"grad_norm": 0.6930477619171143,
"learning_rate": 1.8173439463762046e-05,
"loss": 1.4418,
"step": 1091
},
{
"epoch": 0.4574780058651026,
"grad_norm": 0.5277242064476013,
"learning_rate": 1.817176372015082e-05,
"loss": 1.2418,
"step": 1092
},
{
"epoch": 0.4578969417679095,
"grad_norm": 0.605899453163147,
"learning_rate": 1.817008797653959e-05,
"loss": 1.3442,
"step": 1093
},
{
"epoch": 0.45831587767071635,
"grad_norm": 0.5759694576263428,
"learning_rate": 1.816841223292836e-05,
"loss": 1.2769,
"step": 1094
},
{
"epoch": 0.45873481357352325,
"grad_norm": 0.676699161529541,
"learning_rate": 1.8166736489317136e-05,
"loss": 1.3406,
"step": 1095
},
{
"epoch": 0.45915374947633014,
"grad_norm": 0.5942062139511108,
"learning_rate": 1.816506074570591e-05,
"loss": 1.3825,
"step": 1096
},
{
"epoch": 0.459572685379137,
"grad_norm": 0.5384318828582764,
"learning_rate": 1.816338500209468e-05,
"loss": 1.4372,
"step": 1097
},
{
"epoch": 0.4599916212819439,
"grad_norm": 0.6452490091323853,
"learning_rate": 1.8161709258483455e-05,
"loss": 1.1995,
"step": 1098
},
{
"epoch": 0.4604105571847507,
"grad_norm": 0.7278120517730713,
"learning_rate": 1.8160033514872225e-05,
"loss": 1.2887,
"step": 1099
},
{
"epoch": 0.4608294930875576,
"grad_norm": 0.5139358043670654,
"learning_rate": 1.8158357771261e-05,
"loss": 1.3544,
"step": 1100
},
{
"epoch": 0.4612484289903645,
"grad_norm": 0.5915756225585938,
"learning_rate": 1.815668202764977e-05,
"loss": 1.3052,
"step": 1101
},
{
"epoch": 0.46166736489317134,
"grad_norm": 0.6267953515052795,
"learning_rate": 1.8155006284038544e-05,
"loss": 1.2779,
"step": 1102
},
{
"epoch": 0.46208630079597823,
"grad_norm": 0.4867702126502991,
"learning_rate": 1.8153330540427315e-05,
"loss": 1.3432,
"step": 1103
},
{
"epoch": 0.46250523669878507,
"grad_norm": 0.7477414011955261,
"learning_rate": 1.815165479681609e-05,
"loss": 1.3504,
"step": 1104
},
{
"epoch": 0.46292417260159197,
"grad_norm": 0.5418640375137329,
"learning_rate": 1.814997905320486e-05,
"loss": 1.3845,
"step": 1105
},
{
"epoch": 0.4633431085043988,
"grad_norm": 0.6884581446647644,
"learning_rate": 1.8148303309593634e-05,
"loss": 1.3813,
"step": 1106
},
{
"epoch": 0.4637620444072057,
"grad_norm": 0.47973325848579407,
"learning_rate": 1.8146627565982405e-05,
"loss": 1.3473,
"step": 1107
},
{
"epoch": 0.4641809803100126,
"grad_norm": 0.6327143311500549,
"learning_rate": 1.814495182237118e-05,
"loss": 1.3873,
"step": 1108
},
{
"epoch": 0.46459991621281943,
"grad_norm": 0.5946708917617798,
"learning_rate": 1.814327607875995e-05,
"loss": 1.4171,
"step": 1109
},
{
"epoch": 0.4650188521156263,
"grad_norm": 0.5836149454116821,
"learning_rate": 1.8141600335148724e-05,
"loss": 1.3661,
"step": 1110
},
{
"epoch": 0.46543778801843316,
"grad_norm": 0.5664911866188049,
"learning_rate": 1.8139924591537498e-05,
"loss": 1.3206,
"step": 1111
},
{
"epoch": 0.46585672392124006,
"grad_norm": 0.5648526549339294,
"learning_rate": 1.813824884792627e-05,
"loss": 1.291,
"step": 1112
},
{
"epoch": 0.4662756598240469,
"grad_norm": 0.5593236088752747,
"learning_rate": 1.813657310431504e-05,
"loss": 1.2954,
"step": 1113
},
{
"epoch": 0.4666945957268538,
"grad_norm": 0.6017231345176697,
"learning_rate": 1.8134897360703814e-05,
"loss": 1.3553,
"step": 1114
},
{
"epoch": 0.4671135316296607,
"grad_norm": 0.5916247963905334,
"learning_rate": 1.8133221617092588e-05,
"loss": 1.3504,
"step": 1115
},
{
"epoch": 0.4675324675324675,
"grad_norm": 0.5841718912124634,
"learning_rate": 1.813154587348136e-05,
"loss": 1.3576,
"step": 1116
},
{
"epoch": 0.4679514034352744,
"grad_norm": 0.5624353289604187,
"learning_rate": 1.812987012987013e-05,
"loss": 1.2994,
"step": 1117
},
{
"epoch": 0.46837033933808125,
"grad_norm": 0.5776383280754089,
"learning_rate": 1.8128194386258903e-05,
"loss": 1.2497,
"step": 1118
},
{
"epoch": 0.46878927524088815,
"grad_norm": 0.5227569341659546,
"learning_rate": 1.8126518642647677e-05,
"loss": 1.2978,
"step": 1119
},
{
"epoch": 0.46920821114369504,
"grad_norm": 0.5693500638008118,
"learning_rate": 1.812484289903645e-05,
"loss": 1.2476,
"step": 1120
},
{
"epoch": 0.4696271470465019,
"grad_norm": 0.5316402912139893,
"learning_rate": 1.8123167155425222e-05,
"loss": 1.356,
"step": 1121
},
{
"epoch": 0.4700460829493088,
"grad_norm": 0.5823123455047607,
"learning_rate": 1.8121491411813993e-05,
"loss": 1.357,
"step": 1122
},
{
"epoch": 0.4704650188521156,
"grad_norm": 0.6034800410270691,
"learning_rate": 1.8119815668202767e-05,
"loss": 1.4305,
"step": 1123
},
{
"epoch": 0.4708839547549225,
"grad_norm": 0.5354906916618347,
"learning_rate": 1.8118139924591538e-05,
"loss": 1.3806,
"step": 1124
},
{
"epoch": 0.47130289065772935,
"grad_norm": 0.5435880422592163,
"learning_rate": 1.8116464180980312e-05,
"loss": 1.3818,
"step": 1125
},
{
"epoch": 0.47172182656053624,
"grad_norm": 0.540764570236206,
"learning_rate": 1.8114788437369083e-05,
"loss": 1.3036,
"step": 1126
},
{
"epoch": 0.47214076246334313,
"grad_norm": 0.5142596364021301,
"learning_rate": 1.8113112693757857e-05,
"loss": 1.3244,
"step": 1127
},
{
"epoch": 0.47255969836614997,
"grad_norm": 0.6185094714164734,
"learning_rate": 1.8111436950146628e-05,
"loss": 1.1574,
"step": 1128
},
{
"epoch": 0.47297863426895687,
"grad_norm": 0.5076654553413391,
"learning_rate": 1.8109761206535402e-05,
"loss": 1.4598,
"step": 1129
},
{
"epoch": 0.4733975701717637,
"grad_norm": 0.6925384402275085,
"learning_rate": 1.8108085462924173e-05,
"loss": 1.2539,
"step": 1130
},
{
"epoch": 0.4738165060745706,
"grad_norm": 0.5750361680984497,
"learning_rate": 1.8106409719312947e-05,
"loss": 1.3353,
"step": 1131
},
{
"epoch": 0.47423544197737744,
"grad_norm": 0.7001599669456482,
"learning_rate": 1.8104733975701717e-05,
"loss": 1.1343,
"step": 1132
},
{
"epoch": 0.47465437788018433,
"grad_norm": 0.6924251914024353,
"learning_rate": 1.810305823209049e-05,
"loss": 1.3168,
"step": 1133
},
{
"epoch": 0.4750733137829912,
"grad_norm": 0.5786885023117065,
"learning_rate": 1.8101382488479266e-05,
"loss": 1.305,
"step": 1134
},
{
"epoch": 0.47549224968579806,
"grad_norm": 0.5598960518836975,
"learning_rate": 1.8099706744868036e-05,
"loss": 1.3827,
"step": 1135
},
{
"epoch": 0.47591118558860496,
"grad_norm": 0.6162183284759521,
"learning_rate": 1.8098031001256807e-05,
"loss": 1.336,
"step": 1136
},
{
"epoch": 0.4763301214914118,
"grad_norm": 0.5528692603111267,
"learning_rate": 1.809635525764558e-05,
"loss": 1.3069,
"step": 1137
},
{
"epoch": 0.4767490573942187,
"grad_norm": 0.5544486045837402,
"learning_rate": 1.8094679514034355e-05,
"loss": 1.4149,
"step": 1138
},
{
"epoch": 0.47716799329702553,
"grad_norm": 0.6026493310928345,
"learning_rate": 1.8093003770423126e-05,
"loss": 1.3589,
"step": 1139
},
{
"epoch": 0.4775869291998324,
"grad_norm": 0.7158576846122742,
"learning_rate": 1.8091328026811897e-05,
"loss": 1.2462,
"step": 1140
},
{
"epoch": 0.4780058651026393,
"grad_norm": 0.6430186629295349,
"learning_rate": 1.808965228320067e-05,
"loss": 1.2165,
"step": 1141
},
{
"epoch": 0.47842480100544615,
"grad_norm": 0.6099768877029419,
"learning_rate": 1.8087976539589445e-05,
"loss": 1.2787,
"step": 1142
},
{
"epoch": 0.47884373690825305,
"grad_norm": 0.5819302201271057,
"learning_rate": 1.808630079597822e-05,
"loss": 1.3125,
"step": 1143
},
{
"epoch": 0.4792626728110599,
"grad_norm": 0.609936535358429,
"learning_rate": 1.808462505236699e-05,
"loss": 1.359,
"step": 1144
},
{
"epoch": 0.4796816087138668,
"grad_norm": 0.6110237836837769,
"learning_rate": 1.808294930875576e-05,
"loss": 1.354,
"step": 1145
},
{
"epoch": 0.4801005446166737,
"grad_norm": 0.6096034646034241,
"learning_rate": 1.8081273565144535e-05,
"loss": 1.3211,
"step": 1146
},
{
"epoch": 0.4805194805194805,
"grad_norm": 0.6327634453773499,
"learning_rate": 1.807959782153331e-05,
"loss": 1.2609,
"step": 1147
},
{
"epoch": 0.4809384164222874,
"grad_norm": 0.48535141348838806,
"learning_rate": 1.807792207792208e-05,
"loss": 1.4099,
"step": 1148
},
{
"epoch": 0.48135735232509425,
"grad_norm": 0.7460819482803345,
"learning_rate": 1.807624633431085e-05,
"loss": 1.2552,
"step": 1149
},
{
"epoch": 0.48177628822790114,
"grad_norm": 0.5781182646751404,
"learning_rate": 1.8074570590699625e-05,
"loss": 1.3387,
"step": 1150
},
{
"epoch": 0.482195224130708,
"grad_norm": 0.47077783942222595,
"learning_rate": 1.80728948470884e-05,
"loss": 1.3174,
"step": 1151
},
{
"epoch": 0.48261416003351487,
"grad_norm": 0.5862193703651428,
"learning_rate": 1.807121910347717e-05,
"loss": 1.3355,
"step": 1152
},
{
"epoch": 0.48303309593632177,
"grad_norm": 0.681327223777771,
"learning_rate": 1.8069543359865944e-05,
"loss": 1.3116,
"step": 1153
},
{
"epoch": 0.4834520318391286,
"grad_norm": 0.6501558423042297,
"learning_rate": 1.8067867616254714e-05,
"loss": 1.2705,
"step": 1154
},
{
"epoch": 0.4838709677419355,
"grad_norm": 0.6103442311286926,
"learning_rate": 1.8066191872643485e-05,
"loss": 1.4161,
"step": 1155
},
{
"epoch": 0.48428990364474234,
"grad_norm": 0.7739853858947754,
"learning_rate": 1.806451612903226e-05,
"loss": 1.2534,
"step": 1156
},
{
"epoch": 0.48470883954754923,
"grad_norm": 0.7075815200805664,
"learning_rate": 1.8062840385421033e-05,
"loss": 1.3316,
"step": 1157
},
{
"epoch": 0.48512777545035607,
"grad_norm": 0.7182810306549072,
"learning_rate": 1.8061164641809804e-05,
"loss": 1.2406,
"step": 1158
},
{
"epoch": 0.48554671135316296,
"grad_norm": 0.6222896575927734,
"learning_rate": 1.8059488898198575e-05,
"loss": 1.232,
"step": 1159
},
{
"epoch": 0.48596564725596986,
"grad_norm": 0.6076582074165344,
"learning_rate": 1.805781315458735e-05,
"loss": 1.3484,
"step": 1160
},
{
"epoch": 0.4863845831587767,
"grad_norm": 0.8611488342285156,
"learning_rate": 1.8056137410976123e-05,
"loss": 1.2146,
"step": 1161
},
{
"epoch": 0.4868035190615836,
"grad_norm": 0.5255362391471863,
"learning_rate": 1.8054461667364894e-05,
"loss": 1.4347,
"step": 1162
},
{
"epoch": 0.48722245496439043,
"grad_norm": 0.8451843857765198,
"learning_rate": 1.8052785923753665e-05,
"loss": 1.3022,
"step": 1163
},
{
"epoch": 0.4876413908671973,
"grad_norm": 0.6206266283988953,
"learning_rate": 1.805111018014244e-05,
"loss": 1.2237,
"step": 1164
},
{
"epoch": 0.4880603267700042,
"grad_norm": 0.5774354338645935,
"learning_rate": 1.8049434436531213e-05,
"loss": 1.268,
"step": 1165
},
{
"epoch": 0.48847926267281105,
"grad_norm": 0.9592013955116272,
"learning_rate": 1.8047758692919987e-05,
"loss": 1.2488,
"step": 1166
},
{
"epoch": 0.48889819857561795,
"grad_norm": 0.759455144405365,
"learning_rate": 1.8046082949308758e-05,
"loss": 1.2773,
"step": 1167
},
{
"epoch": 0.4893171344784248,
"grad_norm": 0.5964170694351196,
"learning_rate": 1.804440720569753e-05,
"loss": 1.3743,
"step": 1168
},
{
"epoch": 0.4897360703812317,
"grad_norm": 0.7656179070472717,
"learning_rate": 1.8042731462086303e-05,
"loss": 1.355,
"step": 1169
},
{
"epoch": 0.4901550062840385,
"grad_norm": 0.6207166314125061,
"learning_rate": 1.8041055718475077e-05,
"loss": 1.2902,
"step": 1170
},
{
"epoch": 0.4905739421868454,
"grad_norm": 0.6878586411476135,
"learning_rate": 1.8039379974863847e-05,
"loss": 1.3046,
"step": 1171
},
{
"epoch": 0.4909928780896523,
"grad_norm": 0.5206657648086548,
"learning_rate": 1.8037704231252618e-05,
"loss": 1.3606,
"step": 1172
},
{
"epoch": 0.49141181399245915,
"grad_norm": 0.6583306789398193,
"learning_rate": 1.8036028487641392e-05,
"loss": 1.3502,
"step": 1173
},
{
"epoch": 0.49183074989526604,
"grad_norm": 0.6136510968208313,
"learning_rate": 1.8034352744030166e-05,
"loss": 1.2949,
"step": 1174
},
{
"epoch": 0.4922496857980729,
"grad_norm": 0.8145105838775635,
"learning_rate": 1.8032677000418937e-05,
"loss": 1.3587,
"step": 1175
},
{
"epoch": 0.49266862170087977,
"grad_norm": 0.6916869282722473,
"learning_rate": 1.803100125680771e-05,
"loss": 1.3045,
"step": 1176
},
{
"epoch": 0.4930875576036866,
"grad_norm": 0.6330479979515076,
"learning_rate": 1.8029325513196482e-05,
"loss": 1.3547,
"step": 1177
},
{
"epoch": 0.4935064935064935,
"grad_norm": 0.5480479598045349,
"learning_rate": 1.8027649769585256e-05,
"loss": 1.3268,
"step": 1178
},
{
"epoch": 0.4939254294093004,
"grad_norm": 0.5610688328742981,
"learning_rate": 1.8025974025974027e-05,
"loss": 1.341,
"step": 1179
},
{
"epoch": 0.49434436531210724,
"grad_norm": 0.5203456282615662,
"learning_rate": 1.80242982823628e-05,
"loss": 1.3454,
"step": 1180
},
{
"epoch": 0.49476330121491413,
"grad_norm": 0.5318114161491394,
"learning_rate": 1.8022622538751572e-05,
"loss": 1.3553,
"step": 1181
},
{
"epoch": 0.49518223711772097,
"grad_norm": 0.7349583506584167,
"learning_rate": 1.8020946795140346e-05,
"loss": 1.3107,
"step": 1182
},
{
"epoch": 0.49560117302052786,
"grad_norm": 0.5046969056129456,
"learning_rate": 1.8019271051529117e-05,
"loss": 1.2994,
"step": 1183
},
{
"epoch": 0.49602010892333476,
"grad_norm": 0.6813132166862488,
"learning_rate": 1.801759530791789e-05,
"loss": 1.198,
"step": 1184
},
{
"epoch": 0.4964390448261416,
"grad_norm": 0.6181411147117615,
"learning_rate": 1.801591956430666e-05,
"loss": 1.2702,
"step": 1185
},
{
"epoch": 0.4968579807289485,
"grad_norm": 0.5381226539611816,
"learning_rate": 1.8014243820695436e-05,
"loss": 1.4557,
"step": 1186
},
{
"epoch": 0.49727691663175533,
"grad_norm": 0.6054872870445251,
"learning_rate": 1.8012568077084206e-05,
"loss": 1.4159,
"step": 1187
},
{
"epoch": 0.4976958525345622,
"grad_norm": 0.5704550743103027,
"learning_rate": 1.801089233347298e-05,
"loss": 1.3056,
"step": 1188
},
{
"epoch": 0.49811478843736906,
"grad_norm": 0.6517187356948853,
"learning_rate": 1.8009216589861755e-05,
"loss": 1.2846,
"step": 1189
},
{
"epoch": 0.49853372434017595,
"grad_norm": 0.5502644181251526,
"learning_rate": 1.8007540846250525e-05,
"loss": 1.3019,
"step": 1190
},
{
"epoch": 0.49895266024298285,
"grad_norm": 0.8192209601402283,
"learning_rate": 1.8005865102639296e-05,
"loss": 1.3248,
"step": 1191
},
{
"epoch": 0.4993715961457897,
"grad_norm": 0.6397109627723694,
"learning_rate": 1.800418935902807e-05,
"loss": 1.3134,
"step": 1192
},
{
"epoch": 0.4997905320485966,
"grad_norm": 0.6563698649406433,
"learning_rate": 1.8002513615416844e-05,
"loss": 1.3183,
"step": 1193
},
{
"epoch": 0.5002094679514034,
"grad_norm": 0.5266475081443787,
"learning_rate": 1.8000837871805615e-05,
"loss": 1.3474,
"step": 1194
},
{
"epoch": 0.5006284038542103,
"grad_norm": 0.6310097575187683,
"learning_rate": 1.7999162128194386e-05,
"loss": 1.3524,
"step": 1195
},
{
"epoch": 0.5010473397570172,
"grad_norm": 0.6334779262542725,
"learning_rate": 1.799748638458316e-05,
"loss": 1.2801,
"step": 1196
},
{
"epoch": 0.501466275659824,
"grad_norm": 0.663242518901825,
"learning_rate": 1.7995810640971934e-05,
"loss": 1.4329,
"step": 1197
},
{
"epoch": 0.5018852115626309,
"grad_norm": 0.5669201612472534,
"learning_rate": 1.7994134897360705e-05,
"loss": 1.3236,
"step": 1198
},
{
"epoch": 0.5023041474654378,
"grad_norm": 0.6204095482826233,
"learning_rate": 1.799245915374948e-05,
"loss": 1.3376,
"step": 1199
},
{
"epoch": 0.5027230833682447,
"grad_norm": 0.5199185609817505,
"learning_rate": 1.799078341013825e-05,
"loss": 1.3079,
"step": 1200
},
{
"epoch": 0.5031420192710515,
"grad_norm": 0.5151832699775696,
"learning_rate": 1.7989107666527024e-05,
"loss": 1.3648,
"step": 1201
},
{
"epoch": 0.5035609551738583,
"grad_norm": 0.46930187940597534,
"learning_rate": 1.7987431922915795e-05,
"loss": 1.2752,
"step": 1202
},
{
"epoch": 0.5039798910766653,
"grad_norm": 0.5534581542015076,
"learning_rate": 1.798575617930457e-05,
"loss": 1.2502,
"step": 1203
},
{
"epoch": 0.5043988269794721,
"grad_norm": 0.640783965587616,
"learning_rate": 1.798408043569334e-05,
"loss": 1.3008,
"step": 1204
},
{
"epoch": 0.504817762882279,
"grad_norm": 0.5893279314041138,
"learning_rate": 1.7982404692082114e-05,
"loss": 1.3014,
"step": 1205
},
{
"epoch": 0.5052366987850859,
"grad_norm": 0.49928784370422363,
"learning_rate": 1.7980728948470884e-05,
"loss": 1.3347,
"step": 1206
},
{
"epoch": 0.5056556346878928,
"grad_norm": 0.5527677536010742,
"learning_rate": 1.797905320485966e-05,
"loss": 1.3459,
"step": 1207
},
{
"epoch": 0.5060745705906996,
"grad_norm": 0.5975616574287415,
"learning_rate": 1.7977377461248433e-05,
"loss": 1.3116,
"step": 1208
},
{
"epoch": 0.5064935064935064,
"grad_norm": 0.5838543176651001,
"learning_rate": 1.7975701717637203e-05,
"loss": 1.2198,
"step": 1209
},
{
"epoch": 0.5069124423963134,
"grad_norm": 0.619463324546814,
"learning_rate": 1.7974025974025974e-05,
"loss": 1.302,
"step": 1210
},
{
"epoch": 0.5073313782991202,
"grad_norm": 0.5463703274726868,
"learning_rate": 1.7972350230414748e-05,
"loss": 1.2406,
"step": 1211
},
{
"epoch": 0.5077503142019271,
"grad_norm": 0.5434020161628723,
"learning_rate": 1.7970674486803522e-05,
"loss": 1.335,
"step": 1212
},
{
"epoch": 0.508169250104734,
"grad_norm": 0.5855716466903687,
"learning_rate": 1.7968998743192293e-05,
"loss": 1.3243,
"step": 1213
},
{
"epoch": 0.5085881860075409,
"grad_norm": 0.49900901317596436,
"learning_rate": 1.7967322999581064e-05,
"loss": 1.3723,
"step": 1214
},
{
"epoch": 0.5090071219103477,
"grad_norm": 0.48565396666526794,
"learning_rate": 1.7965647255969838e-05,
"loss": 1.3522,
"step": 1215
},
{
"epoch": 0.5094260578131546,
"grad_norm": 0.643878698348999,
"learning_rate": 1.7963971512358612e-05,
"loss": 1.2914,
"step": 1216
},
{
"epoch": 0.5098449937159615,
"grad_norm": 0.46946290135383606,
"learning_rate": 1.7962295768747383e-05,
"loss": 1.4005,
"step": 1217
},
{
"epoch": 0.5102639296187683,
"grad_norm": 0.5753692388534546,
"learning_rate": 1.7960620025136153e-05,
"loss": 1.2426,
"step": 1218
},
{
"epoch": 0.5106828655215752,
"grad_norm": 0.49606966972351074,
"learning_rate": 1.7958944281524928e-05,
"loss": 1.4139,
"step": 1219
},
{
"epoch": 0.5111018014243821,
"grad_norm": 0.599481999874115,
"learning_rate": 1.7957268537913702e-05,
"loss": 1.2502,
"step": 1220
},
{
"epoch": 0.511520737327189,
"grad_norm": 0.6043007373809814,
"learning_rate": 1.7955592794302472e-05,
"loss": 1.3828,
"step": 1221
},
{
"epoch": 0.5119396732299958,
"grad_norm": 0.5853987336158752,
"learning_rate": 1.7953917050691247e-05,
"loss": 1.4044,
"step": 1222
},
{
"epoch": 0.5123586091328027,
"grad_norm": 0.616538405418396,
"learning_rate": 1.7952241307080017e-05,
"loss": 1.2093,
"step": 1223
},
{
"epoch": 0.5127775450356096,
"grad_norm": 0.5173774361610413,
"learning_rate": 1.795056556346879e-05,
"loss": 1.4132,
"step": 1224
},
{
"epoch": 0.5131964809384164,
"grad_norm": 0.5851691961288452,
"learning_rate": 1.7948889819857562e-05,
"loss": 1.3407,
"step": 1225
},
{
"epoch": 0.5136154168412232,
"grad_norm": 0.545215368270874,
"learning_rate": 1.7947214076246336e-05,
"loss": 1.2557,
"step": 1226
},
{
"epoch": 0.5140343527440302,
"grad_norm": 0.5817573070526123,
"learning_rate": 1.7945538332635107e-05,
"loss": 1.4913,
"step": 1227
},
{
"epoch": 0.514453288646837,
"grad_norm": 0.6271103620529175,
"learning_rate": 1.794386258902388e-05,
"loss": 1.3123,
"step": 1228
},
{
"epoch": 0.5148722245496439,
"grad_norm": 0.5343185067176819,
"learning_rate": 1.7942186845412652e-05,
"loss": 1.2736,
"step": 1229
},
{
"epoch": 0.5152911604524508,
"grad_norm": 0.568390965461731,
"learning_rate": 1.7940511101801426e-05,
"loss": 1.2693,
"step": 1230
},
{
"epoch": 0.5157100963552577,
"grad_norm": 0.8113637566566467,
"learning_rate": 1.79388353581902e-05,
"loss": 1.1909,
"step": 1231
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.5547187328338623,
"learning_rate": 1.793715961457897e-05,
"loss": 1.3286,
"step": 1232
},
{
"epoch": 0.5165479681608713,
"grad_norm": 0.7221224904060364,
"learning_rate": 1.7935483870967742e-05,
"loss": 1.2616,
"step": 1233
},
{
"epoch": 0.5169669040636783,
"grad_norm": 0.5355010628700256,
"learning_rate": 1.7933808127356516e-05,
"loss": 1.2733,
"step": 1234
},
{
"epoch": 0.5173858399664851,
"grad_norm": 0.5288854837417603,
"learning_rate": 1.793213238374529e-05,
"loss": 1.2773,
"step": 1235
},
{
"epoch": 0.517804775869292,
"grad_norm": 0.6158447265625,
"learning_rate": 1.793045664013406e-05,
"loss": 1.2732,
"step": 1236
},
{
"epoch": 0.5182237117720989,
"grad_norm": 0.5706437826156616,
"learning_rate": 1.792878089652283e-05,
"loss": 1.3005,
"step": 1237
},
{
"epoch": 0.5186426476749058,
"grad_norm": 0.7265235185623169,
"learning_rate": 1.7927105152911606e-05,
"loss": 1.258,
"step": 1238
},
{
"epoch": 0.5190615835777126,
"grad_norm": 0.59310382604599,
"learning_rate": 1.792542940930038e-05,
"loss": 1.2875,
"step": 1239
},
{
"epoch": 0.5194805194805194,
"grad_norm": 0.531716525554657,
"learning_rate": 1.792375366568915e-05,
"loss": 1.2751,
"step": 1240
},
{
"epoch": 0.5198994553833264,
"grad_norm": 1.1989325284957886,
"learning_rate": 1.792207792207792e-05,
"loss": 1.2632,
"step": 1241
},
{
"epoch": 0.5203183912861332,
"grad_norm": 0.5530804991722107,
"learning_rate": 1.7920402178466695e-05,
"loss": 1.3474,
"step": 1242
},
{
"epoch": 0.5207373271889401,
"grad_norm": 0.6528697609901428,
"learning_rate": 1.791872643485547e-05,
"loss": 1.2502,
"step": 1243
},
{
"epoch": 0.521156263091747,
"grad_norm": 0.6965655088424683,
"learning_rate": 1.7917050691244244e-05,
"loss": 1.2548,
"step": 1244
},
{
"epoch": 0.5215751989945538,
"grad_norm": 0.521445095539093,
"learning_rate": 1.7915374947633014e-05,
"loss": 1.3501,
"step": 1245
},
{
"epoch": 0.5219941348973607,
"grad_norm": 0.7838431000709534,
"learning_rate": 1.7913699204021785e-05,
"loss": 1.3237,
"step": 1246
},
{
"epoch": 0.5224130708001675,
"grad_norm": 0.5507728457450867,
"learning_rate": 1.791202346041056e-05,
"loss": 1.3779,
"step": 1247
},
{
"epoch": 0.5228320067029745,
"grad_norm": 0.8786928653717041,
"learning_rate": 1.7910347716799333e-05,
"loss": 1.2335,
"step": 1248
},
{
"epoch": 0.5232509426057813,
"grad_norm": 0.6260472536087036,
"learning_rate": 1.7908671973188104e-05,
"loss": 1.2753,
"step": 1249
},
{
"epoch": 0.5236698785085881,
"grad_norm": 0.6856904029846191,
"learning_rate": 1.7906996229576875e-05,
"loss": 1.2704,
"step": 1250
},
{
"epoch": 0.5240888144113951,
"grad_norm": 0.5970323085784912,
"learning_rate": 1.790532048596565e-05,
"loss": 1.3673,
"step": 1251
},
{
"epoch": 0.5245077503142019,
"grad_norm": 0.5879629254341125,
"learning_rate": 1.790364474235442e-05,
"loss": 1.3472,
"step": 1252
},
{
"epoch": 0.5249266862170088,
"grad_norm": 0.6091963648796082,
"learning_rate": 1.7901968998743194e-05,
"loss": 1.3109,
"step": 1253
},
{
"epoch": 0.5253456221198156,
"grad_norm": 0.5746055841445923,
"learning_rate": 1.7900293255131968e-05,
"loss": 1.3595,
"step": 1254
},
{
"epoch": 0.5257645580226226,
"grad_norm": 0.6204288005828857,
"learning_rate": 1.789861751152074e-05,
"loss": 1.2801,
"step": 1255
},
{
"epoch": 0.5261834939254294,
"grad_norm": 0.5878499746322632,
"learning_rate": 1.789694176790951e-05,
"loss": 1.3157,
"step": 1256
},
{
"epoch": 0.5266024298282362,
"grad_norm": 0.5726604461669922,
"learning_rate": 1.7895266024298283e-05,
"loss": 1.3268,
"step": 1257
},
{
"epoch": 0.5270213657310432,
"grad_norm": 0.5824220180511475,
"learning_rate": 1.7893590280687058e-05,
"loss": 1.2098,
"step": 1258
},
{
"epoch": 0.52744030163385,
"grad_norm": 0.5844467878341675,
"learning_rate": 1.789191453707583e-05,
"loss": 1.2789,
"step": 1259
},
{
"epoch": 0.5278592375366569,
"grad_norm": 0.8804576992988586,
"learning_rate": 1.78902387934646e-05,
"loss": 1.236,
"step": 1260
},
{
"epoch": 0.5282781734394638,
"grad_norm": 0.6094422936439514,
"learning_rate": 1.7888563049853373e-05,
"loss": 1.2946,
"step": 1261
},
{
"epoch": 0.5286971093422707,
"grad_norm": 0.9808681011199951,
"learning_rate": 1.7886887306242147e-05,
"loss": 1.2285,
"step": 1262
},
{
"epoch": 0.5291160452450775,
"grad_norm": 0.9245563745498657,
"learning_rate": 1.7885211562630918e-05,
"loss": 1.2421,
"step": 1263
},
{
"epoch": 0.5295349811478843,
"grad_norm": 0.5571091175079346,
"learning_rate": 1.7883535819019692e-05,
"loss": 1.2987,
"step": 1264
},
{
"epoch": 0.5299539170506913,
"grad_norm": 0.48179373145103455,
"learning_rate": 1.7881860075408463e-05,
"loss": 1.4172,
"step": 1265
},
{
"epoch": 0.5303728529534981,
"grad_norm": 0.9155369997024536,
"learning_rate": 1.7880184331797237e-05,
"loss": 1.3419,
"step": 1266
},
{
"epoch": 0.530791788856305,
"grad_norm": 0.6026879549026489,
"learning_rate": 1.787850858818601e-05,
"loss": 1.2469,
"step": 1267
},
{
"epoch": 0.5312107247591119,
"grad_norm": 0.9528511762619019,
"learning_rate": 1.7876832844574782e-05,
"loss": 1.2443,
"step": 1268
},
{
"epoch": 0.5316296606619187,
"grad_norm": 0.5990762114524841,
"learning_rate": 1.7875157100963553e-05,
"loss": 1.3168,
"step": 1269
},
{
"epoch": 0.5320485965647256,
"grad_norm": 1.0758265256881714,
"learning_rate": 1.7873481357352327e-05,
"loss": 1.3656,
"step": 1270
},
{
"epoch": 0.5324675324675324,
"grad_norm": 0.5298543572425842,
"learning_rate": 1.78718056137411e-05,
"loss": 1.4026,
"step": 1271
},
{
"epoch": 0.5328864683703394,
"grad_norm": 0.7479298710823059,
"learning_rate": 1.7870129870129872e-05,
"loss": 1.3189,
"step": 1272
},
{
"epoch": 0.5333054042731462,
"grad_norm": 0.8419678807258606,
"learning_rate": 1.7868454126518642e-05,
"loss": 1.3479,
"step": 1273
},
{
"epoch": 0.533724340175953,
"grad_norm": 0.6195015907287598,
"learning_rate": 1.7866778382907417e-05,
"loss": 1.2711,
"step": 1274
},
{
"epoch": 0.53414327607876,
"grad_norm": 0.9979670643806458,
"learning_rate": 1.786510263929619e-05,
"loss": 1.3364,
"step": 1275
},
{
"epoch": 0.5345622119815668,
"grad_norm": 0.5945103168487549,
"learning_rate": 1.786342689568496e-05,
"loss": 1.3895,
"step": 1276
},
{
"epoch": 0.5349811478843737,
"grad_norm": 0.8684617877006531,
"learning_rate": 1.7861751152073736e-05,
"loss": 1.3637,
"step": 1277
},
{
"epoch": 0.5354000837871805,
"grad_norm": 0.7205759882926941,
"learning_rate": 1.7860075408462506e-05,
"loss": 1.2979,
"step": 1278
},
{
"epoch": 0.5358190196899875,
"grad_norm": 0.6859915852546692,
"learning_rate": 1.785839966485128e-05,
"loss": 1.3394,
"step": 1279
},
{
"epoch": 0.5362379555927943,
"grad_norm": 0.6070793867111206,
"learning_rate": 1.785672392124005e-05,
"loss": 1.4482,
"step": 1280
},
{
"epoch": 0.5366568914956011,
"grad_norm": 0.5955244302749634,
"learning_rate": 1.7855048177628825e-05,
"loss": 1.2846,
"step": 1281
},
{
"epoch": 0.5370758273984081,
"grad_norm": 0.589763879776001,
"learning_rate": 1.7853372434017596e-05,
"loss": 1.2811,
"step": 1282
},
{
"epoch": 0.5374947633012149,
"grad_norm": 0.6333023905754089,
"learning_rate": 1.7851696690406367e-05,
"loss": 1.369,
"step": 1283
},
{
"epoch": 0.5379136992040218,
"grad_norm": 0.6410380601882935,
"learning_rate": 1.785002094679514e-05,
"loss": 1.2449,
"step": 1284
},
{
"epoch": 0.5383326351068286,
"grad_norm": 0.6279928684234619,
"learning_rate": 1.7848345203183915e-05,
"loss": 1.3662,
"step": 1285
},
{
"epoch": 0.5387515710096356,
"grad_norm": 0.6539936065673828,
"learning_rate": 1.784666945957269e-05,
"loss": 1.3048,
"step": 1286
},
{
"epoch": 0.5391705069124424,
"grad_norm": 0.6298508048057556,
"learning_rate": 1.784499371596146e-05,
"loss": 1.1403,
"step": 1287
},
{
"epoch": 0.5395894428152492,
"grad_norm": 0.4872311055660248,
"learning_rate": 1.784331797235023e-05,
"loss": 1.3026,
"step": 1288
},
{
"epoch": 0.5400083787180562,
"grad_norm": 0.5511845350265503,
"learning_rate": 1.7841642228739005e-05,
"loss": 1.2766,
"step": 1289
},
{
"epoch": 0.540427314620863,
"grad_norm": 0.5141856074333191,
"learning_rate": 1.783996648512778e-05,
"loss": 1.2894,
"step": 1290
},
{
"epoch": 0.5408462505236699,
"grad_norm": 0.5504962801933289,
"learning_rate": 1.783829074151655e-05,
"loss": 1.3424,
"step": 1291
},
{
"epoch": 0.5412651864264767,
"grad_norm": 0.6576582193374634,
"learning_rate": 1.783661499790532e-05,
"loss": 1.3112,
"step": 1292
},
{
"epoch": 0.5416841223292836,
"grad_norm": 0.6444346308708191,
"learning_rate": 1.7834939254294095e-05,
"loss": 1.3218,
"step": 1293
},
{
"epoch": 0.5421030582320905,
"grad_norm": 0.6057987809181213,
"learning_rate": 1.783326351068287e-05,
"loss": 1.3376,
"step": 1294
},
{
"epoch": 0.5425219941348973,
"grad_norm": 0.5632282495498657,
"learning_rate": 1.783158776707164e-05,
"loss": 1.3227,
"step": 1295
},
{
"epoch": 0.5429409300377043,
"grad_norm": 0.6084432601928711,
"learning_rate": 1.782991202346041e-05,
"loss": 1.3502,
"step": 1296
},
{
"epoch": 0.5433598659405111,
"grad_norm": 0.5181003212928772,
"learning_rate": 1.7828236279849184e-05,
"loss": 1.3187,
"step": 1297
},
{
"epoch": 0.543778801843318,
"grad_norm": 0.557337760925293,
"learning_rate": 1.782656053623796e-05,
"loss": 1.2896,
"step": 1298
},
{
"epoch": 0.5441977377461248,
"grad_norm": 0.5090380311012268,
"learning_rate": 1.782488479262673e-05,
"loss": 1.3751,
"step": 1299
},
{
"epoch": 0.5446166736489317,
"grad_norm": 0.4427342414855957,
"learning_rate": 1.7823209049015503e-05,
"loss": 1.2677,
"step": 1300
}
],
"logging_steps": 1.0,
"max_steps": 11935,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.6673390806892544e+18,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}