{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3770423125261835,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00041893590280687055,
"grad_norm": 8.333969116210938,
"learning_rate": 2e-05,
"loss": 3.3261,
"step": 1
},
{
"epoch": 0.0008378718056137411,
"grad_norm": 7.348582744598389,
"learning_rate": 1.9998324256388776e-05,
"loss": 3.2312,
"step": 2
},
{
"epoch": 0.0012568077084206116,
"grad_norm": 3.1899449825286865,
"learning_rate": 1.9996648512777547e-05,
"loss": 2.9318,
"step": 3
},
{
"epoch": 0.0016757436112274822,
"grad_norm": 2.177359104156494,
"learning_rate": 1.9994972769166317e-05,
"loss": 2.8876,
"step": 4
},
{
"epoch": 0.0020946795140343527,
"grad_norm": 2.1708009243011475,
"learning_rate": 1.999329702555509e-05,
"loss": 3.068,
"step": 5
},
{
"epoch": 0.0025136154168412233,
"grad_norm": 1.77362060546875,
"learning_rate": 1.9991621281943865e-05,
"loss": 2.6598,
"step": 6
},
{
"epoch": 0.002932551319648094,
"grad_norm": 1.709831714630127,
"learning_rate": 1.9989945538332636e-05,
"loss": 2.767,
"step": 7
},
{
"epoch": 0.0033514872224549644,
"grad_norm": 1.5493128299713135,
"learning_rate": 1.998826979472141e-05,
"loss": 2.7182,
"step": 8
},
{
"epoch": 0.003770423125261835,
"grad_norm": 1.39557945728302,
"learning_rate": 1.998659405111018e-05,
"loss": 2.8695,
"step": 9
},
{
"epoch": 0.0041893590280687055,
"grad_norm": 1.1357755661010742,
"learning_rate": 1.9984918307498955e-05,
"loss": 2.5497,
"step": 10
},
{
"epoch": 0.004608294930875576,
"grad_norm": 1.0683544874191284,
"learning_rate": 1.9983242563887726e-05,
"loss": 2.7363,
"step": 11
},
{
"epoch": 0.005027230833682447,
"grad_norm": 0.9199109673500061,
"learning_rate": 1.99815668202765e-05,
"loss": 2.4438,
"step": 12
},
{
"epoch": 0.005446166736489317,
"grad_norm": 0.9708887338638306,
"learning_rate": 1.997989107666527e-05,
"loss": 2.6098,
"step": 13
},
{
"epoch": 0.005865102639296188,
"grad_norm": 0.9026995301246643,
"learning_rate": 1.9978215333054045e-05,
"loss": 2.4444,
"step": 14
},
{
"epoch": 0.006284038542103058,
"grad_norm": 1.2418183088302612,
"learning_rate": 1.9976539589442816e-05,
"loss": 2.5436,
"step": 15
},
{
"epoch": 0.006702974444909929,
"grad_norm": 0.8761052489280701,
"learning_rate": 1.997486384583159e-05,
"loss": 2.3926,
"step": 16
},
{
"epoch": 0.007121910347716799,
"grad_norm": 0.8849633932113647,
"learning_rate": 1.997318810222036e-05,
"loss": 2.5935,
"step": 17
},
{
"epoch": 0.00754084625052367,
"grad_norm": 0.8368175029754639,
"learning_rate": 1.9971512358609135e-05,
"loss": 2.4684,
"step": 18
},
{
"epoch": 0.007959782153330541,
"grad_norm": 0.697807788848877,
"learning_rate": 1.9969836614997905e-05,
"loss": 2.3537,
"step": 19
},
{
"epoch": 0.008378718056137411,
"grad_norm": 0.7476556301116943,
"learning_rate": 1.996816087138668e-05,
"loss": 2.4659,
"step": 20
},
{
"epoch": 0.008797653958944282,
"grad_norm": 0.7885666489601135,
"learning_rate": 1.9966485127775454e-05,
"loss": 2.4466,
"step": 21
},
{
"epoch": 0.009216589861751152,
"grad_norm": 0.7271686792373657,
"learning_rate": 1.9964809384164224e-05,
"loss": 2.3659,
"step": 22
},
{
"epoch": 0.009635525764558023,
"grad_norm": 0.7286465764045715,
"learning_rate": 1.9963133640552995e-05,
"loss": 2.4473,
"step": 23
},
{
"epoch": 0.010054461667364893,
"grad_norm": 0.8572853207588196,
"learning_rate": 1.996145789694177e-05,
"loss": 2.3595,
"step": 24
},
{
"epoch": 0.010473397570171765,
"grad_norm": 0.8283334374427795,
"learning_rate": 1.9959782153330543e-05,
"loss": 2.5291,
"step": 25
},
{
"epoch": 0.010892333472978634,
"grad_norm": 0.6586313843727112,
"learning_rate": 1.9958106409719314e-05,
"loss": 2.3985,
"step": 26
},
{
"epoch": 0.011311269375785506,
"grad_norm": 0.6430657505989075,
"learning_rate": 1.9956430666108085e-05,
"loss": 2.3578,
"step": 27
},
{
"epoch": 0.011730205278592375,
"grad_norm": 0.6550448536872864,
"learning_rate": 1.995475492249686e-05,
"loss": 2.4077,
"step": 28
},
{
"epoch": 0.012149141181399247,
"grad_norm": 0.7592840194702148,
"learning_rate": 1.9953079178885633e-05,
"loss": 2.5008,
"step": 29
},
{
"epoch": 0.012568077084206116,
"grad_norm": 0.7858672738075256,
"learning_rate": 1.9951403435274407e-05,
"loss": 2.4674,
"step": 30
},
{
"epoch": 0.012987012987012988,
"grad_norm": 0.6130352020263672,
"learning_rate": 1.9949727691663178e-05,
"loss": 2.3526,
"step": 31
},
{
"epoch": 0.013405948889819858,
"grad_norm": 0.6684207320213318,
"learning_rate": 1.994805194805195e-05,
"loss": 2.3732,
"step": 32
},
{
"epoch": 0.013824884792626729,
"grad_norm": 0.8275600671768188,
"learning_rate": 1.9946376204440723e-05,
"loss": 2.135,
"step": 33
},
{
"epoch": 0.014243820695433599,
"grad_norm": 0.5858725309371948,
"learning_rate": 1.9944700460829494e-05,
"loss": 2.1368,
"step": 34
},
{
"epoch": 0.01466275659824047,
"grad_norm": 0.7133444547653198,
"learning_rate": 1.9943024717218268e-05,
"loss": 2.304,
"step": 35
},
{
"epoch": 0.01508169250104734,
"grad_norm": 0.5466803312301636,
"learning_rate": 1.994134897360704e-05,
"loss": 2.1682,
"step": 36
},
{
"epoch": 0.015500628403854211,
"grad_norm": 0.5196086168289185,
"learning_rate": 1.9939673229995813e-05,
"loss": 2.1546,
"step": 37
},
{
"epoch": 0.015919564306661083,
"grad_norm": 0.5088497400283813,
"learning_rate": 1.9937997486384583e-05,
"loss": 2.1018,
"step": 38
},
{
"epoch": 0.016338500209467952,
"grad_norm": 0.6117899417877197,
"learning_rate": 1.9936321742773358e-05,
"loss": 2.2346,
"step": 39
},
{
"epoch": 0.016757436112274822,
"grad_norm": 0.5710458159446716,
"learning_rate": 1.993464599916213e-05,
"loss": 2.2147,
"step": 40
},
{
"epoch": 0.01717637201508169,
"grad_norm": 0.5152861475944519,
"learning_rate": 1.9932970255550902e-05,
"loss": 2.2716,
"step": 41
},
{
"epoch": 0.017595307917888565,
"grad_norm": 0.6851192712783813,
"learning_rate": 1.9931294511939673e-05,
"loss": 2.3158,
"step": 42
},
{
"epoch": 0.018014243820695434,
"grad_norm": 0.5485531687736511,
"learning_rate": 1.9929618768328447e-05,
"loss": 2.2679,
"step": 43
},
{
"epoch": 0.018433179723502304,
"grad_norm": 0.48592010140419006,
"learning_rate": 1.992794302471722e-05,
"loss": 2.1303,
"step": 44
},
{
"epoch": 0.018852115626309174,
"grad_norm": 0.5533665418624878,
"learning_rate": 1.9926267281105992e-05,
"loss": 2.1981,
"step": 45
},
{
"epoch": 0.019271051529116047,
"grad_norm": 0.5932656526565552,
"learning_rate": 1.9924591537494763e-05,
"loss": 2.3737,
"step": 46
},
{
"epoch": 0.019689987431922917,
"grad_norm": 0.5236673951148987,
"learning_rate": 1.9922915793883537e-05,
"loss": 2.2694,
"step": 47
},
{
"epoch": 0.020108923334729786,
"grad_norm": 0.5357316732406616,
"learning_rate": 1.992124005027231e-05,
"loss": 2.2368,
"step": 48
},
{
"epoch": 0.020527859237536656,
"grad_norm": 0.5500349998474121,
"learning_rate": 1.9919564306661082e-05,
"loss": 2.213,
"step": 49
},
{
"epoch": 0.02094679514034353,
"grad_norm": 0.48040810227394104,
"learning_rate": 1.9917888563049853e-05,
"loss": 2.1892,
"step": 50
},
{
"epoch": 0.0213657310431504,
"grad_norm": 0.5716186165809631,
"learning_rate": 1.9916212819438627e-05,
"loss": 2.2039,
"step": 51
},
{
"epoch": 0.02178466694595727,
"grad_norm": 0.5564374923706055,
"learning_rate": 1.99145370758274e-05,
"loss": 2.1411,
"step": 52
},
{
"epoch": 0.022203602848764138,
"grad_norm": 0.4996980130672455,
"learning_rate": 1.9912861332216175e-05,
"loss": 2.1521,
"step": 53
},
{
"epoch": 0.02262253875157101,
"grad_norm": 0.5239240527153015,
"learning_rate": 1.9911185588604946e-05,
"loss": 2.0742,
"step": 54
},
{
"epoch": 0.02304147465437788,
"grad_norm": 0.4403076767921448,
"learning_rate": 1.9909509844993716e-05,
"loss": 1.9841,
"step": 55
},
{
"epoch": 0.02346041055718475,
"grad_norm": 0.5169032216072083,
"learning_rate": 1.990783410138249e-05,
"loss": 2.0327,
"step": 56
},
{
"epoch": 0.02387934645999162,
"grad_norm": 0.4901898503303528,
"learning_rate": 1.9906158357771265e-05,
"loss": 2.0063,
"step": 57
},
{
"epoch": 0.024298282362798494,
"grad_norm": 0.6581910252571106,
"learning_rate": 1.9904482614160035e-05,
"loss": 2.1385,
"step": 58
},
{
"epoch": 0.024717218265605363,
"grad_norm": 0.4522070586681366,
"learning_rate": 1.9902806870548806e-05,
"loss": 1.9944,
"step": 59
},
{
"epoch": 0.025136154168412233,
"grad_norm": 0.5315820574760437,
"learning_rate": 1.990113112693758e-05,
"loss": 2.1579,
"step": 60
},
{
"epoch": 0.025555090071219103,
"grad_norm": 0.4661259353160858,
"learning_rate": 1.9899455383326354e-05,
"loss": 2.1193,
"step": 61
},
{
"epoch": 0.025974025974025976,
"grad_norm": 0.4940222203731537,
"learning_rate": 1.9897779639715125e-05,
"loss": 2.0844,
"step": 62
},
{
"epoch": 0.026392961876832845,
"grad_norm": 0.46520665287971497,
"learning_rate": 1.98961038961039e-05,
"loss": 1.9306,
"step": 63
},
{
"epoch": 0.026811897779639715,
"grad_norm": 0.5645989179611206,
"learning_rate": 1.989442815249267e-05,
"loss": 2.1236,
"step": 64
},
{
"epoch": 0.027230833682446585,
"grad_norm": 0.47880157828330994,
"learning_rate": 1.989275240888144e-05,
"loss": 2.0206,
"step": 65
},
{
"epoch": 0.027649769585253458,
"grad_norm": 0.6371349692344666,
"learning_rate": 1.9891076665270215e-05,
"loss": 2.019,
"step": 66
},
{
"epoch": 0.028068705488060328,
"grad_norm": 0.5742272734642029,
"learning_rate": 1.988940092165899e-05,
"loss": 2.0899,
"step": 67
},
{
"epoch": 0.028487641390867197,
"grad_norm": 0.5579768419265747,
"learning_rate": 1.988772517804776e-05,
"loss": 2.081,
"step": 68
},
{
"epoch": 0.028906577293674067,
"grad_norm": 0.5897182822227478,
"learning_rate": 1.988604943443653e-05,
"loss": 1.9601,
"step": 69
},
{
"epoch": 0.02932551319648094,
"grad_norm": 0.46881428360939026,
"learning_rate": 1.9884373690825305e-05,
"loss": 1.9085,
"step": 70
},
{
"epoch": 0.02974444909928781,
"grad_norm": 0.6095844507217407,
"learning_rate": 1.988269794721408e-05,
"loss": 1.9762,
"step": 71
},
{
"epoch": 0.03016338500209468,
"grad_norm": 0.599513053894043,
"learning_rate": 1.988102220360285e-05,
"loss": 1.8723,
"step": 72
},
{
"epoch": 0.03058232090490155,
"grad_norm": 0.585457980632782,
"learning_rate": 1.987934645999162e-05,
"loss": 1.9209,
"step": 73
},
{
"epoch": 0.031001256807708422,
"grad_norm": 0.42224225401878357,
"learning_rate": 1.9877670716380394e-05,
"loss": 1.9186,
"step": 74
},
{
"epoch": 0.03142019271051529,
"grad_norm": 0.4566991329193115,
"learning_rate": 1.987599497276917e-05,
"loss": 2.018,
"step": 75
},
{
"epoch": 0.031839128613322165,
"grad_norm": 0.47718995809555054,
"learning_rate": 1.9874319229157943e-05,
"loss": 2.0119,
"step": 76
},
{
"epoch": 0.03225806451612903,
"grad_norm": 0.4412285089492798,
"learning_rate": 1.9872643485546713e-05,
"loss": 1.9211,
"step": 77
},
{
"epoch": 0.032677000418935905,
"grad_norm": 0.4711454212665558,
"learning_rate": 1.9870967741935484e-05,
"loss": 1.9051,
"step": 78
},
{
"epoch": 0.03309593632174277,
"grad_norm": 0.4665948450565338,
"learning_rate": 1.9869291998324258e-05,
"loss": 1.9571,
"step": 79
},
{
"epoch": 0.033514872224549644,
"grad_norm": 0.46011775732040405,
"learning_rate": 1.9867616254713032e-05,
"loss": 1.9599,
"step": 80
},
{
"epoch": 0.03393380812735652,
"grad_norm": 0.46272069215774536,
"learning_rate": 1.9865940511101803e-05,
"loss": 1.9161,
"step": 81
},
{
"epoch": 0.03435274403016338,
"grad_norm": 0.5554195046424866,
"learning_rate": 1.9864264767490574e-05,
"loss": 2.0202,
"step": 82
},
{
"epoch": 0.034771679932970256,
"grad_norm": 0.5324104428291321,
"learning_rate": 1.9862589023879348e-05,
"loss": 1.9356,
"step": 83
},
{
"epoch": 0.03519061583577713,
"grad_norm": 0.5279750823974609,
"learning_rate": 1.9860913280268122e-05,
"loss": 1.9511,
"step": 84
},
{
"epoch": 0.035609551738583996,
"grad_norm": 0.5002080202102661,
"learning_rate": 1.9859237536656893e-05,
"loss": 1.9248,
"step": 85
},
{
"epoch": 0.03602848764139087,
"grad_norm": 0.5625497102737427,
"learning_rate": 1.9857561793045667e-05,
"loss": 2.0023,
"step": 86
},
{
"epoch": 0.036447423544197735,
"grad_norm": 0.6030247807502747,
"learning_rate": 1.9855886049434438e-05,
"loss": 1.893,
"step": 87
},
{
"epoch": 0.03686635944700461,
"grad_norm": 0.4760509729385376,
"learning_rate": 1.9854210305823212e-05,
"loss": 1.8902,
"step": 88
},
{
"epoch": 0.03728529534981148,
"grad_norm": 0.6618624925613403,
"learning_rate": 1.9852534562211983e-05,
"loss": 1.9173,
"step": 89
},
{
"epoch": 0.03770423125261835,
"grad_norm": 0.47204822301864624,
"learning_rate": 1.9850858818600757e-05,
"loss": 1.9266,
"step": 90
},
{
"epoch": 0.03812316715542522,
"grad_norm": 0.5421533584594727,
"learning_rate": 1.9849183074989527e-05,
"loss": 1.9796,
"step": 91
},
{
"epoch": 0.038542103058232094,
"grad_norm": 0.48972201347351074,
"learning_rate": 1.98475073313783e-05,
"loss": 1.91,
"step": 92
},
{
"epoch": 0.03896103896103896,
"grad_norm": 0.5566658973693848,
"learning_rate": 1.9845831587767072e-05,
"loss": 1.8992,
"step": 93
},
{
"epoch": 0.03937997486384583,
"grad_norm": 0.4685937464237213,
"learning_rate": 1.9844155844155846e-05,
"loss": 1.9231,
"step": 94
},
{
"epoch": 0.0397989107666527,
"grad_norm": 0.6744531393051147,
"learning_rate": 1.9842480100544617e-05,
"loss": 1.9109,
"step": 95
},
{
"epoch": 0.04021784666945957,
"grad_norm": 0.6984325051307678,
"learning_rate": 1.984080435693339e-05,
"loss": 1.9566,
"step": 96
},
{
"epoch": 0.040636782572266446,
"grad_norm": 0.6627328991889954,
"learning_rate": 1.9839128613322162e-05,
"loss": 1.9933,
"step": 97
},
{
"epoch": 0.04105571847507331,
"grad_norm": 0.4586343765258789,
"learning_rate": 1.9837452869710936e-05,
"loss": 1.7939,
"step": 98
},
{
"epoch": 0.041474654377880185,
"grad_norm": 0.6211162805557251,
"learning_rate": 1.983577712609971e-05,
"loss": 1.9164,
"step": 99
},
{
"epoch": 0.04189359028068706,
"grad_norm": 0.9397639632225037,
"learning_rate": 1.983410138248848e-05,
"loss": 2.0262,
"step": 100
},
{
"epoch": 0.042312526183493925,
"grad_norm": 0.7698065638542175,
"learning_rate": 1.9832425638877252e-05,
"loss": 1.8979,
"step": 101
},
{
"epoch": 0.0427314620863008,
"grad_norm": 0.5800043940544128,
"learning_rate": 1.9830749895266026e-05,
"loss": 1.9483,
"step": 102
},
{
"epoch": 0.043150397989107664,
"grad_norm": 0.7634892463684082,
"learning_rate": 1.98290741516548e-05,
"loss": 1.7777,
"step": 103
},
{
"epoch": 0.04356933389191454,
"grad_norm": 0.5963580012321472,
"learning_rate": 1.982739840804357e-05,
"loss": 1.9406,
"step": 104
},
{
"epoch": 0.04398826979472141,
"grad_norm": 0.6970496773719788,
"learning_rate": 1.982572266443234e-05,
"loss": 1.967,
"step": 105
},
{
"epoch": 0.044407205697528276,
"grad_norm": 0.5826534032821655,
"learning_rate": 1.9824046920821116e-05,
"loss": 1.8577,
"step": 106
},
{
"epoch": 0.04482614160033515,
"grad_norm": 0.60413658618927,
"learning_rate": 1.982237117720989e-05,
"loss": 1.9528,
"step": 107
},
{
"epoch": 0.04524507750314202,
"grad_norm": 0.7267922759056091,
"learning_rate": 1.982069543359866e-05,
"loss": 1.7067,
"step": 108
},
{
"epoch": 0.04566401340594889,
"grad_norm": 0.6376165747642517,
"learning_rate": 1.9819019689987435e-05,
"loss": 1.9888,
"step": 109
},
{
"epoch": 0.04608294930875576,
"grad_norm": 0.5887031555175781,
"learning_rate": 1.9817343946376205e-05,
"loss": 1.7777,
"step": 110
},
{
"epoch": 0.04650188521156263,
"grad_norm": 0.6548938155174255,
"learning_rate": 1.981566820276498e-05,
"loss": 1.9222,
"step": 111
},
{
"epoch": 0.0469208211143695,
"grad_norm": 0.5757064819335938,
"learning_rate": 1.981399245915375e-05,
"loss": 1.844,
"step": 112
},
{
"epoch": 0.047339757017176375,
"grad_norm": 0.7597166895866394,
"learning_rate": 1.9812316715542524e-05,
"loss": 1.8678,
"step": 113
},
{
"epoch": 0.04775869291998324,
"grad_norm": 0.5536984801292419,
"learning_rate": 1.9810640971931295e-05,
"loss": 1.8502,
"step": 114
},
{
"epoch": 0.048177628822790114,
"grad_norm": 0.5753149390220642,
"learning_rate": 1.980896522832007e-05,
"loss": 1.8613,
"step": 115
},
{
"epoch": 0.04859656472559699,
"grad_norm": 0.6214611530303955,
"learning_rate": 1.980728948470884e-05,
"loss": 1.8803,
"step": 116
},
{
"epoch": 0.04901550062840385,
"grad_norm": 0.5892764329910278,
"learning_rate": 1.9805613741097614e-05,
"loss": 1.8241,
"step": 117
},
{
"epoch": 0.049434436531210726,
"grad_norm": 0.5623623132705688,
"learning_rate": 1.9803937997486388e-05,
"loss": 1.8584,
"step": 118
},
{
"epoch": 0.04985337243401759,
"grad_norm": 0.5206480622291565,
"learning_rate": 1.980226225387516e-05,
"loss": 1.8058,
"step": 119
},
{
"epoch": 0.050272308336824466,
"grad_norm": 0.7416813373565674,
"learning_rate": 1.980058651026393e-05,
"loss": 1.7422,
"step": 120
},
{
"epoch": 0.05069124423963134,
"grad_norm": 0.6095878481864929,
"learning_rate": 1.9798910766652704e-05,
"loss": 1.7567,
"step": 121
},
{
"epoch": 0.051110180142438205,
"grad_norm": 0.5830249786376953,
"learning_rate": 1.9797235023041478e-05,
"loss": 1.8099,
"step": 122
},
{
"epoch": 0.05152911604524508,
"grad_norm": 0.7603867053985596,
"learning_rate": 1.979555927943025e-05,
"loss": 1.8074,
"step": 123
},
{
"epoch": 0.05194805194805195,
"grad_norm": 0.607905387878418,
"learning_rate": 1.979388353581902e-05,
"loss": 1.8359,
"step": 124
},
{
"epoch": 0.05236698785085882,
"grad_norm": 0.5446661710739136,
"learning_rate": 1.9792207792207794e-05,
"loss": 1.7291,
"step": 125
},
{
"epoch": 0.05278592375366569,
"grad_norm": 0.5527285933494568,
"learning_rate": 1.9790532048596568e-05,
"loss": 1.7841,
"step": 126
},
{
"epoch": 0.05320485965647256,
"grad_norm": 0.6565405130386353,
"learning_rate": 1.978885630498534e-05,
"loss": 1.9168,
"step": 127
},
{
"epoch": 0.05362379555927943,
"grad_norm": 0.5959405899047852,
"learning_rate": 1.978718056137411e-05,
"loss": 1.8735,
"step": 128
},
{
"epoch": 0.0540427314620863,
"grad_norm": 0.7685437202453613,
"learning_rate": 1.9785504817762883e-05,
"loss": 1.782,
"step": 129
},
{
"epoch": 0.05446166736489317,
"grad_norm": 0.5747430324554443,
"learning_rate": 1.9783829074151657e-05,
"loss": 1.7831,
"step": 130
},
{
"epoch": 0.05488060326770004,
"grad_norm": 0.7328975200653076,
"learning_rate": 1.9782153330540428e-05,
"loss": 1.7451,
"step": 131
},
{
"epoch": 0.055299539170506916,
"grad_norm": 0.5662095546722412,
"learning_rate": 1.9780477586929202e-05,
"loss": 1.6731,
"step": 132
},
{
"epoch": 0.05571847507331378,
"grad_norm": 0.6165090799331665,
"learning_rate": 1.9778801843317973e-05,
"loss": 1.7787,
"step": 133
},
{
"epoch": 0.056137410976120655,
"grad_norm": 0.6399924755096436,
"learning_rate": 1.9777126099706747e-05,
"loss": 1.8697,
"step": 134
},
{
"epoch": 0.05655634687892752,
"grad_norm": 0.6513495445251465,
"learning_rate": 1.9775450356095518e-05,
"loss": 1.7885,
"step": 135
},
{
"epoch": 0.056975282781734395,
"grad_norm": 0.652104914188385,
"learning_rate": 1.9773774612484292e-05,
"loss": 1.8462,
"step": 136
},
{
"epoch": 0.05739421868454127,
"grad_norm": 0.8418712615966797,
"learning_rate": 1.9772098868873063e-05,
"loss": 1.8771,
"step": 137
},
{
"epoch": 0.057813154587348134,
"grad_norm": 0.6133163571357727,
"learning_rate": 1.9770423125261837e-05,
"loss": 1.7998,
"step": 138
},
{
"epoch": 0.05823209049015501,
"grad_norm": 0.6718358993530273,
"learning_rate": 1.9768747381650608e-05,
"loss": 1.7633,
"step": 139
},
{
"epoch": 0.05865102639296188,
"grad_norm": 0.6728368997573853,
"learning_rate": 1.9767071638039382e-05,
"loss": 1.8313,
"step": 140
},
{
"epoch": 0.059069962295768746,
"grad_norm": 0.588307797908783,
"learning_rate": 1.9765395894428156e-05,
"loss": 1.7395,
"step": 141
},
{
"epoch": 0.05948889819857562,
"grad_norm": 0.889776885509491,
"learning_rate": 1.9763720150816927e-05,
"loss": 1.7591,
"step": 142
},
{
"epoch": 0.059907834101382486,
"grad_norm": 0.5996978878974915,
"learning_rate": 1.9762044407205697e-05,
"loss": 1.6881,
"step": 143
},
{
"epoch": 0.06032677000418936,
"grad_norm": 0.6324535012245178,
"learning_rate": 1.976036866359447e-05,
"loss": 1.7142,
"step": 144
},
{
"epoch": 0.06074570590699623,
"grad_norm": 0.6198902130126953,
"learning_rate": 1.9758692919983246e-05,
"loss": 1.6867,
"step": 145
},
{
"epoch": 0.0611646418098031,
"grad_norm": 0.6651074886322021,
"learning_rate": 1.9757017176372016e-05,
"loss": 1.7773,
"step": 146
},
{
"epoch": 0.06158357771260997,
"grad_norm": 0.6877864599227905,
"learning_rate": 1.9755341432760787e-05,
"loss": 1.792,
"step": 147
},
{
"epoch": 0.062002513615416845,
"grad_norm": 0.5880079865455627,
"learning_rate": 1.975366568914956e-05,
"loss": 1.7761,
"step": 148
},
{
"epoch": 0.06242144951822371,
"grad_norm": 0.6574519872665405,
"learning_rate": 1.9751989945538335e-05,
"loss": 1.7769,
"step": 149
},
{
"epoch": 0.06284038542103058,
"grad_norm": 0.5385615825653076,
"learning_rate": 1.9750314201927106e-05,
"loss": 1.7226,
"step": 150
},
{
"epoch": 0.06325932132383745,
"grad_norm": 0.6323086023330688,
"learning_rate": 1.9748638458315877e-05,
"loss": 1.7043,
"step": 151
},
{
"epoch": 0.06367825722664433,
"grad_norm": 0.5224570035934448,
"learning_rate": 1.974696271470465e-05,
"loss": 1.7103,
"step": 152
},
{
"epoch": 0.0640971931294512,
"grad_norm": 0.5448949933052063,
"learning_rate": 1.9745286971093425e-05,
"loss": 1.7691,
"step": 153
},
{
"epoch": 0.06451612903225806,
"grad_norm": 0.47703322768211365,
"learning_rate": 1.97436112274822e-05,
"loss": 1.7107,
"step": 154
},
{
"epoch": 0.06493506493506493,
"grad_norm": 0.555005669593811,
"learning_rate": 1.974193548387097e-05,
"loss": 1.7493,
"step": 155
},
{
"epoch": 0.06535400083787181,
"grad_norm": 0.5083591938018799,
"learning_rate": 1.974025974025974e-05,
"loss": 1.621,
"step": 156
},
{
"epoch": 0.06577293674067868,
"grad_norm": 0.5037218928337097,
"learning_rate": 1.9738583996648515e-05,
"loss": 1.6905,
"step": 157
},
{
"epoch": 0.06619187264348554,
"grad_norm": 0.48570749163627625,
"learning_rate": 1.973690825303729e-05,
"loss": 1.73,
"step": 158
},
{
"epoch": 0.06661080854629242,
"grad_norm": 0.5082443952560425,
"learning_rate": 1.973523250942606e-05,
"loss": 1.7107,
"step": 159
},
{
"epoch": 0.06702974444909929,
"grad_norm": 0.5330659747123718,
"learning_rate": 1.973355676581483e-05,
"loss": 1.7756,
"step": 160
},
{
"epoch": 0.06744868035190615,
"grad_norm": 0.6401243209838867,
"learning_rate": 1.9731881022203605e-05,
"loss": 1.7171,
"step": 161
},
{
"epoch": 0.06786761625471303,
"grad_norm": 0.5555040836334229,
"learning_rate": 1.9730205278592375e-05,
"loss": 1.7117,
"step": 162
},
{
"epoch": 0.0682865521575199,
"grad_norm": 0.5580366253852844,
"learning_rate": 1.972852953498115e-05,
"loss": 1.7173,
"step": 163
},
{
"epoch": 0.06870548806032677,
"grad_norm": 0.6296005249023438,
"learning_rate": 1.9726853791369924e-05,
"loss": 1.7316,
"step": 164
},
{
"epoch": 0.06912442396313365,
"grad_norm": 0.681302547454834,
"learning_rate": 1.9725178047758694e-05,
"loss": 1.7849,
"step": 165
},
{
"epoch": 0.06954335986594051,
"grad_norm": 0.6220213770866394,
"learning_rate": 1.9723502304147465e-05,
"loss": 1.656,
"step": 166
},
{
"epoch": 0.06996229576874738,
"grad_norm": 0.6640814542770386,
"learning_rate": 1.972182656053624e-05,
"loss": 1.8749,
"step": 167
},
{
"epoch": 0.07038123167155426,
"grad_norm": 0.5521919131278992,
"learning_rate": 1.9720150816925013e-05,
"loss": 1.763,
"step": 168
},
{
"epoch": 0.07080016757436113,
"grad_norm": 0.6188511848449707,
"learning_rate": 1.9718475073313784e-05,
"loss": 1.6431,
"step": 169
},
{
"epoch": 0.07121910347716799,
"grad_norm": 0.5388275980949402,
"learning_rate": 1.9716799329702555e-05,
"loss": 1.7444,
"step": 170
},
{
"epoch": 0.07163803937997486,
"grad_norm": 1.0150574445724487,
"learning_rate": 1.971512358609133e-05,
"loss": 1.6986,
"step": 171
},
{
"epoch": 0.07205697528278174,
"grad_norm": 0.6714919805526733,
"learning_rate": 1.9713447842480103e-05,
"loss": 1.7461,
"step": 172
},
{
"epoch": 0.0724759111855886,
"grad_norm": 0.6587640047073364,
"learning_rate": 1.9711772098868874e-05,
"loss": 1.6743,
"step": 173
},
{
"epoch": 0.07289484708839547,
"grad_norm": 0.6181256175041199,
"learning_rate": 1.9710096355257648e-05,
"loss": 1.8552,
"step": 174
},
{
"epoch": 0.07331378299120235,
"grad_norm": 0.5564039945602417,
"learning_rate": 1.970842061164642e-05,
"loss": 1.741,
"step": 175
},
{
"epoch": 0.07373271889400922,
"grad_norm": 0.5421382188796997,
"learning_rate": 1.9706744868035193e-05,
"loss": 1.6919,
"step": 176
},
{
"epoch": 0.07415165479681608,
"grad_norm": 0.6172086000442505,
"learning_rate": 1.9705069124423967e-05,
"loss": 1.6686,
"step": 177
},
{
"epoch": 0.07457059069962296,
"grad_norm": 0.5004185438156128,
"learning_rate": 1.9703393380812738e-05,
"loss": 1.6309,
"step": 178
},
{
"epoch": 0.07498952660242983,
"grad_norm": 0.5099078416824341,
"learning_rate": 1.970171763720151e-05,
"loss": 1.674,
"step": 179
},
{
"epoch": 0.0754084625052367,
"grad_norm": 0.5554249882698059,
"learning_rate": 1.9700041893590283e-05,
"loss": 1.6227,
"step": 180
},
{
"epoch": 0.07582739840804358,
"grad_norm": 0.6313985586166382,
"learning_rate": 1.9698366149979057e-05,
"loss": 1.7098,
"step": 181
},
{
"epoch": 0.07624633431085044,
"grad_norm": 0.8186052441596985,
"learning_rate": 1.9696690406367827e-05,
"loss": 1.7933,
"step": 182
},
{
"epoch": 0.07666527021365731,
"grad_norm": 0.5017969608306885,
"learning_rate": 1.9695014662756598e-05,
"loss": 1.6853,
"step": 183
},
{
"epoch": 0.07708420611646419,
"grad_norm": 0.6917697787284851,
"learning_rate": 1.9693338919145372e-05,
"loss": 1.7031,
"step": 184
},
{
"epoch": 0.07750314201927105,
"grad_norm": 0.5040557980537415,
"learning_rate": 1.9691663175534146e-05,
"loss": 1.6353,
"step": 185
},
{
"epoch": 0.07792207792207792,
"grad_norm": 0.5733162760734558,
"learning_rate": 1.9689987431922917e-05,
"loss": 1.7278,
"step": 186
},
{
"epoch": 0.07834101382488479,
"grad_norm": 0.7823026776313782,
"learning_rate": 1.968831168831169e-05,
"loss": 1.6954,
"step": 187
},
{
"epoch": 0.07875994972769167,
"grad_norm": 0.5559296607971191,
"learning_rate": 1.9686635944700462e-05,
"loss": 1.7417,
"step": 188
},
{
"epoch": 0.07917888563049853,
"grad_norm": 0.6399711966514587,
"learning_rate": 1.9684960201089236e-05,
"loss": 1.6529,
"step": 189
},
{
"epoch": 0.0795978215333054,
"grad_norm": 0.6075267195701599,
"learning_rate": 1.9683284457478007e-05,
"loss": 1.687,
"step": 190
},
{
"epoch": 0.08001675743611228,
"grad_norm": 0.5875303149223328,
"learning_rate": 1.968160871386678e-05,
"loss": 1.6569,
"step": 191
},
{
"epoch": 0.08043569333891915,
"grad_norm": 0.6546170711517334,
"learning_rate": 1.9679932970255552e-05,
"loss": 1.6844,
"step": 192
},
{
"epoch": 0.08085462924172601,
"grad_norm": 0.5778879523277283,
"learning_rate": 1.9678257226644322e-05,
"loss": 1.6705,
"step": 193
},
{
"epoch": 0.08127356514453289,
"grad_norm": 0.5718396306037903,
"learning_rate": 1.9676581483033097e-05,
"loss": 1.6519,
"step": 194
},
{
"epoch": 0.08169250104733976,
"grad_norm": 0.6596693992614746,
"learning_rate": 1.967490573942187e-05,
"loss": 1.6547,
"step": 195
},
{
"epoch": 0.08211143695014662,
"grad_norm": 0.5057825446128845,
"learning_rate": 1.9673229995810645e-05,
"loss": 1.6214,
"step": 196
},
{
"epoch": 0.0825303728529535,
"grad_norm": 0.6651629209518433,
"learning_rate": 1.9671554252199416e-05,
"loss": 1.6835,
"step": 197
},
{
"epoch": 0.08294930875576037,
"grad_norm": 0.5056618452072144,
"learning_rate": 1.9669878508588186e-05,
"loss": 1.6368,
"step": 198
},
{
"epoch": 0.08336824465856724,
"grad_norm": 0.4693203568458557,
"learning_rate": 1.966820276497696e-05,
"loss": 1.6322,
"step": 199
},
{
"epoch": 0.08378718056137412,
"grad_norm": 0.5992833375930786,
"learning_rate": 1.9666527021365735e-05,
"loss": 1.7568,
"step": 200
},
{
"epoch": 0.08420611646418098,
"grad_norm": 0.62791508436203,
"learning_rate": 1.9664851277754505e-05,
"loss": 1.6937,
"step": 201
},
{
"epoch": 0.08462505236698785,
"grad_norm": 0.5130066275596619,
"learning_rate": 1.9663175534143276e-05,
"loss": 1.6236,
"step": 202
},
{
"epoch": 0.08504398826979472,
"grad_norm": 0.5045161247253418,
"learning_rate": 1.966149979053205e-05,
"loss": 1.7386,
"step": 203
},
{
"epoch": 0.0854629241726016,
"grad_norm": 0.6568188667297363,
"learning_rate": 1.9659824046920824e-05,
"loss": 1.6639,
"step": 204
},
{
"epoch": 0.08588186007540846,
"grad_norm": 0.6545958518981934,
"learning_rate": 1.9658148303309595e-05,
"loss": 1.6593,
"step": 205
},
{
"epoch": 0.08630079597821533,
"grad_norm": 0.5190823078155518,
"learning_rate": 1.9656472559698366e-05,
"loss": 1.6398,
"step": 206
},
{
"epoch": 0.08671973188102221,
"grad_norm": 0.5478256344795227,
"learning_rate": 1.965479681608714e-05,
"loss": 1.5848,
"step": 207
},
{
"epoch": 0.08713866778382907,
"grad_norm": 0.5818894505500793,
"learning_rate": 1.9653121072475914e-05,
"loss": 1.6046,
"step": 208
},
{
"epoch": 0.08755760368663594,
"grad_norm": 0.6687189936637878,
"learning_rate": 1.9651445328864685e-05,
"loss": 1.6177,
"step": 209
},
{
"epoch": 0.08797653958944282,
"grad_norm": 0.5873174071311951,
"learning_rate": 1.964976958525346e-05,
"loss": 1.659,
"step": 210
},
{
"epoch": 0.08839547549224969,
"grad_norm": 0.5621105432510376,
"learning_rate": 1.964809384164223e-05,
"loss": 1.6739,
"step": 211
},
{
"epoch": 0.08881441139505655,
"grad_norm": 0.7059792876243591,
"learning_rate": 1.9646418098031004e-05,
"loss": 1.6738,
"step": 212
},
{
"epoch": 0.08923334729786343,
"grad_norm": 0.5294623970985413,
"learning_rate": 1.9644742354419775e-05,
"loss": 1.6419,
"step": 213
},
{
"epoch": 0.0896522832006703,
"grad_norm": 0.7764983773231506,
"learning_rate": 1.964306661080855e-05,
"loss": 1.6793,
"step": 214
},
{
"epoch": 0.09007121910347717,
"grad_norm": 0.628094494342804,
"learning_rate": 1.964139086719732e-05,
"loss": 1.676,
"step": 215
},
{
"epoch": 0.09049015500628405,
"grad_norm": 0.5593112111091614,
"learning_rate": 1.9639715123586094e-05,
"loss": 1.7119,
"step": 216
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.7385965585708618,
"learning_rate": 1.9638039379974864e-05,
"loss": 1.6039,
"step": 217
},
{
"epoch": 0.09132802681189778,
"grad_norm": 0.5662972927093506,
"learning_rate": 1.963636363636364e-05,
"loss": 1.6255,
"step": 218
},
{
"epoch": 0.09174696271470464,
"grad_norm": 0.7071956992149353,
"learning_rate": 1.9634687892752413e-05,
"loss": 1.6587,
"step": 219
},
{
"epoch": 0.09216589861751152,
"grad_norm": 0.6986474990844727,
"learning_rate": 1.9633012149141183e-05,
"loss": 1.6266,
"step": 220
},
{
"epoch": 0.09258483452031839,
"grad_norm": 0.6965208053588867,
"learning_rate": 1.9631336405529954e-05,
"loss": 1.6619,
"step": 221
},
{
"epoch": 0.09300377042312526,
"grad_norm": 0.7024741768836975,
"learning_rate": 1.9629660661918728e-05,
"loss": 1.7327,
"step": 222
},
{
"epoch": 0.09342270632593214,
"grad_norm": 0.5887707471847534,
"learning_rate": 1.9627984918307502e-05,
"loss": 1.6706,
"step": 223
},
{
"epoch": 0.093841642228739,
"grad_norm": 0.8550237417221069,
"learning_rate": 1.9626309174696273e-05,
"loss": 1.5773,
"step": 224
},
{
"epoch": 0.09426057813154587,
"grad_norm": 0.6820223331451416,
"learning_rate": 1.9624633431085044e-05,
"loss": 1.7496,
"step": 225
},
{
"epoch": 0.09467951403435275,
"grad_norm": 0.7763844728469849,
"learning_rate": 1.9622957687473818e-05,
"loss": 1.6204,
"step": 226
},
{
"epoch": 0.09509844993715962,
"grad_norm": 0.779120147228241,
"learning_rate": 1.9621281943862592e-05,
"loss": 1.617,
"step": 227
},
{
"epoch": 0.09551738583996648,
"grad_norm": 0.7589849233627319,
"learning_rate": 1.9619606200251363e-05,
"loss": 1.5478,
"step": 228
},
{
"epoch": 0.09593632174277336,
"grad_norm": 0.5152125954627991,
"learning_rate": 1.9617930456640137e-05,
"loss": 1.6749,
"step": 229
},
{
"epoch": 0.09635525764558023,
"grad_norm": 0.7013604640960693,
"learning_rate": 1.9616254713028908e-05,
"loss": 1.5864,
"step": 230
},
{
"epoch": 0.0967741935483871,
"grad_norm": 0.7294275760650635,
"learning_rate": 1.9614578969417682e-05,
"loss": 1.5845,
"step": 231
},
{
"epoch": 0.09719312945119397,
"grad_norm": 0.5346665382385254,
"learning_rate": 1.9612903225806452e-05,
"loss": 1.643,
"step": 232
},
{
"epoch": 0.09761206535400084,
"grad_norm": 0.953779935836792,
"learning_rate": 1.9611227482195227e-05,
"loss": 1.6152,
"step": 233
},
{
"epoch": 0.0980310012568077,
"grad_norm": 0.6668707132339478,
"learning_rate": 1.9609551738583997e-05,
"loss": 1.6066,
"step": 234
},
{
"epoch": 0.09844993715961457,
"grad_norm": 0.6693033576011658,
"learning_rate": 1.960787599497277e-05,
"loss": 1.6124,
"step": 235
},
{
"epoch": 0.09886887306242145,
"grad_norm": 0.8081066608428955,
"learning_rate": 1.9606200251361542e-05,
"loss": 1.6351,
"step": 236
},
{
"epoch": 0.09928780896522832,
"grad_norm": 0.7561270594596863,
"learning_rate": 1.9604524507750316e-05,
"loss": 1.6476,
"step": 237
},
{
"epoch": 0.09970674486803519,
"grad_norm": 0.8342212438583374,
"learning_rate": 1.9602848764139087e-05,
"loss": 1.5573,
"step": 238
},
{
"epoch": 0.10012568077084207,
"grad_norm": 0.8095865845680237,
"learning_rate": 1.960117302052786e-05,
"loss": 1.5594,
"step": 239
},
{
"epoch": 0.10054461667364893,
"grad_norm": 0.8448402881622314,
"learning_rate": 1.9599497276916632e-05,
"loss": 1.6646,
"step": 240
},
{
"epoch": 0.1009635525764558,
"grad_norm": 0.936273455619812,
"learning_rate": 1.9597821533305406e-05,
"loss": 1.5822,
"step": 241
},
{
"epoch": 0.10138248847926268,
"grad_norm": 0.5605466365814209,
"learning_rate": 1.959614578969418e-05,
"loss": 1.616,
"step": 242
},
{
"epoch": 0.10180142438206954,
"grad_norm": 1.0700498819351196,
"learning_rate": 1.959447004608295e-05,
"loss": 1.618,
"step": 243
},
{
"epoch": 0.10222036028487641,
"grad_norm": 0.6166669726371765,
"learning_rate": 1.959279430247172e-05,
"loss": 1.5989,
"step": 244
},
{
"epoch": 0.10263929618768329,
"grad_norm": 0.7001603841781616,
"learning_rate": 1.9591118558860496e-05,
"loss": 1.464,
"step": 245
},
{
"epoch": 0.10305823209049016,
"grad_norm": 0.5694488883018494,
"learning_rate": 1.958944281524927e-05,
"loss": 1.5641,
"step": 246
},
{
"epoch": 0.10347716799329702,
"grad_norm": 0.5658904314041138,
"learning_rate": 1.958776707163804e-05,
"loss": 1.5911,
"step": 247
},
{
"epoch": 0.1038961038961039,
"grad_norm": 0.6600093245506287,
"learning_rate": 1.958609132802681e-05,
"loss": 1.6812,
"step": 248
},
{
"epoch": 0.10431503979891077,
"grad_norm": 0.7548564672470093,
"learning_rate": 1.9584415584415586e-05,
"loss": 1.5563,
"step": 249
},
{
"epoch": 0.10473397570171764,
"grad_norm": 0.6965343952178955,
"learning_rate": 1.958273984080436e-05,
"loss": 1.6126,
"step": 250
},
{
"epoch": 0.1051529116045245,
"grad_norm": 0.57705157995224,
"learning_rate": 1.958106409719313e-05,
"loss": 1.6778,
"step": 251
},
{
"epoch": 0.10557184750733138,
"grad_norm": 1.2478373050689697,
"learning_rate": 1.9579388353581905e-05,
"loss": 1.6654,
"step": 252
},
{
"epoch": 0.10599078341013825,
"grad_norm": 0.586651623249054,
"learning_rate": 1.9577712609970675e-05,
"loss": 1.5895,
"step": 253
},
{
"epoch": 0.10640971931294511,
"grad_norm": 0.6501399874687195,
"learning_rate": 1.957603686635945e-05,
"loss": 1.5991,
"step": 254
},
{
"epoch": 0.106828655215752,
"grad_norm": 0.7814245223999023,
"learning_rate": 1.9574361122748224e-05,
"loss": 1.5765,
"step": 255
},
{
"epoch": 0.10724759111855886,
"grad_norm": 0.691818356513977,
"learning_rate": 1.9572685379136994e-05,
"loss": 1.6663,
"step": 256
},
{
"epoch": 0.10766652702136573,
"grad_norm": 0.9530359506607056,
"learning_rate": 1.9571009635525765e-05,
"loss": 1.6614,
"step": 257
},
{
"epoch": 0.1080854629241726,
"grad_norm": 1.280455231666565,
"learning_rate": 1.956933389191454e-05,
"loss": 1.5998,
"step": 258
},
{
"epoch": 0.10850439882697947,
"grad_norm": 0.6066200137138367,
"learning_rate": 1.956765814830331e-05,
"loss": 1.6352,
"step": 259
},
{
"epoch": 0.10892333472978634,
"grad_norm": 0.6973633766174316,
"learning_rate": 1.9565982404692084e-05,
"loss": 1.619,
"step": 260
},
{
"epoch": 0.10934227063259322,
"grad_norm": 0.857652485370636,
"learning_rate": 1.9564306661080855e-05,
"loss": 1.6507,
"step": 261
},
{
"epoch": 0.10976120653540009,
"grad_norm": 0.5751752853393555,
"learning_rate": 1.956263091746963e-05,
"loss": 1.6866,
"step": 262
},
{
"epoch": 0.11018014243820695,
"grad_norm": 0.899047315120697,
"learning_rate": 1.95609551738584e-05,
"loss": 1.6211,
"step": 263
},
{
"epoch": 0.11059907834101383,
"grad_norm": 0.6887657046318054,
"learning_rate": 1.9559279430247174e-05,
"loss": 1.5289,
"step": 264
},
{
"epoch": 0.1110180142438207,
"grad_norm": 0.6921897530555725,
"learning_rate": 1.9557603686635948e-05,
"loss": 1.6667,
"step": 265
},
{
"epoch": 0.11143695014662756,
"grad_norm": 0.5706766843795776,
"learning_rate": 1.955592794302472e-05,
"loss": 1.6057,
"step": 266
},
{
"epoch": 0.11185588604943443,
"grad_norm": 0.7291983962059021,
"learning_rate": 1.955425219941349e-05,
"loss": 1.5033,
"step": 267
},
{
"epoch": 0.11227482195224131,
"grad_norm": 0.5996133089065552,
"learning_rate": 1.9552576455802263e-05,
"loss": 1.5232,
"step": 268
},
{
"epoch": 0.11269375785504818,
"grad_norm": 0.6987999677658081,
"learning_rate": 1.9550900712191038e-05,
"loss": 1.5832,
"step": 269
},
{
"epoch": 0.11311269375785504,
"grad_norm": 0.7466180920600891,
"learning_rate": 1.954922496857981e-05,
"loss": 1.6155,
"step": 270
},
{
"epoch": 0.11353162966066192,
"grad_norm": 0.5365012884140015,
"learning_rate": 1.954754922496858e-05,
"loss": 1.4844,
"step": 271
},
{
"epoch": 0.11395056556346879,
"grad_norm": 0.6302610635757446,
"learning_rate": 1.9545873481357353e-05,
"loss": 1.6401,
"step": 272
},
{
"epoch": 0.11436950146627566,
"grad_norm": 0.7011299729347229,
"learning_rate": 1.9544197737746127e-05,
"loss": 1.5856,
"step": 273
},
{
"epoch": 0.11478843736908254,
"grad_norm": 0.5213186740875244,
"learning_rate": 1.95425219941349e-05,
"loss": 1.6877,
"step": 274
},
{
"epoch": 0.1152073732718894,
"grad_norm": 0.5973451137542725,
"learning_rate": 1.9540846250523672e-05,
"loss": 1.6255,
"step": 275
},
{
"epoch": 0.11562630917469627,
"grad_norm": 0.8726099729537964,
"learning_rate": 1.9539170506912443e-05,
"loss": 1.6345,
"step": 276
},
{
"epoch": 0.11604524507750315,
"grad_norm": 0.6559906005859375,
"learning_rate": 1.9537494763301217e-05,
"loss": 1.4426,
"step": 277
},
{
"epoch": 0.11646418098031001,
"grad_norm": 1.2165895700454712,
"learning_rate": 1.953581901968999e-05,
"loss": 1.597,
"step": 278
},
{
"epoch": 0.11688311688311688,
"grad_norm": 0.5331985354423523,
"learning_rate": 1.9534143276078762e-05,
"loss": 1.5959,
"step": 279
},
{
"epoch": 0.11730205278592376,
"grad_norm": 0.7331112027168274,
"learning_rate": 1.9532467532467533e-05,
"loss": 1.6,
"step": 280
},
{
"epoch": 0.11772098868873063,
"grad_norm": 0.6983991265296936,
"learning_rate": 1.9530791788856307e-05,
"loss": 1.6188,
"step": 281
},
{
"epoch": 0.11813992459153749,
"grad_norm": 0.6612614393234253,
"learning_rate": 1.952911604524508e-05,
"loss": 1.579,
"step": 282
},
{
"epoch": 0.11855886049434436,
"grad_norm": 0.6999834179878235,
"learning_rate": 1.952744030163385e-05,
"loss": 1.715,
"step": 283
},
{
"epoch": 0.11897779639715124,
"grad_norm": 0.6970024108886719,
"learning_rate": 1.9525764558022622e-05,
"loss": 1.6406,
"step": 284
},
{
"epoch": 0.1193967322999581,
"grad_norm": 0.5302523374557495,
"learning_rate": 1.9524088814411397e-05,
"loss": 1.6388,
"step": 285
},
{
"epoch": 0.11981566820276497,
"grad_norm": 0.7106460332870483,
"learning_rate": 1.952241307080017e-05,
"loss": 1.5679,
"step": 286
},
{
"epoch": 0.12023460410557185,
"grad_norm": 0.6428540945053101,
"learning_rate": 1.952073732718894e-05,
"loss": 1.5151,
"step": 287
},
{
"epoch": 0.12065354000837872,
"grad_norm": 0.6660862565040588,
"learning_rate": 1.9519061583577716e-05,
"loss": 1.6999,
"step": 288
},
{
"epoch": 0.12107247591118558,
"grad_norm": 0.6251623034477234,
"learning_rate": 1.9517385839966486e-05,
"loss": 1.5289,
"step": 289
},
{
"epoch": 0.12149141181399246,
"grad_norm": 0.5240997672080994,
"learning_rate": 1.9515710096355257e-05,
"loss": 1.5991,
"step": 290
},
{
"epoch": 0.12191034771679933,
"grad_norm": 0.47173094749450684,
"learning_rate": 1.951403435274403e-05,
"loss": 1.5734,
"step": 291
},
{
"epoch": 0.1223292836196062,
"grad_norm": 0.775086522102356,
"learning_rate": 1.9512358609132805e-05,
"loss": 1.4597,
"step": 292
},
{
"epoch": 0.12274821952241308,
"grad_norm": 0.622778594493866,
"learning_rate": 1.9510682865521576e-05,
"loss": 1.5872,
"step": 293
},
{
"epoch": 0.12316715542521994,
"grad_norm": 0.5721079111099243,
"learning_rate": 1.9509007121910347e-05,
"loss": 1.5226,
"step": 294
},
{
"epoch": 0.12358609132802681,
"grad_norm": 0.8591808676719666,
"learning_rate": 1.950733137829912e-05,
"loss": 1.6192,
"step": 295
},
{
"epoch": 0.12400502723083369,
"grad_norm": 0.5390528440475464,
"learning_rate": 1.9505655634687895e-05,
"loss": 1.5228,
"step": 296
},
{
"epoch": 0.12442396313364056,
"grad_norm": 0.6414217948913574,
"learning_rate": 1.950397989107667e-05,
"loss": 1.6464,
"step": 297
},
{
"epoch": 0.12484289903644742,
"grad_norm": 0.9504109621047974,
"learning_rate": 1.950230414746544e-05,
"loss": 1.4574,
"step": 298
},
{
"epoch": 0.1252618349392543,
"grad_norm": 0.7934843301773071,
"learning_rate": 1.950062840385421e-05,
"loss": 1.5825,
"step": 299
},
{
"epoch": 0.12568077084206117,
"grad_norm": 1.2911075353622437,
"learning_rate": 1.9498952660242985e-05,
"loss": 1.5519,
"step": 300
},
{
"epoch": 0.12609970674486803,
"grad_norm": 0.7778868079185486,
"learning_rate": 1.949727691663176e-05,
"loss": 1.569,
"step": 301
},
{
"epoch": 0.1265186426476749,
"grad_norm": 0.669278621673584,
"learning_rate": 1.949560117302053e-05,
"loss": 1.6144,
"step": 302
},
{
"epoch": 0.12693757855048177,
"grad_norm": 1.0039944648742676,
"learning_rate": 1.94939254294093e-05,
"loss": 1.5249,
"step": 303
},
{
"epoch": 0.12735651445328866,
"grad_norm": 0.7724584937095642,
"learning_rate": 1.9492249685798074e-05,
"loss": 1.6277,
"step": 304
},
{
"epoch": 0.12777545035609553,
"grad_norm": 0.759467601776123,
"learning_rate": 1.949057394218685e-05,
"loss": 1.5756,
"step": 305
},
{
"epoch": 0.1281943862589024,
"grad_norm": 1.3089075088500977,
"learning_rate": 1.948889819857562e-05,
"loss": 1.5371,
"step": 306
},
{
"epoch": 0.12861332216170926,
"grad_norm": 0.5424050688743591,
"learning_rate": 1.9487222454964393e-05,
"loss": 1.5824,
"step": 307
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.7417402267456055,
"learning_rate": 1.9485546711353164e-05,
"loss": 1.5458,
"step": 308
},
{
"epoch": 0.129451193967323,
"grad_norm": 0.6556446552276611,
"learning_rate": 1.948387096774194e-05,
"loss": 1.6494,
"step": 309
},
{
"epoch": 0.12987012987012986,
"grad_norm": 0.9575488567352295,
"learning_rate": 1.948219522413071e-05,
"loss": 1.5509,
"step": 310
},
{
"epoch": 0.13028906577293675,
"grad_norm": 0.8725043535232544,
"learning_rate": 1.9480519480519483e-05,
"loss": 1.5099,
"step": 311
},
{
"epoch": 0.13070800167574362,
"grad_norm": 1.1596598625183105,
"learning_rate": 1.9478843736908254e-05,
"loss": 1.6001,
"step": 312
},
{
"epoch": 0.13112693757855048,
"grad_norm": 0.986149251461029,
"learning_rate": 1.9477167993297028e-05,
"loss": 1.4582,
"step": 313
},
{
"epoch": 0.13154587348135735,
"grad_norm": 0.6939131021499634,
"learning_rate": 1.94754922496858e-05,
"loss": 1.6263,
"step": 314
},
{
"epoch": 0.13196480938416422,
"grad_norm": 0.664408802986145,
"learning_rate": 1.9473816506074573e-05,
"loss": 1.538,
"step": 315
},
{
"epoch": 0.13238374528697108,
"grad_norm": 0.6826114058494568,
"learning_rate": 1.9472140762463344e-05,
"loss": 1.5806,
"step": 316
},
{
"epoch": 0.13280268118977798,
"grad_norm": 0.6879693269729614,
"learning_rate": 1.9470465018852118e-05,
"loss": 1.5156,
"step": 317
},
{
"epoch": 0.13322161709258484,
"grad_norm": 1.2276116609573364,
"learning_rate": 1.946878927524089e-05,
"loss": 1.4466,
"step": 318
},
{
"epoch": 0.1336405529953917,
"grad_norm": 0.7346695065498352,
"learning_rate": 1.9467113531629663e-05,
"loss": 1.5769,
"step": 319
},
{
"epoch": 0.13405948889819858,
"grad_norm": 0.795690655708313,
"learning_rate": 1.9465437788018437e-05,
"loss": 1.5348,
"step": 320
},
{
"epoch": 0.13447842480100544,
"grad_norm": 0.8207523822784424,
"learning_rate": 1.9463762044407208e-05,
"loss": 1.5196,
"step": 321
},
{
"epoch": 0.1348973607038123,
"grad_norm": 0.6205607056617737,
"learning_rate": 1.9462086300795978e-05,
"loss": 1.5508,
"step": 322
},
{
"epoch": 0.13531629660661917,
"grad_norm": 0.7060804963111877,
"learning_rate": 1.9460410557184752e-05,
"loss": 1.547,
"step": 323
},
{
"epoch": 0.13573523250942607,
"grad_norm": 0.6053579449653625,
"learning_rate": 1.9458734813573527e-05,
"loss": 1.5521,
"step": 324
},
{
"epoch": 0.13615416841223293,
"grad_norm": 0.6387944221496582,
"learning_rate": 1.9457059069962297e-05,
"loss": 1.5785,
"step": 325
},
{
"epoch": 0.1365731043150398,
"grad_norm": 0.7160474061965942,
"learning_rate": 1.9455383326351068e-05,
"loss": 1.5527,
"step": 326
},
{
"epoch": 0.13699204021784667,
"grad_norm": 0.5747194290161133,
"learning_rate": 1.9453707582739842e-05,
"loss": 1.4228,
"step": 327
},
{
"epoch": 0.13741097612065353,
"grad_norm": 0.7289405465126038,
"learning_rate": 1.9452031839128616e-05,
"loss": 1.5758,
"step": 328
},
{
"epoch": 0.1378299120234604,
"grad_norm": 0.5292596817016602,
"learning_rate": 1.9450356095517387e-05,
"loss": 1.5287,
"step": 329
},
{
"epoch": 0.1382488479262673,
"grad_norm": 0.7831090092658997,
"learning_rate": 1.944868035190616e-05,
"loss": 1.5584,
"step": 330
},
{
"epoch": 0.13866778382907416,
"grad_norm": 0.6046620011329651,
"learning_rate": 1.9447004608294932e-05,
"loss": 1.5534,
"step": 331
},
{
"epoch": 0.13908671973188103,
"grad_norm": 0.7165292501449585,
"learning_rate": 1.9445328864683706e-05,
"loss": 1.621,
"step": 332
},
{
"epoch": 0.1395056556346879,
"grad_norm": 0.7406589388847351,
"learning_rate": 1.9443653121072477e-05,
"loss": 1.5954,
"step": 333
},
{
"epoch": 0.13992459153749476,
"grad_norm": 0.5955418348312378,
"learning_rate": 1.944197737746125e-05,
"loss": 1.5457,
"step": 334
},
{
"epoch": 0.14034352744030162,
"grad_norm": 0.5523016452789307,
"learning_rate": 1.944030163385002e-05,
"loss": 1.5131,
"step": 335
},
{
"epoch": 0.14076246334310852,
"grad_norm": 0.7677832841873169,
"learning_rate": 1.9438625890238796e-05,
"loss": 1.5521,
"step": 336
},
{
"epoch": 0.14118139924591538,
"grad_norm": 0.6301062107086182,
"learning_rate": 1.9436950146627566e-05,
"loss": 1.5836,
"step": 337
},
{
"epoch": 0.14160033514872225,
"grad_norm": 0.6077446341514587,
"learning_rate": 1.943527440301634e-05,
"loss": 1.5108,
"step": 338
},
{
"epoch": 0.14201927105152912,
"grad_norm": 0.6678585410118103,
"learning_rate": 1.943359865940511e-05,
"loss": 1.5488,
"step": 339
},
{
"epoch": 0.14243820695433598,
"grad_norm": 0.7583240270614624,
"learning_rate": 1.9431922915793885e-05,
"loss": 1.551,
"step": 340
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.5899871587753296,
"learning_rate": 1.9430247172182656e-05,
"loss": 1.5522,
"step": 341
},
{
"epoch": 0.14327607875994972,
"grad_norm": 0.6008652448654175,
"learning_rate": 1.942857142857143e-05,
"loss": 1.5742,
"step": 342
},
{
"epoch": 0.1436950146627566,
"grad_norm": 0.5442250967025757,
"learning_rate": 1.9426895684960204e-05,
"loss": 1.5352,
"step": 343
},
{
"epoch": 0.14411395056556348,
"grad_norm": 0.5791664123535156,
"learning_rate": 1.9425219941348975e-05,
"loss": 1.5924,
"step": 344
},
{
"epoch": 0.14453288646837034,
"grad_norm": 0.5690919160842896,
"learning_rate": 1.9423544197737746e-05,
"loss": 1.5219,
"step": 345
},
{
"epoch": 0.1449518223711772,
"grad_norm": 0.6443383693695068,
"learning_rate": 1.942186845412652e-05,
"loss": 1.5994,
"step": 346
},
{
"epoch": 0.14537075827398407,
"grad_norm": 0.5575938820838928,
"learning_rate": 1.9420192710515294e-05,
"loss": 1.5579,
"step": 347
},
{
"epoch": 0.14578969417679094,
"grad_norm": 0.612278401851654,
"learning_rate": 1.9418516966904065e-05,
"loss": 1.6067,
"step": 348
},
{
"epoch": 0.14620863007959783,
"grad_norm": 0.6119928956031799,
"learning_rate": 1.9416841223292836e-05,
"loss": 1.4636,
"step": 349
},
{
"epoch": 0.1466275659824047,
"grad_norm": 0.568268895149231,
"learning_rate": 1.941516547968161e-05,
"loss": 1.4923,
"step": 350
},
{
"epoch": 0.14704650188521157,
"grad_norm": 0.6106241345405579,
"learning_rate": 1.9413489736070384e-05,
"loss": 1.5735,
"step": 351
},
{
"epoch": 0.14746543778801843,
"grad_norm": 0.5713450312614441,
"learning_rate": 1.9411813992459158e-05,
"loss": 1.4594,
"step": 352
},
{
"epoch": 0.1478843736908253,
"grad_norm": 0.5222604274749756,
"learning_rate": 1.941013824884793e-05,
"loss": 1.4957,
"step": 353
},
{
"epoch": 0.14830330959363217,
"grad_norm": 0.6617407202720642,
"learning_rate": 1.94084625052367e-05,
"loss": 1.3623,
"step": 354
},
{
"epoch": 0.14872224549643903,
"grad_norm": 0.6270737648010254,
"learning_rate": 1.9406786761625474e-05,
"loss": 1.494,
"step": 355
},
{
"epoch": 0.14914118139924593,
"grad_norm": 0.6663143634796143,
"learning_rate": 1.9405111018014244e-05,
"loss": 1.5093,
"step": 356
},
{
"epoch": 0.1495601173020528,
"grad_norm": 0.5815137624740601,
"learning_rate": 1.940343527440302e-05,
"loss": 1.5392,
"step": 357
},
{
"epoch": 0.14997905320485966,
"grad_norm": 0.5811892747879028,
"learning_rate": 1.940175953079179e-05,
"loss": 1.5553,
"step": 358
},
{
"epoch": 0.15039798910766652,
"grad_norm": 0.5487301349639893,
"learning_rate": 1.9400083787180563e-05,
"loss": 1.6208,
"step": 359
},
{
"epoch": 0.1508169250104734,
"grad_norm": 0.5299431681632996,
"learning_rate": 1.9398408043569334e-05,
"loss": 1.6097,
"step": 360
},
{
"epoch": 0.15123586091328026,
"grad_norm": 0.6918801665306091,
"learning_rate": 1.9396732299958108e-05,
"loss": 1.5027,
"step": 361
},
{
"epoch": 0.15165479681608715,
"grad_norm": 0.5945777893066406,
"learning_rate": 1.939505655634688e-05,
"loss": 1.6155,
"step": 362
},
{
"epoch": 0.15207373271889402,
"grad_norm": 0.700545072555542,
"learning_rate": 1.9393380812735653e-05,
"loss": 1.6345,
"step": 363
},
{
"epoch": 0.15249266862170088,
"grad_norm": 0.5498125553131104,
"learning_rate": 1.9391705069124424e-05,
"loss": 1.5087,
"step": 364
},
{
"epoch": 0.15291160452450775,
"grad_norm": 0.5619140267372131,
"learning_rate": 1.9390029325513198e-05,
"loss": 1.4977,
"step": 365
},
{
"epoch": 0.15333054042731462,
"grad_norm": 0.5968044400215149,
"learning_rate": 1.9388353581901972e-05,
"loss": 1.5017,
"step": 366
},
{
"epoch": 0.15374947633012148,
"grad_norm": 0.6269423365592957,
"learning_rate": 1.9386677838290743e-05,
"loss": 1.5811,
"step": 367
},
{
"epoch": 0.15416841223292838,
"grad_norm": 0.5672966241836548,
"learning_rate": 1.9385002094679514e-05,
"loss": 1.4781,
"step": 368
},
{
"epoch": 0.15458734813573524,
"grad_norm": 0.8752624988555908,
"learning_rate": 1.9383326351068288e-05,
"loss": 1.4952,
"step": 369
},
{
"epoch": 0.1550062840385421,
"grad_norm": 0.5117892622947693,
"learning_rate": 1.9381650607457062e-05,
"loss": 1.5436,
"step": 370
},
{
"epoch": 0.15542521994134897,
"grad_norm": 0.7978183031082153,
"learning_rate": 1.9379974863845833e-05,
"loss": 1.4284,
"step": 371
},
{
"epoch": 0.15584415584415584,
"grad_norm": 0.5909569263458252,
"learning_rate": 1.9378299120234603e-05,
"loss": 1.5057,
"step": 372
},
{
"epoch": 0.1562630917469627,
"grad_norm": 0.6655667424201965,
"learning_rate": 1.9376623376623377e-05,
"loss": 1.4267,
"step": 373
},
{
"epoch": 0.15668202764976957,
"grad_norm": 0.6063106656074524,
"learning_rate": 1.937494763301215e-05,
"loss": 1.5867,
"step": 374
},
{
"epoch": 0.15710096355257647,
"grad_norm": 0.9726372361183167,
"learning_rate": 1.9373271889400926e-05,
"loss": 1.4887,
"step": 375
},
{
"epoch": 0.15751989945538333,
"grad_norm": 0.711313784122467,
"learning_rate": 1.9371596145789696e-05,
"loss": 1.5464,
"step": 376
},
{
"epoch": 0.1579388353581902,
"grad_norm": 0.6071950197219849,
"learning_rate": 1.9369920402178467e-05,
"loss": 1.4917,
"step": 377
},
{
"epoch": 0.15835777126099707,
"grad_norm": 0.7539801597595215,
"learning_rate": 1.936824465856724e-05,
"loss": 1.4267,
"step": 378
},
{
"epoch": 0.15877670716380393,
"grad_norm": 0.5340871810913086,
"learning_rate": 1.9366568914956015e-05,
"loss": 1.4227,
"step": 379
},
{
"epoch": 0.1591956430666108,
"grad_norm": 0.7538002133369446,
"learning_rate": 1.9364893171344786e-05,
"loss": 1.5565,
"step": 380
},
{
"epoch": 0.1596145789694177,
"grad_norm": 0.6404510736465454,
"learning_rate": 1.9363217427733557e-05,
"loss": 1.5314,
"step": 381
},
{
"epoch": 0.16003351487222456,
"grad_norm": 0.6506287455558777,
"learning_rate": 1.936154168412233e-05,
"loss": 1.5931,
"step": 382
},
{
"epoch": 0.16045245077503142,
"grad_norm": 0.6741127967834473,
"learning_rate": 1.9359865940511105e-05,
"loss": 1.5172,
"step": 383
},
{
"epoch": 0.1608713866778383,
"grad_norm": 0.6476618647575378,
"learning_rate": 1.9358190196899876e-05,
"loss": 1.5038,
"step": 384
},
{
"epoch": 0.16129032258064516,
"grad_norm": 0.7657164931297302,
"learning_rate": 1.935651445328865e-05,
"loss": 1.461,
"step": 385
},
{
"epoch": 0.16170925848345202,
"grad_norm": 0.49177902936935425,
"learning_rate": 1.935483870967742e-05,
"loss": 1.4751,
"step": 386
},
{
"epoch": 0.1621281943862589,
"grad_norm": 1.3233708143234253,
"learning_rate": 1.935316296606619e-05,
"loss": 1.4353,
"step": 387
},
{
"epoch": 0.16254713028906578,
"grad_norm": 0.9667218923568726,
"learning_rate": 1.9351487222454966e-05,
"loss": 1.4472,
"step": 388
},
{
"epoch": 0.16296606619187265,
"grad_norm": 0.6916482448577881,
"learning_rate": 1.934981147884374e-05,
"loss": 1.5265,
"step": 389
},
{
"epoch": 0.16338500209467952,
"grad_norm": 0.870297372341156,
"learning_rate": 1.934813573523251e-05,
"loss": 1.5272,
"step": 390
},
{
"epoch": 0.16380393799748638,
"grad_norm": 0.48826926946640015,
"learning_rate": 1.934645999162128e-05,
"loss": 1.4762,
"step": 391
},
{
"epoch": 0.16422287390029325,
"grad_norm": 0.8552863597869873,
"learning_rate": 1.9344784248010055e-05,
"loss": 1.4508,
"step": 392
},
{
"epoch": 0.16464180980310011,
"grad_norm": 0.626533567905426,
"learning_rate": 1.934310850439883e-05,
"loss": 1.4639,
"step": 393
},
{
"epoch": 0.165060745705907,
"grad_norm": 0.6926416158676147,
"learning_rate": 1.93414327607876e-05,
"loss": 1.5507,
"step": 394
},
{
"epoch": 0.16547968160871387,
"grad_norm": 0.613665759563446,
"learning_rate": 1.933975701717637e-05,
"loss": 1.5496,
"step": 395
},
{
"epoch": 0.16589861751152074,
"grad_norm": 0.7060792446136475,
"learning_rate": 1.9338081273565145e-05,
"loss": 1.4823,
"step": 396
},
{
"epoch": 0.1663175534143276,
"grad_norm": 0.6209156513214111,
"learning_rate": 1.933640552995392e-05,
"loss": 1.5661,
"step": 397
},
{
"epoch": 0.16673648931713447,
"grad_norm": 0.7241356372833252,
"learning_rate": 1.9334729786342693e-05,
"loss": 1.5442,
"step": 398
},
{
"epoch": 0.16715542521994134,
"grad_norm": 0.5998069643974304,
"learning_rate": 1.9333054042731464e-05,
"loss": 1.5247,
"step": 399
},
{
"epoch": 0.16757436112274823,
"grad_norm": 0.5730302929878235,
"learning_rate": 1.9331378299120235e-05,
"loss": 1.5973,
"step": 400
},
{
"epoch": 0.1679932970255551,
"grad_norm": 0.649454653263092,
"learning_rate": 1.932970255550901e-05,
"loss": 1.5839,
"step": 401
},
{
"epoch": 0.16841223292836197,
"grad_norm": 0.5908178687095642,
"learning_rate": 1.9328026811897783e-05,
"loss": 1.4055,
"step": 402
},
{
"epoch": 0.16883116883116883,
"grad_norm": 0.4675125181674957,
"learning_rate": 1.9326351068286554e-05,
"loss": 1.5132,
"step": 403
},
{
"epoch": 0.1692501047339757,
"grad_norm": 0.738477885723114,
"learning_rate": 1.9324675324675325e-05,
"loss": 1.3856,
"step": 404
},
{
"epoch": 0.16966904063678256,
"grad_norm": 0.5623185634613037,
"learning_rate": 1.93229995810641e-05,
"loss": 1.5222,
"step": 405
},
{
"epoch": 0.17008797653958943,
"grad_norm": 0.6933448314666748,
"learning_rate": 1.9321323837452873e-05,
"loss": 1.6141,
"step": 406
},
{
"epoch": 0.17050691244239632,
"grad_norm": 0.6374297142028809,
"learning_rate": 1.9319648093841644e-05,
"loss": 1.5482,
"step": 407
},
{
"epoch": 0.1709258483452032,
"grad_norm": 0.692150354385376,
"learning_rate": 1.9317972350230418e-05,
"loss": 1.4039,
"step": 408
},
{
"epoch": 0.17134478424801006,
"grad_norm": 0.5226042866706848,
"learning_rate": 1.931629660661919e-05,
"loss": 1.4807,
"step": 409
},
{
"epoch": 0.17176372015081692,
"grad_norm": 0.602898895740509,
"learning_rate": 1.9314620863007963e-05,
"loss": 1.4713,
"step": 410
},
{
"epoch": 0.1721826560536238,
"grad_norm": 0.6459433436393738,
"learning_rate": 1.9312945119396733e-05,
"loss": 1.4061,
"step": 411
},
{
"epoch": 0.17260159195643066,
"grad_norm": 0.6216537356376648,
"learning_rate": 1.9311269375785507e-05,
"loss": 1.4756,
"step": 412
},
{
"epoch": 0.17302052785923755,
"grad_norm": 0.5865094065666199,
"learning_rate": 1.9309593632174278e-05,
"loss": 1.5048,
"step": 413
},
{
"epoch": 0.17343946376204442,
"grad_norm": 0.689116895198822,
"learning_rate": 1.9307917888563052e-05,
"loss": 1.5203,
"step": 414
},
{
"epoch": 0.17385839966485128,
"grad_norm": 0.5802826285362244,
"learning_rate": 1.9306242144951823e-05,
"loss": 1.4162,
"step": 415
},
{
"epoch": 0.17427733556765815,
"grad_norm": 0.6490867733955383,
"learning_rate": 1.9304566401340597e-05,
"loss": 1.4837,
"step": 416
},
{
"epoch": 0.17469627147046501,
"grad_norm": 0.538263738155365,
"learning_rate": 1.9302890657729368e-05,
"loss": 1.4829,
"step": 417
},
{
"epoch": 0.17511520737327188,
"grad_norm": 0.5698223114013672,
"learning_rate": 1.9301214914118142e-05,
"loss": 1.5414,
"step": 418
},
{
"epoch": 0.17553414327607875,
"grad_norm": 0.5576086640357971,
"learning_rate": 1.9299539170506913e-05,
"loss": 1.5554,
"step": 419
},
{
"epoch": 0.17595307917888564,
"grad_norm": 0.5333095192909241,
"learning_rate": 1.9297863426895687e-05,
"loss": 1.5389,
"step": 420
},
{
"epoch": 0.1763720150816925,
"grad_norm": 0.6559884548187256,
"learning_rate": 1.929618768328446e-05,
"loss": 1.4234,
"step": 421
},
{
"epoch": 0.17679095098449937,
"grad_norm": 0.5460498332977295,
"learning_rate": 1.9294511939673232e-05,
"loss": 1.39,
"step": 422
},
{
"epoch": 0.17720988688730624,
"grad_norm": 0.636465311050415,
"learning_rate": 1.9292836196062003e-05,
"loss": 1.5061,
"step": 423
},
{
"epoch": 0.1776288227901131,
"grad_norm": 1.0990898609161377,
"learning_rate": 1.9291160452450777e-05,
"loss": 1.5252,
"step": 424
},
{
"epoch": 0.17804775869291997,
"grad_norm": 0.7239777445793152,
"learning_rate": 1.928948470883955e-05,
"loss": 1.6089,
"step": 425
},
{
"epoch": 0.17846669459572687,
"grad_norm": 0.5494760870933533,
"learning_rate": 1.928780896522832e-05,
"loss": 1.5254,
"step": 426
},
{
"epoch": 0.17888563049853373,
"grad_norm": 0.8444346189498901,
"learning_rate": 1.9286133221617092e-05,
"loss": 1.5971,
"step": 427
},
{
"epoch": 0.1793045664013406,
"grad_norm": 0.6326781511306763,
"learning_rate": 1.9284457478005866e-05,
"loss": 1.6235,
"step": 428
},
{
"epoch": 0.17972350230414746,
"grad_norm": 1.274570107460022,
"learning_rate": 1.928278173439464e-05,
"loss": 1.3773,
"step": 429
},
{
"epoch": 0.18014243820695433,
"grad_norm": 0.8181143999099731,
"learning_rate": 1.928110599078341e-05,
"loss": 1.5252,
"step": 430
},
{
"epoch": 0.1805613741097612,
"grad_norm": 1.0652360916137695,
"learning_rate": 1.9279430247172185e-05,
"loss": 1.4825,
"step": 431
},
{
"epoch": 0.1809803100125681,
"grad_norm": 0.8766518831253052,
"learning_rate": 1.9277754503560956e-05,
"loss": 1.5517,
"step": 432
},
{
"epoch": 0.18139924591537496,
"grad_norm": 0.730766236782074,
"learning_rate": 1.927607875994973e-05,
"loss": 1.4122,
"step": 433
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.8081750869750977,
"learning_rate": 1.92744030163385e-05,
"loss": 1.4633,
"step": 434
},
{
"epoch": 0.1822371177209887,
"grad_norm": 0.5756118297576904,
"learning_rate": 1.9272727272727275e-05,
"loss": 1.5168,
"step": 435
},
{
"epoch": 0.18265605362379556,
"grad_norm": 0.7646397948265076,
"learning_rate": 1.9271051529116046e-05,
"loss": 1.3793,
"step": 436
},
{
"epoch": 0.18307498952660242,
"grad_norm": 0.5722802877426147,
"learning_rate": 1.926937578550482e-05,
"loss": 1.536,
"step": 437
},
{
"epoch": 0.1834939254294093,
"grad_norm": 0.6689785718917847,
"learning_rate": 1.926770004189359e-05,
"loss": 1.5591,
"step": 438
},
{
"epoch": 0.18391286133221618,
"grad_norm": 0.6856290102005005,
"learning_rate": 1.9266024298282365e-05,
"loss": 1.4544,
"step": 439
},
{
"epoch": 0.18433179723502305,
"grad_norm": 0.6789899468421936,
"learning_rate": 1.9264348554671136e-05,
"loss": 1.4653,
"step": 440
},
{
"epoch": 0.18475073313782991,
"grad_norm": 0.7284293174743652,
"learning_rate": 1.926267281105991e-05,
"loss": 1.5527,
"step": 441
},
{
"epoch": 0.18516966904063678,
"grad_norm": 0.7645947337150574,
"learning_rate": 1.926099706744868e-05,
"loss": 1.5051,
"step": 442
},
{
"epoch": 0.18558860494344365,
"grad_norm": 0.719478964805603,
"learning_rate": 1.9259321323837455e-05,
"loss": 1.5189,
"step": 443
},
{
"epoch": 0.1860075408462505,
"grad_norm": 0.695095419883728,
"learning_rate": 1.925764558022623e-05,
"loss": 1.5547,
"step": 444
},
{
"epoch": 0.1864264767490574,
"grad_norm": 0.8511472344398499,
"learning_rate": 1.9255969836615e-05,
"loss": 1.5407,
"step": 445
},
{
"epoch": 0.18684541265186427,
"grad_norm": 0.8481591939926147,
"learning_rate": 1.925429409300377e-05,
"loss": 1.4753,
"step": 446
},
{
"epoch": 0.18726434855467114,
"grad_norm": 0.5671373009681702,
"learning_rate": 1.9252618349392544e-05,
"loss": 1.6334,
"step": 447
},
{
"epoch": 0.187683284457478,
"grad_norm": 0.9575358629226685,
"learning_rate": 1.925094260578132e-05,
"loss": 1.4334,
"step": 448
},
{
"epoch": 0.18810222036028487,
"grad_norm": 0.7267637252807617,
"learning_rate": 1.924926686217009e-05,
"loss": 1.369,
"step": 449
},
{
"epoch": 0.18852115626309174,
"grad_norm": 0.9400660991668701,
"learning_rate": 1.924759111855886e-05,
"loss": 1.4424,
"step": 450
},
{
"epoch": 0.1889400921658986,
"grad_norm": 0.6154212951660156,
"learning_rate": 1.9245915374947634e-05,
"loss": 1.5379,
"step": 451
},
{
"epoch": 0.1893590280687055,
"grad_norm": 0.992761492729187,
"learning_rate": 1.9244239631336408e-05,
"loss": 1.4291,
"step": 452
},
{
"epoch": 0.18977796397151236,
"grad_norm": 0.8219314217567444,
"learning_rate": 1.924256388772518e-05,
"loss": 1.4851,
"step": 453
},
{
"epoch": 0.19019689987431923,
"grad_norm": 0.6872371435165405,
"learning_rate": 1.9240888144113953e-05,
"loss": 1.4687,
"step": 454
},
{
"epoch": 0.1906158357771261,
"grad_norm": 0.7827102541923523,
"learning_rate": 1.9239212400502724e-05,
"loss": 1.4223,
"step": 455
},
{
"epoch": 0.19103477167993296,
"grad_norm": 0.6476536989212036,
"learning_rate": 1.9237536656891498e-05,
"loss": 1.4416,
"step": 456
},
{
"epoch": 0.19145370758273983,
"grad_norm": 0.8523492813110352,
"learning_rate": 1.923586091328027e-05,
"loss": 1.4081,
"step": 457
},
{
"epoch": 0.19187264348554672,
"grad_norm": 0.7393168807029724,
"learning_rate": 1.9234185169669043e-05,
"loss": 1.4165,
"step": 458
},
{
"epoch": 0.1922915793883536,
"grad_norm": 0.5783189535140991,
"learning_rate": 1.9232509426057814e-05,
"loss": 1.5124,
"step": 459
},
{
"epoch": 0.19271051529116046,
"grad_norm": 0.5648137927055359,
"learning_rate": 1.9230833682446588e-05,
"loss": 1.4118,
"step": 460
},
{
"epoch": 0.19312945119396732,
"grad_norm": 0.6089081168174744,
"learning_rate": 1.922915793883536e-05,
"loss": 1.5284,
"step": 461
},
{
"epoch": 0.1935483870967742,
"grad_norm": 0.4999605417251587,
"learning_rate": 1.9227482195224133e-05,
"loss": 1.5311,
"step": 462
},
{
"epoch": 0.19396732299958105,
"grad_norm": 0.5938348174095154,
"learning_rate": 1.9225806451612907e-05,
"loss": 1.4788,
"step": 463
},
{
"epoch": 0.19438625890238795,
"grad_norm": 0.547335684299469,
"learning_rate": 1.9224130708001677e-05,
"loss": 1.4408,
"step": 464
},
{
"epoch": 0.19480519480519481,
"grad_norm": 0.6379075050354004,
"learning_rate": 1.9222454964390448e-05,
"loss": 1.5705,
"step": 465
},
{
"epoch": 0.19522413070800168,
"grad_norm": 0.5942078828811646,
"learning_rate": 1.9220779220779222e-05,
"loss": 1.4643,
"step": 466
},
{
"epoch": 0.19564306661080855,
"grad_norm": 0.5671261548995972,
"learning_rate": 1.9219103477167996e-05,
"loss": 1.5048,
"step": 467
},
{
"epoch": 0.1960620025136154,
"grad_norm": 0.5360516905784607,
"learning_rate": 1.9217427733556767e-05,
"loss": 1.4651,
"step": 468
},
{
"epoch": 0.19648093841642228,
"grad_norm": 0.6140695810317993,
"learning_rate": 1.9215751989945538e-05,
"loss": 1.4735,
"step": 469
},
{
"epoch": 0.19689987431922915,
"grad_norm": 0.573509931564331,
"learning_rate": 1.9214076246334312e-05,
"loss": 1.459,
"step": 470
},
{
"epoch": 0.19731881022203604,
"grad_norm": 0.6246591210365295,
"learning_rate": 1.9212400502723086e-05,
"loss": 1.4701,
"step": 471
},
{
"epoch": 0.1977377461248429,
"grad_norm": 0.5583751797676086,
"learning_rate": 1.9210724759111857e-05,
"loss": 1.6073,
"step": 472
},
{
"epoch": 0.19815668202764977,
"grad_norm": 0.7271811962127686,
"learning_rate": 1.9209049015500628e-05,
"loss": 1.4531,
"step": 473
},
{
"epoch": 0.19857561793045664,
"grad_norm": 0.7514092922210693,
"learning_rate": 1.9207373271889402e-05,
"loss": 1.4609,
"step": 474
},
{
"epoch": 0.1989945538332635,
"grad_norm": 0.6135318875312805,
"learning_rate": 1.9205697528278176e-05,
"loss": 1.46,
"step": 475
},
{
"epoch": 0.19941348973607037,
"grad_norm": 0.8104654550552368,
"learning_rate": 1.920402178466695e-05,
"loss": 1.3627,
"step": 476
},
{
"epoch": 0.19983242563887726,
"grad_norm": 0.6633170247077942,
"learning_rate": 1.920234604105572e-05,
"loss": 1.52,
"step": 477
},
{
"epoch": 0.20025136154168413,
"grad_norm": 0.7291746735572815,
"learning_rate": 1.920067029744449e-05,
"loss": 1.547,
"step": 478
},
{
"epoch": 0.200670297444491,
"grad_norm": 0.6674147248268127,
"learning_rate": 1.9198994553833266e-05,
"loss": 1.4795,
"step": 479
},
{
"epoch": 0.20108923334729786,
"grad_norm": 0.9536328911781311,
"learning_rate": 1.919731881022204e-05,
"loss": 1.4581,
"step": 480
},
{
"epoch": 0.20150816925010473,
"grad_norm": 0.8882992267608643,
"learning_rate": 1.919564306661081e-05,
"loss": 1.4351,
"step": 481
},
{
"epoch": 0.2019271051529116,
"grad_norm": 0.5723230838775635,
"learning_rate": 1.919396732299958e-05,
"loss": 1.5095,
"step": 482
},
{
"epoch": 0.20234604105571846,
"grad_norm": 1.1361421346664429,
"learning_rate": 1.9192291579388355e-05,
"loss": 1.4452,
"step": 483
},
{
"epoch": 0.20276497695852536,
"grad_norm": 0.5762605667114258,
"learning_rate": 1.9190615835777126e-05,
"loss": 1.5695,
"step": 484
},
{
"epoch": 0.20318391286133222,
"grad_norm": 0.9020100831985474,
"learning_rate": 1.91889400921659e-05,
"loss": 1.5157,
"step": 485
},
{
"epoch": 0.2036028487641391,
"grad_norm": 0.6302100419998169,
"learning_rate": 1.9187264348554674e-05,
"loss": 1.5346,
"step": 486
},
{
"epoch": 0.20402178466694595,
"grad_norm": 1.1093989610671997,
"learning_rate": 1.9185588604943445e-05,
"loss": 1.5348,
"step": 487
},
{
"epoch": 0.20444072056975282,
"grad_norm": 0.637134850025177,
"learning_rate": 1.9183912861332216e-05,
"loss": 1.4478,
"step": 488
},
{
"epoch": 0.2048596564725597,
"grad_norm": 0.9395270943641663,
"learning_rate": 1.918223711772099e-05,
"loss": 1.4659,
"step": 489
},
{
"epoch": 0.20527859237536658,
"grad_norm": 0.7030125260353088,
"learning_rate": 1.9180561374109764e-05,
"loss": 1.3621,
"step": 490
},
{
"epoch": 0.20569752827817345,
"grad_norm": 0.5922579169273376,
"learning_rate": 1.9178885630498535e-05,
"loss": 1.644,
"step": 491
},
{
"epoch": 0.2061164641809803,
"grad_norm": 1.0161982774734497,
"learning_rate": 1.9177209886887306e-05,
"loss": 1.3895,
"step": 492
},
{
"epoch": 0.20653540008378718,
"grad_norm": 1.179316759109497,
"learning_rate": 1.917553414327608e-05,
"loss": 1.4576,
"step": 493
},
{
"epoch": 0.20695433598659405,
"grad_norm": 0.8130103945732117,
"learning_rate": 1.9173858399664854e-05,
"loss": 1.4637,
"step": 494
},
{
"epoch": 0.2073732718894009,
"grad_norm": 0.7000212669372559,
"learning_rate": 1.9172182656053625e-05,
"loss": 1.5152,
"step": 495
},
{
"epoch": 0.2077922077922078,
"grad_norm": 0.8932341933250427,
"learning_rate": 1.91705069124424e-05,
"loss": 1.4741,
"step": 496
},
{
"epoch": 0.20821114369501467,
"grad_norm": 1.3901516199111938,
"learning_rate": 1.916883116883117e-05,
"loss": 1.4953,
"step": 497
},
{
"epoch": 0.20863007959782154,
"grad_norm": 1.0053791999816895,
"learning_rate": 1.9167155425219944e-05,
"loss": 1.488,
"step": 498
},
{
"epoch": 0.2090490155006284,
"grad_norm": 0.5231297612190247,
"learning_rate": 1.9165479681608718e-05,
"loss": 1.4963,
"step": 499
},
{
"epoch": 0.20946795140343527,
"grad_norm": 0.8381982445716858,
"learning_rate": 1.916380393799749e-05,
"loss": 1.5261,
"step": 500
},
{
"epoch": 0.20988688730624214,
"grad_norm": 0.9278205633163452,
"learning_rate": 1.916212819438626e-05,
"loss": 1.3259,
"step": 501
},
{
"epoch": 0.210305823209049,
"grad_norm": 0.9388899803161621,
"learning_rate": 1.9160452450775033e-05,
"loss": 1.4091,
"step": 502
},
{
"epoch": 0.2107247591118559,
"grad_norm": 1.0538358688354492,
"learning_rate": 1.9158776707163807e-05,
"loss": 1.4633,
"step": 503
},
{
"epoch": 0.21114369501466276,
"grad_norm": 0.6670867800712585,
"learning_rate": 1.9157100963552578e-05,
"loss": 1.5639,
"step": 504
},
{
"epoch": 0.21156263091746963,
"grad_norm": 1.7608953714370728,
"learning_rate": 1.915542521994135e-05,
"loss": 1.4433,
"step": 505
},
{
"epoch": 0.2119815668202765,
"grad_norm": 1.1406441926956177,
"learning_rate": 1.9153749476330123e-05,
"loss": 1.3938,
"step": 506
},
{
"epoch": 0.21240050272308336,
"grad_norm": 1.0751662254333496,
"learning_rate": 1.9152073732718897e-05,
"loss": 1.5059,
"step": 507
},
{
"epoch": 0.21281943862589023,
"grad_norm": 1.7127149105072021,
"learning_rate": 1.9150397989107668e-05,
"loss": 1.4072,
"step": 508
},
{
"epoch": 0.21323837452869712,
"grad_norm": 0.7607043385505676,
"learning_rate": 1.9148722245496442e-05,
"loss": 1.43,
"step": 509
},
{
"epoch": 0.213657310431504,
"grad_norm": 1.0876507759094238,
"learning_rate": 1.9147046501885213e-05,
"loss": 1.4219,
"step": 510
},
{
"epoch": 0.21407624633431085,
"grad_norm": 1.704974889755249,
"learning_rate": 1.9145370758273987e-05,
"loss": 1.5058,
"step": 511
},
{
"epoch": 0.21449518223711772,
"grad_norm": 1.3107315301895142,
"learning_rate": 1.9143695014662758e-05,
"loss": 1.3701,
"step": 512
},
{
"epoch": 0.2149141181399246,
"grad_norm": 0.72137850522995,
"learning_rate": 1.9142019271051532e-05,
"loss": 1.4608,
"step": 513
},
{
"epoch": 0.21533305404273145,
"grad_norm": 0.9491326808929443,
"learning_rate": 1.9140343527440303e-05,
"loss": 1.3998,
"step": 514
},
{
"epoch": 0.21575198994553832,
"grad_norm": 0.9301955103874207,
"learning_rate": 1.9138667783829073e-05,
"loss": 1.5215,
"step": 515
},
{
"epoch": 0.2161709258483452,
"grad_norm": 0.8181395530700684,
"learning_rate": 1.9136992040217847e-05,
"loss": 1.4322,
"step": 516
},
{
"epoch": 0.21658986175115208,
"grad_norm": 0.7477272748947144,
"learning_rate": 1.913531629660662e-05,
"loss": 1.4069,
"step": 517
},
{
"epoch": 0.21700879765395895,
"grad_norm": 0.8078567385673523,
"learning_rate": 1.9133640552995392e-05,
"loss": 1.4061,
"step": 518
},
{
"epoch": 0.2174277335567658,
"grad_norm": 0.6401208639144897,
"learning_rate": 1.9131964809384166e-05,
"loss": 1.4146,
"step": 519
},
{
"epoch": 0.21784666945957268,
"grad_norm": 1.1645647287368774,
"learning_rate": 1.9130289065772937e-05,
"loss": 1.4205,
"step": 520
},
{
"epoch": 0.21826560536237954,
"grad_norm": 0.5319536328315735,
"learning_rate": 1.912861332216171e-05,
"loss": 1.4255,
"step": 521
},
{
"epoch": 0.21868454126518644,
"grad_norm": 0.7974638938903809,
"learning_rate": 1.9126937578550485e-05,
"loss": 1.4991,
"step": 522
},
{
"epoch": 0.2191034771679933,
"grad_norm": 0.7385554313659668,
"learning_rate": 1.9125261834939256e-05,
"loss": 1.4917,
"step": 523
},
{
"epoch": 0.21952241307080017,
"grad_norm": 0.6328215003013611,
"learning_rate": 1.9123586091328027e-05,
"loss": 1.4466,
"step": 524
},
{
"epoch": 0.21994134897360704,
"grad_norm": 0.9712186455726624,
"learning_rate": 1.91219103477168e-05,
"loss": 1.3888,
"step": 525
},
{
"epoch": 0.2203602848764139,
"grad_norm": 0.8162040710449219,
"learning_rate": 1.9120234604105575e-05,
"loss": 1.4592,
"step": 526
},
{
"epoch": 0.22077922077922077,
"grad_norm": 0.6039196252822876,
"learning_rate": 1.9118558860494346e-05,
"loss": 1.4641,
"step": 527
},
{
"epoch": 0.22119815668202766,
"grad_norm": 0.677533745765686,
"learning_rate": 1.9116883116883117e-05,
"loss": 1.4318,
"step": 528
},
{
"epoch": 0.22161709258483453,
"grad_norm": 0.5307941436767578,
"learning_rate": 1.911520737327189e-05,
"loss": 1.4567,
"step": 529
},
{
"epoch": 0.2220360284876414,
"grad_norm": 0.6290689706802368,
"learning_rate": 1.9113531629660665e-05,
"loss": 1.5327,
"step": 530
},
{
"epoch": 0.22245496439044826,
"grad_norm": 0.668403685092926,
"learning_rate": 1.9111855886049436e-05,
"loss": 1.5186,
"step": 531
},
{
"epoch": 0.22287390029325513,
"grad_norm": 0.8590381145477295,
"learning_rate": 1.911018014243821e-05,
"loss": 1.4103,
"step": 532
},
{
"epoch": 0.223292836196062,
"grad_norm": 0.8472411036491394,
"learning_rate": 1.910850439882698e-05,
"loss": 1.4171,
"step": 533
},
{
"epoch": 0.22371177209886886,
"grad_norm": 0.8556522130966187,
"learning_rate": 1.9106828655215755e-05,
"loss": 1.4334,
"step": 534
},
{
"epoch": 0.22413070800167575,
"grad_norm": 0.6294506192207336,
"learning_rate": 1.9105152911604525e-05,
"loss": 1.4425,
"step": 535
},
{
"epoch": 0.22454964390448262,
"grad_norm": 0.8159162402153015,
"learning_rate": 1.91034771679933e-05,
"loss": 1.5275,
"step": 536
},
{
"epoch": 0.2249685798072895,
"grad_norm": 0.7663982510566711,
"learning_rate": 1.910180142438207e-05,
"loss": 1.4252,
"step": 537
},
{
"epoch": 0.22538751571009635,
"grad_norm": 0.6042879223823547,
"learning_rate": 1.9100125680770844e-05,
"loss": 1.4374,
"step": 538
},
{
"epoch": 0.22580645161290322,
"grad_norm": 0.5414412617683411,
"learning_rate": 1.9098449937159615e-05,
"loss": 1.4921,
"step": 539
},
{
"epoch": 0.22622538751571009,
"grad_norm": 0.7135291695594788,
"learning_rate": 1.909677419354839e-05,
"loss": 1.5495,
"step": 540
},
{
"epoch": 0.22664432341851698,
"grad_norm": 0.5732749104499817,
"learning_rate": 1.9095098449937163e-05,
"loss": 1.3577,
"step": 541
},
{
"epoch": 0.22706325932132385,
"grad_norm": 0.6375070810317993,
"learning_rate": 1.9093422706325934e-05,
"loss": 1.4254,
"step": 542
},
{
"epoch": 0.2274821952241307,
"grad_norm": 0.6467480063438416,
"learning_rate": 1.9091746962714705e-05,
"loss": 1.4277,
"step": 543
},
{
"epoch": 0.22790113112693758,
"grad_norm": 0.6565422415733337,
"learning_rate": 1.909007121910348e-05,
"loss": 1.55,
"step": 544
},
{
"epoch": 0.22832006702974444,
"grad_norm": 0.6464892625808716,
"learning_rate": 1.9088395475492253e-05,
"loss": 1.4626,
"step": 545
},
{
"epoch": 0.2287390029325513,
"grad_norm": 0.6357992887496948,
"learning_rate": 1.9086719731881024e-05,
"loss": 1.4343,
"step": 546
},
{
"epoch": 0.22915793883535818,
"grad_norm": 0.6153067946434021,
"learning_rate": 1.9085043988269795e-05,
"loss": 1.4146,
"step": 547
},
{
"epoch": 0.22957687473816507,
"grad_norm": 0.7078065276145935,
"learning_rate": 1.908336824465857e-05,
"loss": 1.4147,
"step": 548
},
{
"epoch": 0.22999581064097194,
"grad_norm": 0.6610170602798462,
"learning_rate": 1.9081692501047343e-05,
"loss": 1.5137,
"step": 549
},
{
"epoch": 0.2304147465437788,
"grad_norm": 0.6916443705558777,
"learning_rate": 1.9080016757436114e-05,
"loss": 1.4726,
"step": 550
},
{
"epoch": 0.23083368244658567,
"grad_norm": 0.668258547782898,
"learning_rate": 1.9078341013824884e-05,
"loss": 1.4796,
"step": 551
},
{
"epoch": 0.23125261834939254,
"grad_norm": 0.6234722137451172,
"learning_rate": 1.907666527021366e-05,
"loss": 1.4962,
"step": 552
},
{
"epoch": 0.2316715542521994,
"grad_norm": 0.7003756761550903,
"learning_rate": 1.9074989526602433e-05,
"loss": 1.4681,
"step": 553
},
{
"epoch": 0.2320904901550063,
"grad_norm": 0.6396431922912598,
"learning_rate": 1.9073313782991203e-05,
"loss": 1.3989,
"step": 554
},
{
"epoch": 0.23250942605781316,
"grad_norm": 0.6359259486198425,
"learning_rate": 1.9071638039379977e-05,
"loss": 1.4533,
"step": 555
},
{
"epoch": 0.23292836196062003,
"grad_norm": 0.6927059292793274,
"learning_rate": 1.9069962295768748e-05,
"loss": 1.4107,
"step": 556
},
{
"epoch": 0.2333472978634269,
"grad_norm": 0.633521318435669,
"learning_rate": 1.9068286552157522e-05,
"loss": 1.4099,
"step": 557
},
{
"epoch": 0.23376623376623376,
"grad_norm": 0.6133984923362732,
"learning_rate": 1.9066610808546293e-05,
"loss": 1.425,
"step": 558
},
{
"epoch": 0.23418516966904063,
"grad_norm": 0.661888062953949,
"learning_rate": 1.9064935064935067e-05,
"loss": 1.5631,
"step": 559
},
{
"epoch": 0.23460410557184752,
"grad_norm": 0.645517110824585,
"learning_rate": 1.9063259321323838e-05,
"loss": 1.4026,
"step": 560
},
{
"epoch": 0.2350230414746544,
"grad_norm": 0.5653700828552246,
"learning_rate": 1.9061583577712612e-05,
"loss": 1.413,
"step": 561
},
{
"epoch": 0.23544197737746125,
"grad_norm": 0.7444579601287842,
"learning_rate": 1.9059907834101383e-05,
"loss": 1.4424,
"step": 562
},
{
"epoch": 0.23586091328026812,
"grad_norm": 0.618651270866394,
"learning_rate": 1.9058232090490157e-05,
"loss": 1.6035,
"step": 563
},
{
"epoch": 0.23627984918307499,
"grad_norm": 0.6566857695579529,
"learning_rate": 1.905655634687893e-05,
"loss": 1.3807,
"step": 564
},
{
"epoch": 0.23669878508588185,
"grad_norm": 0.5518476963043213,
"learning_rate": 1.9054880603267702e-05,
"loss": 1.5754,
"step": 565
},
{
"epoch": 0.23711772098868872,
"grad_norm": 0.5864123106002808,
"learning_rate": 1.9053204859656472e-05,
"loss": 1.4076,
"step": 566
},
{
"epoch": 0.2375366568914956,
"grad_norm": 0.7522138953208923,
"learning_rate": 1.9051529116045247e-05,
"loss": 1.439,
"step": 567
},
{
"epoch": 0.23795559279430248,
"grad_norm": 0.616507351398468,
"learning_rate": 1.904985337243402e-05,
"loss": 1.463,
"step": 568
},
{
"epoch": 0.23837452869710934,
"grad_norm": 0.5815699100494385,
"learning_rate": 1.904817762882279e-05,
"loss": 1.4541,
"step": 569
},
{
"epoch": 0.2387934645999162,
"grad_norm": 0.6003520488739014,
"learning_rate": 1.9046501885211562e-05,
"loss": 1.4897,
"step": 570
},
{
"epoch": 0.23921240050272308,
"grad_norm": 0.6311242580413818,
"learning_rate": 1.9044826141600336e-05,
"loss": 1.4004,
"step": 571
},
{
"epoch": 0.23963133640552994,
"grad_norm": 0.607297956943512,
"learning_rate": 1.904315039798911e-05,
"loss": 1.5698,
"step": 572
},
{
"epoch": 0.24005027230833684,
"grad_norm": 0.7055544257164001,
"learning_rate": 1.904147465437788e-05,
"loss": 1.4389,
"step": 573
},
{
"epoch": 0.2404692082111437,
"grad_norm": 0.7283361554145813,
"learning_rate": 1.9039798910766655e-05,
"loss": 1.4967,
"step": 574
},
{
"epoch": 0.24088814411395057,
"grad_norm": 0.688834547996521,
"learning_rate": 1.9038123167155426e-05,
"loss": 1.4883,
"step": 575
},
{
"epoch": 0.24130708001675744,
"grad_norm": 0.47785529494285583,
"learning_rate": 1.90364474235442e-05,
"loss": 1.5188,
"step": 576
},
{
"epoch": 0.2417260159195643,
"grad_norm": 0.9133197069168091,
"learning_rate": 1.903477167993297e-05,
"loss": 1.3748,
"step": 577
},
{
"epoch": 0.24214495182237117,
"grad_norm": 0.6180113554000854,
"learning_rate": 1.9033095936321745e-05,
"loss": 1.4213,
"step": 578
},
{
"epoch": 0.24256388772517803,
"grad_norm": 0.6890195608139038,
"learning_rate": 1.9031420192710516e-05,
"loss": 1.3621,
"step": 579
},
{
"epoch": 0.24298282362798493,
"grad_norm": 0.6566532850265503,
"learning_rate": 1.902974444909929e-05,
"loss": 1.3666,
"step": 580
},
{
"epoch": 0.2434017595307918,
"grad_norm": 0.572672963142395,
"learning_rate": 1.902806870548806e-05,
"loss": 1.3654,
"step": 581
},
{
"epoch": 0.24382069543359866,
"grad_norm": 0.698391318321228,
"learning_rate": 1.9026392961876835e-05,
"loss": 1.4244,
"step": 582
},
{
"epoch": 0.24423963133640553,
"grad_norm": 0.5516411066055298,
"learning_rate": 1.9024717218265606e-05,
"loss": 1.5141,
"step": 583
},
{
"epoch": 0.2446585672392124,
"grad_norm": 0.6122570037841797,
"learning_rate": 1.902304147465438e-05,
"loss": 1.4333,
"step": 584
},
{
"epoch": 0.24507750314201926,
"grad_norm": 0.7202199697494507,
"learning_rate": 1.902136573104315e-05,
"loss": 1.4562,
"step": 585
},
{
"epoch": 0.24549643904482615,
"grad_norm": 0.938453733921051,
"learning_rate": 1.9019689987431925e-05,
"loss": 1.4605,
"step": 586
},
{
"epoch": 0.24591537494763302,
"grad_norm": 0.5673055648803711,
"learning_rate": 1.90180142438207e-05,
"loss": 1.4827,
"step": 587
},
{
"epoch": 0.24633431085043989,
"grad_norm": 0.677293062210083,
"learning_rate": 1.901633850020947e-05,
"loss": 1.375,
"step": 588
},
{
"epoch": 0.24675324675324675,
"grad_norm": 0.7138928771018982,
"learning_rate": 1.901466275659824e-05,
"loss": 1.541,
"step": 589
},
{
"epoch": 0.24717218265605362,
"grad_norm": 0.7003957033157349,
"learning_rate": 1.9012987012987014e-05,
"loss": 1.4234,
"step": 590
},
{
"epoch": 0.24759111855886048,
"grad_norm": 0.8985845446586609,
"learning_rate": 1.901131126937579e-05,
"loss": 1.2981,
"step": 591
},
{
"epoch": 0.24801005446166738,
"grad_norm": 0.6559215784072876,
"learning_rate": 1.900963552576456e-05,
"loss": 1.4871,
"step": 592
},
{
"epoch": 0.24842899036447424,
"grad_norm": 1.0038827657699585,
"learning_rate": 1.900795978215333e-05,
"loss": 1.3604,
"step": 593
},
{
"epoch": 0.2488479262672811,
"grad_norm": 0.670703649520874,
"learning_rate": 1.9006284038542104e-05,
"loss": 1.4142,
"step": 594
},
{
"epoch": 0.24926686217008798,
"grad_norm": 0.6743998527526855,
"learning_rate": 1.9004608294930878e-05,
"loss": 1.421,
"step": 595
},
{
"epoch": 0.24968579807289484,
"grad_norm": 0.90179044008255,
"learning_rate": 1.900293255131965e-05,
"loss": 1.3545,
"step": 596
},
{
"epoch": 0.2501047339757017,
"grad_norm": 0.7004808187484741,
"learning_rate": 1.9001256807708423e-05,
"loss": 1.4334,
"step": 597
},
{
"epoch": 0.2505236698785086,
"grad_norm": 0.811750590801239,
"learning_rate": 1.8999581064097194e-05,
"loss": 1.5068,
"step": 598
},
{
"epoch": 0.25094260578131544,
"grad_norm": 0.5660247206687927,
"learning_rate": 1.8997905320485968e-05,
"loss": 1.4119,
"step": 599
},
{
"epoch": 0.25136154168412234,
"grad_norm": 0.7421470880508423,
"learning_rate": 1.8996229576874742e-05,
"loss": 1.4193,
"step": 600
},
{
"epoch": 0.2517804775869292,
"grad_norm": 0.7964827418327332,
"learning_rate": 1.8994553833263513e-05,
"loss": 1.3664,
"step": 601
},
{
"epoch": 0.25219941348973607,
"grad_norm": 0.587858259677887,
"learning_rate": 1.8992878089652283e-05,
"loss": 1.4094,
"step": 602
},
{
"epoch": 0.25261834939254296,
"grad_norm": 0.8728364109992981,
"learning_rate": 1.8991202346041058e-05,
"loss": 1.2841,
"step": 603
},
{
"epoch": 0.2530372852953498,
"grad_norm": 0.7561309933662415,
"learning_rate": 1.8989526602429832e-05,
"loss": 1.351,
"step": 604
},
{
"epoch": 0.2534562211981567,
"grad_norm": 0.6205732822418213,
"learning_rate": 1.8987850858818602e-05,
"loss": 1.4043,
"step": 605
},
{
"epoch": 0.25387515710096353,
"grad_norm": 0.7390834093093872,
"learning_rate": 1.8986175115207373e-05,
"loss": 1.4274,
"step": 606
},
{
"epoch": 0.2542940930037704,
"grad_norm": 0.7050930261611938,
"learning_rate": 1.8984499371596147e-05,
"loss": 1.4875,
"step": 607
},
{
"epoch": 0.2547130289065773,
"grad_norm": 0.5341117978096008,
"learning_rate": 1.898282362798492e-05,
"loss": 1.4482,
"step": 608
},
{
"epoch": 0.25513196480938416,
"grad_norm": 0.8297886848449707,
"learning_rate": 1.8981147884373692e-05,
"loss": 1.3514,
"step": 609
},
{
"epoch": 0.25555090071219105,
"grad_norm": 0.6667957901954651,
"learning_rate": 1.8979472140762466e-05,
"loss": 1.3185,
"step": 610
},
{
"epoch": 0.2559698366149979,
"grad_norm": 0.6411688923835754,
"learning_rate": 1.8977796397151237e-05,
"loss": 1.4442,
"step": 611
},
{
"epoch": 0.2563887725178048,
"grad_norm": 0.6379684805870056,
"learning_rate": 1.8976120653540008e-05,
"loss": 1.4377,
"step": 612
},
{
"epoch": 0.2568077084206116,
"grad_norm": 0.6216014623641968,
"learning_rate": 1.8974444909928782e-05,
"loss": 1.4474,
"step": 613
},
{
"epoch": 0.2572266443234185,
"grad_norm": 0.5671195983886719,
"learning_rate": 1.8972769166317556e-05,
"loss": 1.4434,
"step": 614
},
{
"epoch": 0.2576455802262254,
"grad_norm": 0.8159009218215942,
"learning_rate": 1.8971093422706327e-05,
"loss": 1.3781,
"step": 615
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.6750363111495972,
"learning_rate": 1.8969417679095098e-05,
"loss": 1.5244,
"step": 616
},
{
"epoch": 0.25848345203183914,
"grad_norm": 0.5345395803451538,
"learning_rate": 1.896774193548387e-05,
"loss": 1.4944,
"step": 617
},
{
"epoch": 0.258902387934646,
"grad_norm": 1.0422483682632446,
"learning_rate": 1.8966066191872646e-05,
"loss": 1.4257,
"step": 618
},
{
"epoch": 0.2593213238374529,
"grad_norm": 0.6644807457923889,
"learning_rate": 1.896439044826142e-05,
"loss": 1.4396,
"step": 619
},
{
"epoch": 0.2597402597402597,
"grad_norm": 0.6991039514541626,
"learning_rate": 1.896271470465019e-05,
"loss": 1.4456,
"step": 620
},
{
"epoch": 0.2601591956430666,
"grad_norm": 0.6287118792533875,
"learning_rate": 1.896103896103896e-05,
"loss": 1.3556,
"step": 621
},
{
"epoch": 0.2605781315458735,
"grad_norm": 0.7149505615234375,
"learning_rate": 1.8959363217427736e-05,
"loss": 1.3708,
"step": 622
},
{
"epoch": 0.26099706744868034,
"grad_norm": 0.6485461592674255,
"learning_rate": 1.895768747381651e-05,
"loss": 1.3813,
"step": 623
},
{
"epoch": 0.26141600335148724,
"grad_norm": 0.796234130859375,
"learning_rate": 1.895601173020528e-05,
"loss": 1.4557,
"step": 624
},
{
"epoch": 0.2618349392542941,
"grad_norm": 0.6059120297431946,
"learning_rate": 1.895433598659405e-05,
"loss": 1.4393,
"step": 625
},
{
"epoch": 0.26225387515710097,
"grad_norm": 0.7328737378120422,
"learning_rate": 1.8952660242982825e-05,
"loss": 1.4259,
"step": 626
},
{
"epoch": 0.2626728110599078,
"grad_norm": 0.5874695181846619,
"learning_rate": 1.89509844993716e-05,
"loss": 1.51,
"step": 627
},
{
"epoch": 0.2630917469627147,
"grad_norm": 0.7244255542755127,
"learning_rate": 1.894930875576037e-05,
"loss": 1.4277,
"step": 628
},
{
"epoch": 0.2635106828655216,
"grad_norm": 0.5406452417373657,
"learning_rate": 1.894763301214914e-05,
"loss": 1.4015,
"step": 629
},
{
"epoch": 0.26392961876832843,
"grad_norm": 0.7187069058418274,
"learning_rate": 1.8945957268537915e-05,
"loss": 1.417,
"step": 630
},
{
"epoch": 0.2643485546711353,
"grad_norm": 0.6457657217979431,
"learning_rate": 1.894428152492669e-05,
"loss": 1.4711,
"step": 631
},
{
"epoch": 0.26476749057394217,
"grad_norm": 0.5307900309562683,
"learning_rate": 1.894260578131546e-05,
"loss": 1.4237,
"step": 632
},
{
"epoch": 0.26518642647674906,
"grad_norm": 0.9521127343177795,
"learning_rate": 1.8940930037704234e-05,
"loss": 1.3934,
"step": 633
},
{
"epoch": 0.26560536237955595,
"grad_norm": 0.596931517124176,
"learning_rate": 1.8939254294093005e-05,
"loss": 1.4957,
"step": 634
},
{
"epoch": 0.2660242982823628,
"grad_norm": 0.8038269281387329,
"learning_rate": 1.893757855048178e-05,
"loss": 1.4551,
"step": 635
},
{
"epoch": 0.2664432341851697,
"grad_norm": 0.7987629771232605,
"learning_rate": 1.893590280687055e-05,
"loss": 1.4973,
"step": 636
},
{
"epoch": 0.2668621700879765,
"grad_norm": 0.7971741557121277,
"learning_rate": 1.8934227063259324e-05,
"loss": 1.331,
"step": 637
},
{
"epoch": 0.2672811059907834,
"grad_norm": 0.6776058077812195,
"learning_rate": 1.8932551319648094e-05,
"loss": 1.3397,
"step": 638
},
{
"epoch": 0.26770004189359026,
"grad_norm": 0.7527434825897217,
"learning_rate": 1.893087557603687e-05,
"loss": 1.3912,
"step": 639
},
{
"epoch": 0.26811897779639715,
"grad_norm": 0.6726639866828918,
"learning_rate": 1.892919983242564e-05,
"loss": 1.3428,
"step": 640
},
{
"epoch": 0.26853791369920404,
"grad_norm": 0.7143589854240417,
"learning_rate": 1.8927524088814413e-05,
"loss": 1.4311,
"step": 641
},
{
"epoch": 0.2689568496020109,
"grad_norm": 0.6544737815856934,
"learning_rate": 1.8925848345203188e-05,
"loss": 1.4774,
"step": 642
},
{
"epoch": 0.2693757855048178,
"grad_norm": 0.707067608833313,
"learning_rate": 1.892417260159196e-05,
"loss": 1.3982,
"step": 643
},
{
"epoch": 0.2697947214076246,
"grad_norm": 0.7015464901924133,
"learning_rate": 1.892249685798073e-05,
"loss": 1.3921,
"step": 644
},
{
"epoch": 0.2702136573104315,
"grad_norm": 0.7379519939422607,
"learning_rate": 1.8920821114369503e-05,
"loss": 1.4429,
"step": 645
},
{
"epoch": 0.27063259321323835,
"grad_norm": 0.6990654468536377,
"learning_rate": 1.8919145370758277e-05,
"loss": 1.3888,
"step": 646
},
{
"epoch": 0.27105152911604524,
"grad_norm": 0.8418135643005371,
"learning_rate": 1.8917469627147048e-05,
"loss": 1.3989,
"step": 647
},
{
"epoch": 0.27147046501885214,
"grad_norm": 0.7323577404022217,
"learning_rate": 1.891579388353582e-05,
"loss": 1.4053,
"step": 648
},
{
"epoch": 0.271889400921659,
"grad_norm": 0.5955424904823303,
"learning_rate": 1.8914118139924593e-05,
"loss": 1.4684,
"step": 649
},
{
"epoch": 0.27230833682446587,
"grad_norm": 0.7422502040863037,
"learning_rate": 1.8912442396313367e-05,
"loss": 1.394,
"step": 650
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.6807714700698853,
"learning_rate": 1.8910766652702138e-05,
"loss": 1.315,
"step": 651
},
{
"epoch": 0.2731462086300796,
"grad_norm": 0.6865885257720947,
"learning_rate": 1.8909090909090912e-05,
"loss": 1.3815,
"step": 652
},
{
"epoch": 0.2735651445328865,
"grad_norm": 0.7024874091148376,
"learning_rate": 1.8907415165479683e-05,
"loss": 1.4215,
"step": 653
},
{
"epoch": 0.27398408043569333,
"grad_norm": 0.7904298305511475,
"learning_rate": 1.8905739421868457e-05,
"loss": 1.3908,
"step": 654
},
{
"epoch": 0.2744030163385002,
"grad_norm": 0.6477106213569641,
"learning_rate": 1.8904063678257228e-05,
"loss": 1.4599,
"step": 655
},
{
"epoch": 0.27482195224130707,
"grad_norm": 0.8508483171463013,
"learning_rate": 1.8902387934646e-05,
"loss": 1.4838,
"step": 656
},
{
"epoch": 0.27524088814411396,
"grad_norm": 0.5711894631385803,
"learning_rate": 1.8900712191034772e-05,
"loss": 1.4615,
"step": 657
},
{
"epoch": 0.2756598240469208,
"grad_norm": 0.6993704438209534,
"learning_rate": 1.8899036447423547e-05,
"loss": 1.4493,
"step": 658
},
{
"epoch": 0.2760787599497277,
"grad_norm": 1.0374524593353271,
"learning_rate": 1.8897360703812317e-05,
"loss": 1.482,
"step": 659
},
{
"epoch": 0.2764976958525346,
"grad_norm": 0.9369934797286987,
"learning_rate": 1.889568496020109e-05,
"loss": 1.3472,
"step": 660
},
{
"epoch": 0.2769166317553414,
"grad_norm": 0.8173794746398926,
"learning_rate": 1.8894009216589862e-05,
"loss": 1.4428,
"step": 661
},
{
"epoch": 0.2773355676581483,
"grad_norm": 1.059898018836975,
"learning_rate": 1.8892333472978636e-05,
"loss": 1.395,
"step": 662
},
{
"epoch": 0.27775450356095516,
"grad_norm": 0.5998404026031494,
"learning_rate": 1.8890657729367407e-05,
"loss": 1.3945,
"step": 663
},
{
"epoch": 0.27817343946376205,
"grad_norm": 0.6833025217056274,
"learning_rate": 1.888898198575618e-05,
"loss": 1.4655,
"step": 664
},
{
"epoch": 0.2785923753665689,
"grad_norm": 0.7907499670982361,
"learning_rate": 1.8887306242144955e-05,
"loss": 1.3141,
"step": 665
},
{
"epoch": 0.2790113112693758,
"grad_norm": 0.6411296129226685,
"learning_rate": 1.8885630498533726e-05,
"loss": 1.507,
"step": 666
},
{
"epoch": 0.2794302471721827,
"grad_norm": 0.8076937794685364,
"learning_rate": 1.8883954754922497e-05,
"loss": 1.3578,
"step": 667
},
{
"epoch": 0.2798491830749895,
"grad_norm": 0.6930973529815674,
"learning_rate": 1.888227901131127e-05,
"loss": 1.3186,
"step": 668
},
{
"epoch": 0.2802681189777964,
"grad_norm": 0.7560698390007019,
"learning_rate": 1.8880603267700045e-05,
"loss": 1.3772,
"step": 669
},
{
"epoch": 0.28068705488060325,
"grad_norm": 0.5842419266700745,
"learning_rate": 1.8878927524088816e-05,
"loss": 1.4772,
"step": 670
},
{
"epoch": 0.28110599078341014,
"grad_norm": 0.9055956602096558,
"learning_rate": 1.8877251780477586e-05,
"loss": 1.4802,
"step": 671
},
{
"epoch": 0.28152492668621704,
"grad_norm": 0.5749624371528625,
"learning_rate": 1.887557603686636e-05,
"loss": 1.345,
"step": 672
},
{
"epoch": 0.2819438625890239,
"grad_norm": 0.6721547245979309,
"learning_rate": 1.8873900293255135e-05,
"loss": 1.3641,
"step": 673
},
{
"epoch": 0.28236279849183077,
"grad_norm": 0.580252468585968,
"learning_rate": 1.8872224549643905e-05,
"loss": 1.4225,
"step": 674
},
{
"epoch": 0.2827817343946376,
"grad_norm": 0.8793032765388489,
"learning_rate": 1.887054880603268e-05,
"loss": 1.3721,
"step": 675
},
{
"epoch": 0.2832006702974445,
"grad_norm": 0.5774890184402466,
"learning_rate": 1.886887306242145e-05,
"loss": 1.4358,
"step": 676
},
{
"epoch": 0.28361960620025134,
"grad_norm": 0.6694848537445068,
"learning_rate": 1.8867197318810224e-05,
"loss": 1.3601,
"step": 677
},
{
"epoch": 0.28403854210305823,
"grad_norm": 0.5719591975212097,
"learning_rate": 1.8865521575198995e-05,
"loss": 1.526,
"step": 678
},
{
"epoch": 0.2844574780058651,
"grad_norm": 0.6238727569580078,
"learning_rate": 1.886384583158777e-05,
"loss": 1.3982,
"step": 679
},
{
"epoch": 0.28487641390867197,
"grad_norm": 0.6695346236228943,
"learning_rate": 1.886217008797654e-05,
"loss": 1.384,
"step": 680
},
{
"epoch": 0.28529534981147886,
"grad_norm": 0.7487378120422363,
"learning_rate": 1.8860494344365314e-05,
"loss": 1.364,
"step": 681
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.6982246041297913,
"learning_rate": 1.8858818600754085e-05,
"loss": 1.4175,
"step": 682
},
{
"epoch": 0.2861332216170926,
"grad_norm": 0.6678481698036194,
"learning_rate": 1.885714285714286e-05,
"loss": 1.3014,
"step": 683
},
{
"epoch": 0.28655215751989943,
"grad_norm": 0.5933229923248291,
"learning_rate": 1.885546711353163e-05,
"loss": 1.4171,
"step": 684
},
{
"epoch": 0.2869710934227063,
"grad_norm": 0.6992403864860535,
"learning_rate": 1.8853791369920404e-05,
"loss": 1.3231,
"step": 685
},
{
"epoch": 0.2873900293255132,
"grad_norm": 0.4942586421966553,
"learning_rate": 1.8852115626309175e-05,
"loss": 1.4685,
"step": 686
},
{
"epoch": 0.28780896522832006,
"grad_norm": 0.6683540940284729,
"learning_rate": 1.885043988269795e-05,
"loss": 1.3826,
"step": 687
},
{
"epoch": 0.28822790113112695,
"grad_norm": 0.4600916802883148,
"learning_rate": 1.8848764139086723e-05,
"loss": 1.4668,
"step": 688
},
{
"epoch": 0.2886468370339338,
"grad_norm": 0.6156571507453918,
"learning_rate": 1.8847088395475494e-05,
"loss": 1.4465,
"step": 689
},
{
"epoch": 0.2890657729367407,
"grad_norm": 0.5449684262275696,
"learning_rate": 1.8845412651864264e-05,
"loss": 1.2964,
"step": 690
},
{
"epoch": 0.2894847088395475,
"grad_norm": 0.5813759565353394,
"learning_rate": 1.884373690825304e-05,
"loss": 1.4211,
"step": 691
},
{
"epoch": 0.2899036447423544,
"grad_norm": 0.618743360042572,
"learning_rate": 1.8842061164641813e-05,
"loss": 1.3976,
"step": 692
},
{
"epoch": 0.2903225806451613,
"grad_norm": 0.5242084264755249,
"learning_rate": 1.8840385421030583e-05,
"loss": 1.4076,
"step": 693
},
{
"epoch": 0.29074151654796815,
"grad_norm": 0.676110029220581,
"learning_rate": 1.8838709677419354e-05,
"loss": 1.3143,
"step": 694
},
{
"epoch": 0.29116045245077504,
"grad_norm": 0.5773074626922607,
"learning_rate": 1.8837033933808128e-05,
"loss": 1.4443,
"step": 695
},
{
"epoch": 0.2915793883535819,
"grad_norm": 0.8076342940330505,
"learning_rate": 1.8835358190196902e-05,
"loss": 1.469,
"step": 696
},
{
"epoch": 0.2919983242563888,
"grad_norm": 0.6035589575767517,
"learning_rate": 1.8833682446585677e-05,
"loss": 1.349,
"step": 697
},
{
"epoch": 0.29241726015919567,
"grad_norm": 1.1530474424362183,
"learning_rate": 1.8832006702974447e-05,
"loss": 1.4056,
"step": 698
},
{
"epoch": 0.2928361960620025,
"grad_norm": 0.6804980635643005,
"learning_rate": 1.8830330959363218e-05,
"loss": 1.45,
"step": 699
},
{
"epoch": 0.2932551319648094,
"grad_norm": 0.8612604141235352,
"learning_rate": 1.8828655215751992e-05,
"loss": 1.4082,
"step": 700
},
{
"epoch": 0.29367406786761624,
"grad_norm": 0.7837015390396118,
"learning_rate": 1.8826979472140766e-05,
"loss": 1.3432,
"step": 701
},
{
"epoch": 0.29409300377042313,
"grad_norm": 1.0070463418960571,
"learning_rate": 1.8825303728529537e-05,
"loss": 1.3149,
"step": 702
},
{
"epoch": 0.29451193967322997,
"grad_norm": 0.8743970990180969,
"learning_rate": 1.8823627984918308e-05,
"loss": 1.2927,
"step": 703
},
{
"epoch": 0.29493087557603687,
"grad_norm": 0.7475900053977966,
"learning_rate": 1.8821952241307082e-05,
"loss": 1.4228,
"step": 704
},
{
"epoch": 0.29534981147884376,
"grad_norm": 0.5288072824478149,
"learning_rate": 1.8820276497695853e-05,
"loss": 1.3062,
"step": 705
},
{
"epoch": 0.2957687473816506,
"grad_norm": 0.718515932559967,
"learning_rate": 1.8818600754084627e-05,
"loss": 1.4571,
"step": 706
},
{
"epoch": 0.2961876832844575,
"grad_norm": 0.6150030493736267,
"learning_rate": 1.8816925010473397e-05,
"loss": 1.4581,
"step": 707
},
{
"epoch": 0.29660661918726433,
"grad_norm": 0.6937044858932495,
"learning_rate": 1.881524926686217e-05,
"loss": 1.3419,
"step": 708
},
{
"epoch": 0.2970255550900712,
"grad_norm": 0.8995934128761292,
"learning_rate": 1.8813573523250942e-05,
"loss": 1.5374,
"step": 709
},
{
"epoch": 0.29744449099287806,
"grad_norm": 0.5294831991195679,
"learning_rate": 1.8811897779639716e-05,
"loss": 1.4163,
"step": 710
},
{
"epoch": 0.29786342689568496,
"grad_norm": 1.1763445138931274,
"learning_rate": 1.881022203602849e-05,
"loss": 1.3632,
"step": 711
},
{
"epoch": 0.29828236279849185,
"grad_norm": 0.6113332509994507,
"learning_rate": 1.880854629241726e-05,
"loss": 1.417,
"step": 712
},
{
"epoch": 0.2987012987012987,
"grad_norm": 0.7323436737060547,
"learning_rate": 1.8806870548806032e-05,
"loss": 1.4475,
"step": 713
},
{
"epoch": 0.2991202346041056,
"grad_norm": 0.6122335195541382,
"learning_rate": 1.8805194805194806e-05,
"loss": 1.4296,
"step": 714
},
{
"epoch": 0.2995391705069124,
"grad_norm": 0.8511880040168762,
"learning_rate": 1.880351906158358e-05,
"loss": 1.2773,
"step": 715
},
{
"epoch": 0.2999581064097193,
"grad_norm": 0.540932297706604,
"learning_rate": 1.880184331797235e-05,
"loss": 1.4429,
"step": 716
},
{
"epoch": 0.3003770423125262,
"grad_norm": 0.7005630731582642,
"learning_rate": 1.8800167574361122e-05,
"loss": 1.3484,
"step": 717
},
{
"epoch": 0.30079597821533305,
"grad_norm": 0.4778623878955841,
"learning_rate": 1.8798491830749896e-05,
"loss": 1.4279,
"step": 718
},
{
"epoch": 0.30121491411813994,
"grad_norm": 0.6638504266738892,
"learning_rate": 1.879681608713867e-05,
"loss": 1.3713,
"step": 719
},
{
"epoch": 0.3016338500209468,
"grad_norm": 0.6170998811721802,
"learning_rate": 1.8795140343527444e-05,
"loss": 1.4593,
"step": 720
},
{
"epoch": 0.3020527859237537,
"grad_norm": 0.8390569686889648,
"learning_rate": 1.8793464599916215e-05,
"loss": 1.3165,
"step": 721
},
{
"epoch": 0.3024717218265605,
"grad_norm": 0.5174708962440491,
"learning_rate": 1.8791788856304986e-05,
"loss": 1.3133,
"step": 722
},
{
"epoch": 0.3028906577293674,
"grad_norm": 0.6793212294578552,
"learning_rate": 1.879011311269376e-05,
"loss": 1.4321,
"step": 723
},
{
"epoch": 0.3033095936321743,
"grad_norm": 0.6248990893363953,
"learning_rate": 1.8788437369082534e-05,
"loss": 1.3265,
"step": 724
},
{
"epoch": 0.30372852953498114,
"grad_norm": 0.6905636191368103,
"learning_rate": 1.8786761625471305e-05,
"loss": 1.4365,
"step": 725
},
{
"epoch": 0.30414746543778803,
"grad_norm": 0.5848572850227356,
"learning_rate": 1.8785085881860075e-05,
"loss": 1.3977,
"step": 726
},
{
"epoch": 0.30456640134059487,
"grad_norm": 0.715691089630127,
"learning_rate": 1.878341013824885e-05,
"loss": 1.43,
"step": 727
},
{
"epoch": 0.30498533724340177,
"grad_norm": 0.5724084377288818,
"learning_rate": 1.8781734394637624e-05,
"loss": 1.4189,
"step": 728
},
{
"epoch": 0.3054042731462086,
"grad_norm": 0.6158417463302612,
"learning_rate": 1.8780058651026394e-05,
"loss": 1.4262,
"step": 729
},
{
"epoch": 0.3058232090490155,
"grad_norm": 0.5914372801780701,
"learning_rate": 1.877838290741517e-05,
"loss": 1.3335,
"step": 730
},
{
"epoch": 0.3062421449518224,
"grad_norm": 0.6226646900177002,
"learning_rate": 1.877670716380394e-05,
"loss": 1.366,
"step": 731
},
{
"epoch": 0.30666108085462923,
"grad_norm": 0.5656031370162964,
"learning_rate": 1.8775031420192713e-05,
"loss": 1.2984,
"step": 732
},
{
"epoch": 0.3070800167574361,
"grad_norm": 0.7473766207695007,
"learning_rate": 1.8773355676581484e-05,
"loss": 1.3788,
"step": 733
},
{
"epoch": 0.30749895266024296,
"grad_norm": 0.686081051826477,
"learning_rate": 1.8771679932970258e-05,
"loss": 1.2504,
"step": 734
},
{
"epoch": 0.30791788856304986,
"grad_norm": 0.6029173135757446,
"learning_rate": 1.877000418935903e-05,
"loss": 1.3931,
"step": 735
},
{
"epoch": 0.30833682446585675,
"grad_norm": 0.761979341506958,
"learning_rate": 1.8768328445747803e-05,
"loss": 1.3497,
"step": 736
},
{
"epoch": 0.3087557603686636,
"grad_norm": 0.6504870057106018,
"learning_rate": 1.8766652702136574e-05,
"loss": 1.4025,
"step": 737
},
{
"epoch": 0.3091746962714705,
"grad_norm": 0.7684826254844666,
"learning_rate": 1.8764976958525348e-05,
"loss": 1.506,
"step": 738
},
{
"epoch": 0.3095936321742773,
"grad_norm": 0.8104509115219116,
"learning_rate": 1.876330121491412e-05,
"loss": 1.4321,
"step": 739
},
{
"epoch": 0.3100125680770842,
"grad_norm": 0.5708764791488647,
"learning_rate": 1.876162547130289e-05,
"loss": 1.3636,
"step": 740
},
{
"epoch": 0.31043150397989105,
"grad_norm": 0.5689902901649475,
"learning_rate": 1.8759949727691664e-05,
"loss": 1.3956,
"step": 741
},
{
"epoch": 0.31085043988269795,
"grad_norm": 0.6868107914924622,
"learning_rate": 1.8758273984080438e-05,
"loss": 1.3395,
"step": 742
},
{
"epoch": 0.31126937578550484,
"grad_norm": 0.6060523390769958,
"learning_rate": 1.8756598240469212e-05,
"loss": 1.369,
"step": 743
},
{
"epoch": 0.3116883116883117,
"grad_norm": 0.8488364219665527,
"learning_rate": 1.8754922496857983e-05,
"loss": 1.3584,
"step": 744
},
{
"epoch": 0.3121072475911186,
"grad_norm": 0.5008958578109741,
"learning_rate": 1.8753246753246753e-05,
"loss": 1.4263,
"step": 745
},
{
"epoch": 0.3125261834939254,
"grad_norm": 0.5795634388923645,
"learning_rate": 1.8751571009635527e-05,
"loss": 1.4323,
"step": 746
},
{
"epoch": 0.3129451193967323,
"grad_norm": 0.5958297848701477,
"learning_rate": 1.87498952660243e-05,
"loss": 1.435,
"step": 747
},
{
"epoch": 0.31336405529953915,
"grad_norm": 0.610854983329773,
"learning_rate": 1.8748219522413072e-05,
"loss": 1.3164,
"step": 748
},
{
"epoch": 0.31378299120234604,
"grad_norm": 0.6233651638031006,
"learning_rate": 1.8746543778801843e-05,
"loss": 1.3757,
"step": 749
},
{
"epoch": 0.31420192710515293,
"grad_norm": 0.6277685165405273,
"learning_rate": 1.8744868035190617e-05,
"loss": 1.3958,
"step": 750
},
{
"epoch": 0.3146208630079598,
"grad_norm": 0.6157351732254028,
"learning_rate": 1.874319229157939e-05,
"loss": 1.3395,
"step": 751
},
{
"epoch": 0.31503979891076667,
"grad_norm": 0.5872085690498352,
"learning_rate": 1.8741516547968162e-05,
"loss": 1.4568,
"step": 752
},
{
"epoch": 0.3154587348135735,
"grad_norm": 0.6481055021286011,
"learning_rate": 1.8739840804356936e-05,
"loss": 1.466,
"step": 753
},
{
"epoch": 0.3158776707163804,
"grad_norm": 0.5024817585945129,
"learning_rate": 1.8738165060745707e-05,
"loss": 1.4321,
"step": 754
},
{
"epoch": 0.31629660661918724,
"grad_norm": 0.817737340927124,
"learning_rate": 1.873648931713448e-05,
"loss": 1.3119,
"step": 755
},
{
"epoch": 0.31671554252199413,
"grad_norm": 0.5730132460594177,
"learning_rate": 1.8734813573523252e-05,
"loss": 1.4508,
"step": 756
},
{
"epoch": 0.317134478424801,
"grad_norm": 0.7577594518661499,
"learning_rate": 1.8733137829912026e-05,
"loss": 1.3521,
"step": 757
},
{
"epoch": 0.31755341432760786,
"grad_norm": 0.6062861680984497,
"learning_rate": 1.8731462086300797e-05,
"loss": 1.384,
"step": 758
},
{
"epoch": 0.31797235023041476,
"grad_norm": 0.7353479266166687,
"learning_rate": 1.872978634268957e-05,
"loss": 1.2669,
"step": 759
},
{
"epoch": 0.3183912861332216,
"grad_norm": 0.7415457963943481,
"learning_rate": 1.872811059907834e-05,
"loss": 1.3057,
"step": 760
},
{
"epoch": 0.3188102220360285,
"grad_norm": 0.7072665691375732,
"learning_rate": 1.8726434855467116e-05,
"loss": 1.3939,
"step": 761
},
{
"epoch": 0.3192291579388354,
"grad_norm": 0.6156973838806152,
"learning_rate": 1.8724759111855886e-05,
"loss": 1.444,
"step": 762
},
{
"epoch": 0.3196480938416422,
"grad_norm": 0.8771636486053467,
"learning_rate": 1.872308336824466e-05,
"loss": 1.3528,
"step": 763
},
{
"epoch": 0.3200670297444491,
"grad_norm": 0.7313105463981628,
"learning_rate": 1.872140762463343e-05,
"loss": 1.403,
"step": 764
},
{
"epoch": 0.32048596564725595,
"grad_norm": 0.8612370491027832,
"learning_rate": 1.8719731881022205e-05,
"loss": 1.2753,
"step": 765
},
{
"epoch": 0.32090490155006285,
"grad_norm": 0.7663282155990601,
"learning_rate": 1.871805613741098e-05,
"loss": 1.2537,
"step": 766
},
{
"epoch": 0.3213238374528697,
"grad_norm": 0.6050071716308594,
"learning_rate": 1.871638039379975e-05,
"loss": 1.2666,
"step": 767
},
{
"epoch": 0.3217427733556766,
"grad_norm": 0.878713846206665,
"learning_rate": 1.871470465018852e-05,
"loss": 1.2472,
"step": 768
},
{
"epoch": 0.3221617092584835,
"grad_norm": 0.5855206251144409,
"learning_rate": 1.8713028906577295e-05,
"loss": 1.3284,
"step": 769
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.6722334623336792,
"learning_rate": 1.871135316296607e-05,
"loss": 1.3594,
"step": 770
},
{
"epoch": 0.3229995810640972,
"grad_norm": 0.5793231725692749,
"learning_rate": 1.870967741935484e-05,
"loss": 1.3275,
"step": 771
},
{
"epoch": 0.32341851696690405,
"grad_norm": 0.604360818862915,
"learning_rate": 1.870800167574361e-05,
"loss": 1.385,
"step": 772
},
{
"epoch": 0.32383745286971094,
"grad_norm": 0.6285070776939392,
"learning_rate": 1.8706325932132385e-05,
"loss": 1.3477,
"step": 773
},
{
"epoch": 0.3242563887725178,
"grad_norm": 0.637465238571167,
"learning_rate": 1.870465018852116e-05,
"loss": 1.3904,
"step": 774
},
{
"epoch": 0.3246753246753247,
"grad_norm": 0.6604148149490356,
"learning_rate": 1.870297444490993e-05,
"loss": 1.4409,
"step": 775
},
{
"epoch": 0.32509426057813157,
"grad_norm": 0.5733173489570618,
"learning_rate": 1.8701298701298704e-05,
"loss": 1.4067,
"step": 776
},
{
"epoch": 0.3255131964809384,
"grad_norm": 0.6333217620849609,
"learning_rate": 1.8699622957687475e-05,
"loss": 1.3755,
"step": 777
},
{
"epoch": 0.3259321323837453,
"grad_norm": 0.5488452315330505,
"learning_rate": 1.869794721407625e-05,
"loss": 1.3702,
"step": 778
},
{
"epoch": 0.32635106828655214,
"grad_norm": 0.501512885093689,
"learning_rate": 1.869627147046502e-05,
"loss": 1.354,
"step": 779
},
{
"epoch": 0.32677000418935903,
"grad_norm": 0.6487094163894653,
"learning_rate": 1.8694595726853794e-05,
"loss": 1.436,
"step": 780
},
{
"epoch": 0.3271889400921659,
"grad_norm": 0.6414726972579956,
"learning_rate": 1.8692919983242564e-05,
"loss": 1.3531,
"step": 781
},
{
"epoch": 0.32760787599497276,
"grad_norm": 0.6292533278465271,
"learning_rate": 1.869124423963134e-05,
"loss": 1.3462,
"step": 782
},
{
"epoch": 0.32802681189777966,
"grad_norm": 0.593009889125824,
"learning_rate": 1.868956849602011e-05,
"loss": 1.3921,
"step": 783
},
{
"epoch": 0.3284457478005865,
"grad_norm": 0.8031853437423706,
"learning_rate": 1.8687892752408883e-05,
"loss": 1.3397,
"step": 784
},
{
"epoch": 0.3288646837033934,
"grad_norm": 0.6368310451507568,
"learning_rate": 1.8686217008797654e-05,
"loss": 1.3801,
"step": 785
},
{
"epoch": 0.32928361960620023,
"grad_norm": 0.540608286857605,
"learning_rate": 1.8684541265186428e-05,
"loss": 1.4328,
"step": 786
},
{
"epoch": 0.3297025555090071,
"grad_norm": 0.6946301460266113,
"learning_rate": 1.86828655215752e-05,
"loss": 1.3398,
"step": 787
},
{
"epoch": 0.330121491411814,
"grad_norm": 0.57708340883255,
"learning_rate": 1.8681189777963973e-05,
"loss": 1.3537,
"step": 788
},
{
"epoch": 0.33054042731462085,
"grad_norm": 0.647994875907898,
"learning_rate": 1.8679514034352747e-05,
"loss": 1.4367,
"step": 789
},
{
"epoch": 0.33095936321742775,
"grad_norm": 0.7245671153068542,
"learning_rate": 1.8677838290741518e-05,
"loss": 1.4373,
"step": 790
},
{
"epoch": 0.3313782991202346,
"grad_norm": 0.5863420963287354,
"learning_rate": 1.867616254713029e-05,
"loss": 1.3749,
"step": 791
},
{
"epoch": 0.3317972350230415,
"grad_norm": 0.5827834010124207,
"learning_rate": 1.8674486803519063e-05,
"loss": 1.4547,
"step": 792
},
{
"epoch": 0.3322161709258483,
"grad_norm": 0.61444091796875,
"learning_rate": 1.8672811059907837e-05,
"loss": 1.4826,
"step": 793
},
{
"epoch": 0.3326351068286552,
"grad_norm": 0.7494825124740601,
"learning_rate": 1.8671135316296608e-05,
"loss": 1.3186,
"step": 794
},
{
"epoch": 0.3330540427314621,
"grad_norm": 0.708984375,
"learning_rate": 1.866945957268538e-05,
"loss": 1.3216,
"step": 795
},
{
"epoch": 0.33347297863426895,
"grad_norm": 0.7570633888244629,
"learning_rate": 1.8667783829074153e-05,
"loss": 1.2755,
"step": 796
},
{
"epoch": 0.33389191453707584,
"grad_norm": 0.6864930987358093,
"learning_rate": 1.8666108085462927e-05,
"loss": 1.5026,
"step": 797
},
{
"epoch": 0.3343108504398827,
"grad_norm": 0.7167800068855286,
"learning_rate": 1.86644323418517e-05,
"loss": 1.417,
"step": 798
},
{
"epoch": 0.3347297863426896,
"grad_norm": 0.6132422089576721,
"learning_rate": 1.866275659824047e-05,
"loss": 1.3885,
"step": 799
},
{
"epoch": 0.33514872224549647,
"grad_norm": 0.496698796749115,
"learning_rate": 1.8661080854629242e-05,
"loss": 1.3729,
"step": 800
},
{
"epoch": 0.3355676581483033,
"grad_norm": 0.718532383441925,
"learning_rate": 1.8659405111018016e-05,
"loss": 1.391,
"step": 801
},
{
"epoch": 0.3359865940511102,
"grad_norm": 0.7083394527435303,
"learning_rate": 1.8657729367406787e-05,
"loss": 1.3375,
"step": 802
},
{
"epoch": 0.33640552995391704,
"grad_norm": 0.6132792830467224,
"learning_rate": 1.865605362379556e-05,
"loss": 1.4201,
"step": 803
},
{
"epoch": 0.33682446585672393,
"grad_norm": 0.7242376804351807,
"learning_rate": 1.8654377880184332e-05,
"loss": 1.4025,
"step": 804
},
{
"epoch": 0.33724340175953077,
"grad_norm": 0.6561216711997986,
"learning_rate": 1.8652702136573106e-05,
"loss": 1.2498,
"step": 805
},
{
"epoch": 0.33766233766233766,
"grad_norm": 0.6856805086135864,
"learning_rate": 1.8651026392961877e-05,
"loss": 1.3054,
"step": 806
},
{
"epoch": 0.33808127356514456,
"grad_norm": 0.7083545923233032,
"learning_rate": 1.864935064935065e-05,
"loss": 1.4073,
"step": 807
},
{
"epoch": 0.3385002094679514,
"grad_norm": 0.6021366119384766,
"learning_rate": 1.8647674905739425e-05,
"loss": 1.4361,
"step": 808
},
{
"epoch": 0.3389191453707583,
"grad_norm": 0.5841922760009766,
"learning_rate": 1.8645999162128196e-05,
"loss": 1.4091,
"step": 809
},
{
"epoch": 0.33933808127356513,
"grad_norm": 0.7605058550834656,
"learning_rate": 1.8644323418516967e-05,
"loss": 1.4554,
"step": 810
},
{
"epoch": 0.339757017176372,
"grad_norm": 0.6625474095344543,
"learning_rate": 1.864264767490574e-05,
"loss": 1.3343,
"step": 811
},
{
"epoch": 0.34017595307917886,
"grad_norm": 0.5613833665847778,
"learning_rate": 1.8640971931294515e-05,
"loss": 1.3195,
"step": 812
},
{
"epoch": 0.34059488898198576,
"grad_norm": 0.7579832673072815,
"learning_rate": 1.8639296187683286e-05,
"loss": 1.2687,
"step": 813
},
{
"epoch": 0.34101382488479265,
"grad_norm": 0.6240825653076172,
"learning_rate": 1.8637620444072056e-05,
"loss": 1.3975,
"step": 814
},
{
"epoch": 0.3414327607875995,
"grad_norm": 0.669330894947052,
"learning_rate": 1.863594470046083e-05,
"loss": 1.3588,
"step": 815
},
{
"epoch": 0.3418516966904064,
"grad_norm": 1.4403901100158691,
"learning_rate": 1.8634268956849605e-05,
"loss": 1.3911,
"step": 816
},
{
"epoch": 0.3422706325932132,
"grad_norm": 0.5918375253677368,
"learning_rate": 1.8632593213238375e-05,
"loss": 1.4409,
"step": 817
},
{
"epoch": 0.3426895684960201,
"grad_norm": 0.8068619966506958,
"learning_rate": 1.8630917469627146e-05,
"loss": 1.373,
"step": 818
},
{
"epoch": 0.34310850439882695,
"grad_norm": 0.6585489511489868,
"learning_rate": 1.862924172601592e-05,
"loss": 1.3883,
"step": 819
},
{
"epoch": 0.34352744030163385,
"grad_norm": 0.7592496275901794,
"learning_rate": 1.8627565982404694e-05,
"loss": 1.274,
"step": 820
},
{
"epoch": 0.34394637620444074,
"grad_norm": 0.7368906736373901,
"learning_rate": 1.862589023879347e-05,
"loss": 1.3069,
"step": 821
},
{
"epoch": 0.3443653121072476,
"grad_norm": 1.1015337705612183,
"learning_rate": 1.862421449518224e-05,
"loss": 1.359,
"step": 822
},
{
"epoch": 0.3447842480100545,
"grad_norm": 1.0174856185913086,
"learning_rate": 1.862253875157101e-05,
"loss": 1.3323,
"step": 823
},
{
"epoch": 0.3452031839128613,
"grad_norm": 0.5378293991088867,
"learning_rate": 1.8620863007959784e-05,
"loss": 1.363,
"step": 824
},
{
"epoch": 0.3456221198156682,
"grad_norm": 0.5914802551269531,
"learning_rate": 1.8619187264348558e-05,
"loss": 1.4108,
"step": 825
},
{
"epoch": 0.3460410557184751,
"grad_norm": 0.7762312889099121,
"learning_rate": 1.861751152073733e-05,
"loss": 1.5178,
"step": 826
},
{
"epoch": 0.34645999162128194,
"grad_norm": 0.5453881025314331,
"learning_rate": 1.86158357771261e-05,
"loss": 1.2591,
"step": 827
},
{
"epoch": 0.34687892752408883,
"grad_norm": 0.8273909091949463,
"learning_rate": 1.8614160033514874e-05,
"loss": 1.4152,
"step": 828
},
{
"epoch": 0.34729786342689567,
"grad_norm": 0.619732677936554,
"learning_rate": 1.8612484289903648e-05,
"loss": 1.3843,
"step": 829
},
{
"epoch": 0.34771679932970256,
"grad_norm": 0.8301693797111511,
"learning_rate": 1.861080854629242e-05,
"loss": 1.3134,
"step": 830
},
{
"epoch": 0.3481357352325094,
"grad_norm": 0.5382381081581116,
"learning_rate": 1.8609132802681193e-05,
"loss": 1.3369,
"step": 831
},
{
"epoch": 0.3485546711353163,
"grad_norm": 0.6819462776184082,
"learning_rate": 1.8607457059069964e-05,
"loss": 1.3153,
"step": 832
},
{
"epoch": 0.3489736070381232,
"grad_norm": 0.5813642144203186,
"learning_rate": 1.8605781315458734e-05,
"loss": 1.3567,
"step": 833
},
{
"epoch": 0.34939254294093003,
"grad_norm": 0.5303306579589844,
"learning_rate": 1.860410557184751e-05,
"loss": 1.3297,
"step": 834
},
{
"epoch": 0.3498114788437369,
"grad_norm": 0.7988172769546509,
"learning_rate": 1.8602429828236283e-05,
"loss": 1.3171,
"step": 835
},
{
"epoch": 0.35023041474654376,
"grad_norm": 0.6038556694984436,
"learning_rate": 1.8600754084625053e-05,
"loss": 1.4674,
"step": 836
},
{
"epoch": 0.35064935064935066,
"grad_norm": 0.7679703235626221,
"learning_rate": 1.8599078341013824e-05,
"loss": 1.3359,
"step": 837
},
{
"epoch": 0.3510682865521575,
"grad_norm": 0.5809141993522644,
"learning_rate": 1.8597402597402598e-05,
"loss": 1.3567,
"step": 838
},
{
"epoch": 0.3514872224549644,
"grad_norm": 0.7988621592521667,
"learning_rate": 1.8595726853791372e-05,
"loss": 1.3453,
"step": 839
},
{
"epoch": 0.3519061583577713,
"grad_norm": 0.5147368907928467,
"learning_rate": 1.8594051110180143e-05,
"loss": 1.4187,
"step": 840
},
{
"epoch": 0.3523250942605781,
"grad_norm": 0.8276616334915161,
"learning_rate": 1.8592375366568917e-05,
"loss": 1.3438,
"step": 841
},
{
"epoch": 0.352744030163385,
"grad_norm": 0.630969762802124,
"learning_rate": 1.8590699622957688e-05,
"loss": 1.3271,
"step": 842
},
{
"epoch": 0.35316296606619185,
"grad_norm": 0.823090136051178,
"learning_rate": 1.8589023879346462e-05,
"loss": 1.3266,
"step": 843
},
{
"epoch": 0.35358190196899875,
"grad_norm": 0.6593233346939087,
"learning_rate": 1.8587348135735236e-05,
"loss": 1.2582,
"step": 844
},
{
"epoch": 0.35400083787180564,
"grad_norm": 0.5821310877799988,
"learning_rate": 1.8585672392124007e-05,
"loss": 1.3739,
"step": 845
},
{
"epoch": 0.3544197737746125,
"grad_norm": 0.6901243329048157,
"learning_rate": 1.8583996648512778e-05,
"loss": 1.4388,
"step": 846
},
{
"epoch": 0.3548387096774194,
"grad_norm": 0.521477460861206,
"learning_rate": 1.8582320904901552e-05,
"loss": 1.4263,
"step": 847
},
{
"epoch": 0.3552576455802262,
"grad_norm": 0.6898438930511475,
"learning_rate": 1.8580645161290326e-05,
"loss": 1.3508,
"step": 848
},
{
"epoch": 0.3556765814830331,
"grad_norm": 0.7993562817573547,
"learning_rate": 1.8578969417679097e-05,
"loss": 1.2651,
"step": 849
},
{
"epoch": 0.35609551738583994,
"grad_norm": 0.5602364540100098,
"learning_rate": 1.8577293674067867e-05,
"loss": 1.3954,
"step": 850
},
{
"epoch": 0.35651445328864684,
"grad_norm": 1.204590082168579,
"learning_rate": 1.857561793045664e-05,
"loss": 1.2695,
"step": 851
},
{
"epoch": 0.35693338919145373,
"grad_norm": 0.7635987997055054,
"learning_rate": 1.8573942186845416e-05,
"loss": 1.5585,
"step": 852
},
{
"epoch": 0.35735232509426057,
"grad_norm": 0.9088107943534851,
"learning_rate": 1.8572266443234186e-05,
"loss": 1.4667,
"step": 853
},
{
"epoch": 0.35777126099706746,
"grad_norm": 0.9393168687820435,
"learning_rate": 1.857059069962296e-05,
"loss": 1.4196,
"step": 854
},
{
"epoch": 0.3581901968998743,
"grad_norm": 0.7522462606430054,
"learning_rate": 1.856891495601173e-05,
"loss": 1.3734,
"step": 855
},
{
"epoch": 0.3586091328026812,
"grad_norm": 0.868783712387085,
"learning_rate": 1.8567239212400505e-05,
"loss": 1.3904,
"step": 856
},
{
"epoch": 0.35902806870548803,
"grad_norm": 0.721829354763031,
"learning_rate": 1.8565563468789276e-05,
"loss": 1.3377,
"step": 857
},
{
"epoch": 0.35944700460829493,
"grad_norm": 0.6955874562263489,
"learning_rate": 1.856388772517805e-05,
"loss": 1.3995,
"step": 858
},
{
"epoch": 0.3598659405111018,
"grad_norm": 0.7859539985656738,
"learning_rate": 1.856221198156682e-05,
"loss": 1.4676,
"step": 859
},
{
"epoch": 0.36028487641390866,
"grad_norm": 0.8676955699920654,
"learning_rate": 1.8560536237955595e-05,
"loss": 1.3937,
"step": 860
},
{
"epoch": 0.36070381231671556,
"grad_norm": 0.8304063081741333,
"learning_rate": 1.8558860494344366e-05,
"loss": 1.3601,
"step": 861
},
{
"epoch": 0.3611227482195224,
"grad_norm": 0.6992982029914856,
"learning_rate": 1.855718475073314e-05,
"loss": 1.3896,
"step": 862
},
{
"epoch": 0.3615416841223293,
"grad_norm": 0.7726747989654541,
"learning_rate": 1.855550900712191e-05,
"loss": 1.3639,
"step": 863
},
{
"epoch": 0.3619606200251362,
"grad_norm": 0.6171780228614807,
"learning_rate": 1.8553833263510685e-05,
"loss": 1.447,
"step": 864
},
{
"epoch": 0.362379555927943,
"grad_norm": 0.6462947130203247,
"learning_rate": 1.8552157519899456e-05,
"loss": 1.4144,
"step": 865
},
{
"epoch": 0.3627984918307499,
"grad_norm": 0.6577037572860718,
"learning_rate": 1.855048177628823e-05,
"loss": 1.3563,
"step": 866
},
{
"epoch": 0.36321742773355675,
"grad_norm": 0.633479118347168,
"learning_rate": 1.8548806032677004e-05,
"loss": 1.4013,
"step": 867
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.5869176387786865,
"learning_rate": 1.8547130289065775e-05,
"loss": 1.2304,
"step": 868
},
{
"epoch": 0.3640552995391705,
"grad_norm": 0.9644981026649475,
"learning_rate": 1.8545454545454545e-05,
"loss": 1.4055,
"step": 869
},
{
"epoch": 0.3644742354419774,
"grad_norm": 0.4988062381744385,
"learning_rate": 1.854377880184332e-05,
"loss": 1.3402,
"step": 870
},
{
"epoch": 0.3648931713447843,
"grad_norm": 0.6213245391845703,
"learning_rate": 1.8542103058232094e-05,
"loss": 1.3781,
"step": 871
},
{
"epoch": 0.3653121072475911,
"grad_norm": 1.2338489294052124,
"learning_rate": 1.8540427314620864e-05,
"loss": 1.3705,
"step": 872
},
{
"epoch": 0.365731043150398,
"grad_norm": 0.642489492893219,
"learning_rate": 1.8538751571009635e-05,
"loss": 1.302,
"step": 873
},
{
"epoch": 0.36614997905320484,
"grad_norm": 0.5393372178077698,
"learning_rate": 1.853707582739841e-05,
"loss": 1.3932,
"step": 874
},
{
"epoch": 0.36656891495601174,
"grad_norm": 0.702558696269989,
"learning_rate": 1.8535400083787183e-05,
"loss": 1.3118,
"step": 875
},
{
"epoch": 0.3669878508588186,
"grad_norm": 0.5150250792503357,
"learning_rate": 1.8533724340175954e-05,
"loss": 1.3945,
"step": 876
},
{
"epoch": 0.36740678676162547,
"grad_norm": 0.5648869872093201,
"learning_rate": 1.8532048596564728e-05,
"loss": 1.2966,
"step": 877
},
{
"epoch": 0.36782572266443236,
"grad_norm": 0.507340133190155,
"learning_rate": 1.85303728529535e-05,
"loss": 1.3102,
"step": 878
},
{
"epoch": 0.3682446585672392,
"grad_norm": 0.4831449091434479,
"learning_rate": 1.8528697109342273e-05,
"loss": 1.4016,
"step": 879
},
{
"epoch": 0.3686635944700461,
"grad_norm": 0.5660369396209717,
"learning_rate": 1.8527021365731044e-05,
"loss": 1.3923,
"step": 880
},
{
"epoch": 0.36908253037285293,
"grad_norm": 0.5805203318595886,
"learning_rate": 1.8525345622119818e-05,
"loss": 1.3403,
"step": 881
},
{
"epoch": 0.36950146627565983,
"grad_norm": 0.5939772129058838,
"learning_rate": 1.852366987850859e-05,
"loss": 1.433,
"step": 882
},
{
"epoch": 0.36992040217846667,
"grad_norm": 0.5323980450630188,
"learning_rate": 1.8521994134897363e-05,
"loss": 1.3378,
"step": 883
},
{
"epoch": 0.37033933808127356,
"grad_norm": 0.6497474312782288,
"learning_rate": 1.8520318391286134e-05,
"loss": 1.3351,
"step": 884
},
{
"epoch": 0.37075827398408046,
"grad_norm": 0.564187228679657,
"learning_rate": 1.8518642647674908e-05,
"loss": 1.3354,
"step": 885
},
{
"epoch": 0.3711772098868873,
"grad_norm": 0.5568170547485352,
"learning_rate": 1.8516966904063682e-05,
"loss": 1.3902,
"step": 886
},
{
"epoch": 0.3715961457896942,
"grad_norm": 0.635718584060669,
"learning_rate": 1.8515291160452453e-05,
"loss": 1.2659,
"step": 887
},
{
"epoch": 0.372015081692501,
"grad_norm": 0.5702401399612427,
"learning_rate": 1.8513615416841223e-05,
"loss": 1.3953,
"step": 888
},
{
"epoch": 0.3724340175953079,
"grad_norm": 0.9754594564437866,
"learning_rate": 1.8511939673229997e-05,
"loss": 1.523,
"step": 889
},
{
"epoch": 0.3728529534981148,
"grad_norm": 0.6205545663833618,
"learning_rate": 1.851026392961877e-05,
"loss": 1.4438,
"step": 890
},
{
"epoch": 0.37327188940092165,
"grad_norm": 0.6615620255470276,
"learning_rate": 1.8508588186007542e-05,
"loss": 1.3249,
"step": 891
},
{
"epoch": 0.37369082530372855,
"grad_norm": 1.1198056936264038,
"learning_rate": 1.8506912442396313e-05,
"loss": 1.3237,
"step": 892
},
{
"epoch": 0.3741097612065354,
"grad_norm": 0.4639185070991516,
"learning_rate": 1.8505236698785087e-05,
"loss": 1.3922,
"step": 893
},
{
"epoch": 0.3745286971093423,
"grad_norm": 0.9922068119049072,
"learning_rate": 1.850356095517386e-05,
"loss": 1.4176,
"step": 894
},
{
"epoch": 0.3749476330121491,
"grad_norm": 0.6543740034103394,
"learning_rate": 1.8501885211562632e-05,
"loss": 1.3416,
"step": 895
},
{
"epoch": 0.375366568914956,
"grad_norm": 0.6506433486938477,
"learning_rate": 1.8500209467951403e-05,
"loss": 1.3429,
"step": 896
},
{
"epoch": 0.3757855048177629,
"grad_norm": 0.6528995633125305,
"learning_rate": 1.8498533724340177e-05,
"loss": 1.3,
"step": 897
},
{
"epoch": 0.37620444072056974,
"grad_norm": 0.5251516103744507,
"learning_rate": 1.849685798072895e-05,
"loss": 1.3076,
"step": 898
},
{
"epoch": 0.37662337662337664,
"grad_norm": 0.6646214723587036,
"learning_rate": 1.8495182237117722e-05,
"loss": 1.3381,
"step": 899
},
{
"epoch": 0.3770423125261835,
"grad_norm": 0.7112960815429688,
"learning_rate": 1.8493506493506496e-05,
"loss": 1.342,
"step": 900
}
],
"logging_steps": 1.0,
"max_steps": 11935,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.8466193635540992e+18,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}