Ego-R1-SFT-3B / trainer_state.json
shulin16's picture
Upload folder using huggingface_hub
4d97764 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.8553299492385786,
"eval_steps": 500,
"global_step": 4500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006345177664974619,
"grad_norm": 12.304139137268066,
"learning_rate": 1.9027484143763215e-07,
"loss": 1.4092,
"step": 10
},
{
"epoch": 0.012690355329949238,
"grad_norm": 10.735240936279297,
"learning_rate": 4.0169133192389007e-07,
"loss": 1.3444,
"step": 20
},
{
"epoch": 0.01903553299492386,
"grad_norm": 4.4380784034729,
"learning_rate": 6.131078224101481e-07,
"loss": 1.2567,
"step": 30
},
{
"epoch": 0.025380710659898477,
"grad_norm": 3.0971062183380127,
"learning_rate": 8.245243128964061e-07,
"loss": 1.2201,
"step": 40
},
{
"epoch": 0.031725888324873094,
"grad_norm": 2.3528785705566406,
"learning_rate": 1.0359408033826639e-06,
"loss": 1.1005,
"step": 50
},
{
"epoch": 0.03807106598984772,
"grad_norm": 1.9325449466705322,
"learning_rate": 1.2473572938689219e-06,
"loss": 1.0258,
"step": 60
},
{
"epoch": 0.044416243654822336,
"grad_norm": 1.9481005668640137,
"learning_rate": 1.4587737843551796e-06,
"loss": 0.9549,
"step": 70
},
{
"epoch": 0.050761421319796954,
"grad_norm": 1.3744746446609497,
"learning_rate": 1.6701902748414379e-06,
"loss": 0.9397,
"step": 80
},
{
"epoch": 0.05710659898477157,
"grad_norm": 1.3208822011947632,
"learning_rate": 1.8816067653276956e-06,
"loss": 0.9581,
"step": 90
},
{
"epoch": 0.06345177664974619,
"grad_norm": 1.578454613685608,
"learning_rate": 2.0930232558139536e-06,
"loss": 0.8835,
"step": 100
},
{
"epoch": 0.06979695431472081,
"grad_norm": 1.7314599752426147,
"learning_rate": 2.3044397463002116e-06,
"loss": 0.877,
"step": 110
},
{
"epoch": 0.07614213197969544,
"grad_norm": 1.690652847290039,
"learning_rate": 2.5158562367864696e-06,
"loss": 0.8674,
"step": 120
},
{
"epoch": 0.08248730964467005,
"grad_norm": 1.4886319637298584,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.8124,
"step": 130
},
{
"epoch": 0.08883248730964467,
"grad_norm": 1.5932313203811646,
"learning_rate": 2.9386892177589852e-06,
"loss": 0.8825,
"step": 140
},
{
"epoch": 0.09517766497461928,
"grad_norm": 1.7353770732879639,
"learning_rate": 3.1501057082452436e-06,
"loss": 0.8381,
"step": 150
},
{
"epoch": 0.10152284263959391,
"grad_norm": 1.5052095651626587,
"learning_rate": 3.3615221987315012e-06,
"loss": 0.8094,
"step": 160
},
{
"epoch": 0.10786802030456853,
"grad_norm": 1.5068026781082153,
"learning_rate": 3.5729386892177592e-06,
"loss": 0.8088,
"step": 170
},
{
"epoch": 0.11421319796954314,
"grad_norm": 1.3972314596176147,
"learning_rate": 3.7843551797040172e-06,
"loss": 0.7807,
"step": 180
},
{
"epoch": 0.12055837563451777,
"grad_norm": 1.4561253786087036,
"learning_rate": 3.995771670190275e-06,
"loss": 0.751,
"step": 190
},
{
"epoch": 0.12690355329949238,
"grad_norm": 1.1900990009307861,
"learning_rate": 4.207188160676533e-06,
"loss": 0.7526,
"step": 200
},
{
"epoch": 0.13324873096446702,
"grad_norm": 1.2069578170776367,
"learning_rate": 4.418604651162791e-06,
"loss": 0.737,
"step": 210
},
{
"epoch": 0.13959390862944163,
"grad_norm": 1.3006811141967773,
"learning_rate": 4.630021141649049e-06,
"loss": 0.757,
"step": 220
},
{
"epoch": 0.14593908629441624,
"grad_norm": 1.1366584300994873,
"learning_rate": 4.841437632135307e-06,
"loss": 0.7355,
"step": 230
},
{
"epoch": 0.15228426395939088,
"grad_norm": 1.0923043489456177,
"learning_rate": 5.052854122621564e-06,
"loss": 0.7273,
"step": 240
},
{
"epoch": 0.15862944162436549,
"grad_norm": 1.1340067386627197,
"learning_rate": 5.264270613107823e-06,
"loss": 0.7093,
"step": 250
},
{
"epoch": 0.1649746192893401,
"grad_norm": 1.0045281648635864,
"learning_rate": 5.47568710359408e-06,
"loss": 0.709,
"step": 260
},
{
"epoch": 0.1713197969543147,
"grad_norm": 1.3080400228500366,
"learning_rate": 5.687103594080339e-06,
"loss": 0.7142,
"step": 270
},
{
"epoch": 0.17766497461928935,
"grad_norm": 1.4830659627914429,
"learning_rate": 5.898520084566597e-06,
"loss": 0.7233,
"step": 280
},
{
"epoch": 0.18401015228426396,
"grad_norm": 1.295798897743225,
"learning_rate": 6.109936575052855e-06,
"loss": 0.7254,
"step": 290
},
{
"epoch": 0.19035532994923857,
"grad_norm": 1.1951725482940674,
"learning_rate": 6.321353065539113e-06,
"loss": 0.7008,
"step": 300
},
{
"epoch": 0.1967005076142132,
"grad_norm": 1.1962999105453491,
"learning_rate": 6.53276955602537e-06,
"loss": 0.6697,
"step": 310
},
{
"epoch": 0.20304568527918782,
"grad_norm": 1.0768781900405884,
"learning_rate": 6.744186046511628e-06,
"loss": 0.6688,
"step": 320
},
{
"epoch": 0.20939086294416243,
"grad_norm": 1.2655526399612427,
"learning_rate": 6.955602536997886e-06,
"loss": 0.7098,
"step": 330
},
{
"epoch": 0.21573604060913706,
"grad_norm": 1.1732734441757202,
"learning_rate": 7.167019027484144e-06,
"loss": 0.6961,
"step": 340
},
{
"epoch": 0.22208121827411167,
"grad_norm": 1.4146960973739624,
"learning_rate": 7.378435517970403e-06,
"loss": 0.6581,
"step": 350
},
{
"epoch": 0.22842639593908629,
"grad_norm": 1.0180368423461914,
"learning_rate": 7.58985200845666e-06,
"loss": 0.636,
"step": 360
},
{
"epoch": 0.23477157360406092,
"grad_norm": 1.1763561964035034,
"learning_rate": 7.801268498942918e-06,
"loss": 0.6695,
"step": 370
},
{
"epoch": 0.24111675126903553,
"grad_norm": 1.120521068572998,
"learning_rate": 8.012684989429176e-06,
"loss": 0.6658,
"step": 380
},
{
"epoch": 0.24746192893401014,
"grad_norm": 1.070609450340271,
"learning_rate": 8.224101479915433e-06,
"loss": 0.6528,
"step": 390
},
{
"epoch": 0.25380710659898476,
"grad_norm": 1.404994249343872,
"learning_rate": 8.435517970401692e-06,
"loss": 0.6525,
"step": 400
},
{
"epoch": 0.26015228426395937,
"grad_norm": 1.3568419218063354,
"learning_rate": 8.64693446088795e-06,
"loss": 0.6525,
"step": 410
},
{
"epoch": 0.26649746192893403,
"grad_norm": 1.3468185663223267,
"learning_rate": 8.858350951374208e-06,
"loss": 0.641,
"step": 420
},
{
"epoch": 0.27284263959390864,
"grad_norm": 1.0951420068740845,
"learning_rate": 9.069767441860465e-06,
"loss": 0.6453,
"step": 430
},
{
"epoch": 0.27918781725888325,
"grad_norm": 1.030259370803833,
"learning_rate": 9.281183932346723e-06,
"loss": 0.6138,
"step": 440
},
{
"epoch": 0.28553299492385786,
"grad_norm": 1.1757938861846924,
"learning_rate": 9.492600422832982e-06,
"loss": 0.6787,
"step": 450
},
{
"epoch": 0.2918781725888325,
"grad_norm": 1.3138433694839478,
"learning_rate": 9.70401691331924e-06,
"loss": 0.6633,
"step": 460
},
{
"epoch": 0.2982233502538071,
"grad_norm": 1.3092707395553589,
"learning_rate": 9.915433403805497e-06,
"loss": 0.6432,
"step": 470
},
{
"epoch": 0.30456852791878175,
"grad_norm": 1.2927078008651733,
"learning_rate": 9.999950938319974e-06,
"loss": 0.6266,
"step": 480
},
{
"epoch": 0.31091370558375636,
"grad_norm": 1.33150053024292,
"learning_rate": 9.999651120428776e-06,
"loss": 0.6427,
"step": 490
},
{
"epoch": 0.31725888324873097,
"grad_norm": 1.2657496929168701,
"learning_rate": 9.999078757459388e-06,
"loss": 0.6457,
"step": 500
},
{
"epoch": 0.3236040609137056,
"grad_norm": 1.6883960962295532,
"learning_rate": 9.998233880612932e-06,
"loss": 0.6137,
"step": 510
},
{
"epoch": 0.3299492385786802,
"grad_norm": 0.9815077781677246,
"learning_rate": 9.997116535946028e-06,
"loss": 0.6069,
"step": 520
},
{
"epoch": 0.3362944162436548,
"grad_norm": 1.3186026811599731,
"learning_rate": 9.99572678436828e-06,
"loss": 0.6024,
"step": 530
},
{
"epoch": 0.3426395939086294,
"grad_norm": 1.6290111541748047,
"learning_rate": 9.994064701638969e-06,
"loss": 0.6273,
"step": 540
},
{
"epoch": 0.3489847715736041,
"grad_norm": 1.3211804628372192,
"learning_rate": 9.992130378362908e-06,
"loss": 0.6068,
"step": 550
},
{
"epoch": 0.3553299492385787,
"grad_norm": 1.619232177734375,
"learning_rate": 9.989923919985512e-06,
"loss": 0.612,
"step": 560
},
{
"epoch": 0.3616751269035533,
"grad_norm": 1.0001276731491089,
"learning_rate": 9.987445446787049e-06,
"loss": 0.5687,
"step": 570
},
{
"epoch": 0.3680203045685279,
"grad_norm": 1.2668827772140503,
"learning_rate": 9.984695093876081e-06,
"loss": 0.5723,
"step": 580
},
{
"epoch": 0.3743654822335025,
"grad_norm": 1.1758859157562256,
"learning_rate": 9.981673011182098e-06,
"loss": 0.5963,
"step": 590
},
{
"epoch": 0.38071065989847713,
"grad_norm": 1.4700498580932617,
"learning_rate": 9.978379363447348e-06,
"loss": 0.5682,
"step": 600
},
{
"epoch": 0.3870558375634518,
"grad_norm": 1.7378568649291992,
"learning_rate": 9.974814330217858e-06,
"loss": 0.6286,
"step": 610
},
{
"epoch": 0.3934010152284264,
"grad_norm": 1.5732265710830688,
"learning_rate": 9.970978105833632e-06,
"loss": 0.5464,
"step": 620
},
{
"epoch": 0.399746192893401,
"grad_norm": 1.4477766752243042,
"learning_rate": 9.966870899418087e-06,
"loss": 0.5806,
"step": 630
},
{
"epoch": 0.40609137055837563,
"grad_norm": 1.5664384365081787,
"learning_rate": 9.96249293486662e-06,
"loss": 0.5868,
"step": 640
},
{
"epoch": 0.41243654822335024,
"grad_norm": 1.242577075958252,
"learning_rate": 9.957844450834418e-06,
"loss": 0.5943,
"step": 650
},
{
"epoch": 0.41878172588832485,
"grad_norm": 1.3932079076766968,
"learning_rate": 9.952925700723455e-06,
"loss": 0.5582,
"step": 660
},
{
"epoch": 0.4251269035532995,
"grad_norm": 1.4832308292388916,
"learning_rate": 9.947736952668667e-06,
"loss": 0.561,
"step": 670
},
{
"epoch": 0.43147208121827413,
"grad_norm": 1.8345366716384888,
"learning_rate": 9.942278489523338e-06,
"loss": 0.5459,
"step": 680
},
{
"epoch": 0.43781725888324874,
"grad_norm": 1.1875063180923462,
"learning_rate": 9.936550608843685e-06,
"loss": 0.5267,
"step": 690
},
{
"epoch": 0.44416243654822335,
"grad_norm": 1.4732545614242554,
"learning_rate": 9.930553622872631e-06,
"loss": 0.5814,
"step": 700
},
{
"epoch": 0.45050761421319796,
"grad_norm": 1.7493573427200317,
"learning_rate": 9.924287858522789e-06,
"loss": 0.5633,
"step": 710
},
{
"epoch": 0.45685279187817257,
"grad_norm": 1.4842727184295654,
"learning_rate": 9.917753657358638e-06,
"loss": 0.53,
"step": 720
},
{
"epoch": 0.4631979695431472,
"grad_norm": 1.6605039834976196,
"learning_rate": 9.910951375577907e-06,
"loss": 0.5231,
"step": 730
},
{
"epoch": 0.46954314720812185,
"grad_norm": 1.6541188955307007,
"learning_rate": 9.903881383992153e-06,
"loss": 0.5268,
"step": 740
},
{
"epoch": 0.47588832487309646,
"grad_norm": 1.8268778324127197,
"learning_rate": 9.89654406800655e-06,
"loss": 0.49,
"step": 750
},
{
"epoch": 0.48223350253807107,
"grad_norm": 1.4834731817245483,
"learning_rate": 9.88893982759888e-06,
"loss": 0.5045,
"step": 760
},
{
"epoch": 0.4885786802030457,
"grad_norm": 1.717140555381775,
"learning_rate": 9.881069077297724e-06,
"loss": 0.496,
"step": 770
},
{
"epoch": 0.4949238578680203,
"grad_norm": 1.0741287469863892,
"learning_rate": 9.872932246159873e-06,
"loss": 0.4679,
"step": 780
},
{
"epoch": 0.501269035532995,
"grad_norm": 1.2269752025604248,
"learning_rate": 9.864529777746929e-06,
"loss": 0.4772,
"step": 790
},
{
"epoch": 0.5076142131979695,
"grad_norm": 1.6613504886627197,
"learning_rate": 9.85586213010114e-06,
"loss": 0.5008,
"step": 800
},
{
"epoch": 0.5139593908629442,
"grad_norm": 1.2009035348892212,
"learning_rate": 9.846929775720411e-06,
"loss": 0.5038,
"step": 810
},
{
"epoch": 0.5203045685279187,
"grad_norm": 1.5814530849456787,
"learning_rate": 9.837733201532565e-06,
"loss": 0.5021,
"step": 820
},
{
"epoch": 0.5266497461928934,
"grad_norm": 1.6952024698257446,
"learning_rate": 9.82827290886879e-06,
"loss": 0.4845,
"step": 830
},
{
"epoch": 0.5329949238578681,
"grad_norm": 1.3526102304458618,
"learning_rate": 9.818549413436309e-06,
"loss": 0.4952,
"step": 840
},
{
"epoch": 0.5393401015228426,
"grad_norm": 1.7655881643295288,
"learning_rate": 9.80856324529027e-06,
"loss": 0.4678,
"step": 850
},
{
"epoch": 0.5456852791878173,
"grad_norm": 1.391158103942871,
"learning_rate": 9.79831494880486e-06,
"loss": 0.4702,
"step": 860
},
{
"epoch": 0.5520304568527918,
"grad_norm": 1.3191405534744263,
"learning_rate": 9.787805082643604e-06,
"loss": 0.4394,
"step": 870
},
{
"epoch": 0.5583756345177665,
"grad_norm": 1.537750005722046,
"learning_rate": 9.777034219728943e-06,
"loss": 0.4172,
"step": 880
},
{
"epoch": 0.5647208121827412,
"grad_norm": 1.953177809715271,
"learning_rate": 9.76600294721098e-06,
"loss": 0.4846,
"step": 890
},
{
"epoch": 0.5710659898477157,
"grad_norm": 1.3089863061904907,
"learning_rate": 9.754711866435477e-06,
"loss": 0.414,
"step": 900
},
{
"epoch": 0.5774111675126904,
"grad_norm": 1.6026610136032104,
"learning_rate": 9.743161592911088e-06,
"loss": 0.5243,
"step": 910
},
{
"epoch": 0.583756345177665,
"grad_norm": 1.7620460987091064,
"learning_rate": 9.731352756275781e-06,
"loss": 0.4181,
"step": 920
},
{
"epoch": 0.5901015228426396,
"grad_norm": 1.6068378686904907,
"learning_rate": 9.719286000262533e-06,
"loss": 0.3713,
"step": 930
},
{
"epoch": 0.5964467005076142,
"grad_norm": 2.3091704845428467,
"learning_rate": 9.706961982664239e-06,
"loss": 0.4562,
"step": 940
},
{
"epoch": 0.6027918781725888,
"grad_norm": 2.353106737136841,
"learning_rate": 9.69438137529784e-06,
"loss": 0.4361,
"step": 950
},
{
"epoch": 0.6091370558375635,
"grad_norm": 1.599411129951477,
"learning_rate": 9.681544863967713e-06,
"loss": 0.4496,
"step": 960
},
{
"epoch": 0.6154822335025381,
"grad_norm": 1.5869901180267334,
"learning_rate": 9.668453148428282e-06,
"loss": 0.4046,
"step": 970
},
{
"epoch": 0.6218274111675127,
"grad_norm": 1.7548712491989136,
"learning_rate": 9.65510694234587e-06,
"loss": 0.3627,
"step": 980
},
{
"epoch": 0.6281725888324873,
"grad_norm": 1.3313032388687134,
"learning_rate": 9.641506973259798e-06,
"loss": 0.4176,
"step": 990
},
{
"epoch": 0.6345177664974619,
"grad_norm": 3.056716203689575,
"learning_rate": 9.627653982542722e-06,
"loss": 0.4283,
"step": 1000
},
{
"epoch": 0.6408629441624365,
"grad_norm": 1.8358234167099,
"learning_rate": 9.613548725360224e-06,
"loss": 0.4217,
"step": 1010
},
{
"epoch": 0.6472081218274112,
"grad_norm": 1.823522686958313,
"learning_rate": 9.599191970629638e-06,
"loss": 0.437,
"step": 1020
},
{
"epoch": 0.6535532994923858,
"grad_norm": 1.779383897781372,
"learning_rate": 9.584584500978144e-06,
"loss": 0.3995,
"step": 1030
},
{
"epoch": 0.6598984771573604,
"grad_norm": 1.7531787157058716,
"learning_rate": 9.569727112700093e-06,
"loss": 0.4449,
"step": 1040
},
{
"epoch": 0.666243654822335,
"grad_norm": 2.1453044414520264,
"learning_rate": 9.55462061571361e-06,
"loss": 0.3754,
"step": 1050
},
{
"epoch": 0.6725888324873096,
"grad_norm": 1.6521024703979492,
"learning_rate": 9.539265833516434e-06,
"loss": 0.419,
"step": 1060
},
{
"epoch": 0.6789340101522843,
"grad_norm": 1.616896152496338,
"learning_rate": 9.523663603141032e-06,
"loss": 0.4076,
"step": 1070
},
{
"epoch": 0.6852791878172588,
"grad_norm": 1.219354510307312,
"learning_rate": 9.507814775108971e-06,
"loss": 0.4092,
"step": 1080
},
{
"epoch": 0.6916243654822335,
"grad_norm": 22.454200744628906,
"learning_rate": 9.49172021338455e-06,
"loss": 0.4034,
"step": 1090
},
{
"epoch": 0.6979695431472082,
"grad_norm": 1.8505566120147705,
"learning_rate": 9.475380795327702e-06,
"loss": 0.3824,
"step": 1100
},
{
"epoch": 0.7043147208121827,
"grad_norm": 1.492254376411438,
"learning_rate": 9.458797411646176e-06,
"loss": 0.3405,
"step": 1110
},
{
"epoch": 0.7106598984771574,
"grad_norm": 1.774132251739502,
"learning_rate": 9.441970966346965e-06,
"loss": 0.3425,
"step": 1120
},
{
"epoch": 0.7170050761421319,
"grad_norm": 1.2463436126708984,
"learning_rate": 9.424902376687045e-06,
"loss": 0.3594,
"step": 1130
},
{
"epoch": 0.7233502538071066,
"grad_norm": 1.515215277671814,
"learning_rate": 9.407592573123359e-06,
"loss": 0.359,
"step": 1140
},
{
"epoch": 0.7296954314720813,
"grad_norm": 3.103351593017578,
"learning_rate": 9.390042499262102e-06,
"loss": 0.3554,
"step": 1150
},
{
"epoch": 0.7360406091370558,
"grad_norm": 1.8471239805221558,
"learning_rate": 9.372253111807276e-06,
"loss": 0.3251,
"step": 1160
},
{
"epoch": 0.7423857868020305,
"grad_norm": 1.8411760330200195,
"learning_rate": 9.354225380508548e-06,
"loss": 0.3233,
"step": 1170
},
{
"epoch": 0.748730964467005,
"grad_norm": 1.499944806098938,
"learning_rate": 9.33596028810838e-06,
"loss": 0.3718,
"step": 1180
},
{
"epoch": 0.7550761421319797,
"grad_norm": 2.158557653427124,
"learning_rate": 9.317458830288446e-06,
"loss": 0.3463,
"step": 1190
},
{
"epoch": 0.7614213197969543,
"grad_norm": 1.5045950412750244,
"learning_rate": 9.29872201561538e-06,
"loss": 0.3682,
"step": 1200
},
{
"epoch": 0.7677664974619289,
"grad_norm": 1.9903945922851562,
"learning_rate": 9.279750865485772e-06,
"loss": 0.3149,
"step": 1210
},
{
"epoch": 0.7741116751269036,
"grad_norm": 1.7139513492584229,
"learning_rate": 9.260546414070504e-06,
"loss": 0.2947,
"step": 1220
},
{
"epoch": 0.7804568527918782,
"grad_norm": 2.4074273109436035,
"learning_rate": 9.241109708258362e-06,
"loss": 0.3451,
"step": 1230
},
{
"epoch": 0.7868020304568528,
"grad_norm": 1.736325740814209,
"learning_rate": 9.221441807598981e-06,
"loss": 0.3156,
"step": 1240
},
{
"epoch": 0.7931472081218274,
"grad_norm": 1.722331166267395,
"learning_rate": 9.201543784245076e-06,
"loss": 0.2895,
"step": 1250
},
{
"epoch": 0.799492385786802,
"grad_norm": 1.800851583480835,
"learning_rate": 9.181416722893998e-06,
"loss": 0.2907,
"step": 1260
},
{
"epoch": 0.8058375634517766,
"grad_norm": 2.2214279174804688,
"learning_rate": 9.161061720728606e-06,
"loss": 0.3074,
"step": 1270
},
{
"epoch": 0.8121827411167513,
"grad_norm": 1.5840632915496826,
"learning_rate": 9.140479887357454e-06,
"loss": 0.2684,
"step": 1280
},
{
"epoch": 0.8185279187817259,
"grad_norm": 2.0567562580108643,
"learning_rate": 9.119672344754307e-06,
"loss": 0.2777,
"step": 1290
},
{
"epoch": 0.8248730964467005,
"grad_norm": 2.080697774887085,
"learning_rate": 9.098640227196978e-06,
"loss": 0.294,
"step": 1300
},
{
"epoch": 0.8312182741116751,
"grad_norm": 2.2059218883514404,
"learning_rate": 9.077384681205487e-06,
"loss": 0.3483,
"step": 1310
},
{
"epoch": 0.8375634517766497,
"grad_norm": 1.5565263032913208,
"learning_rate": 9.055906865479574e-06,
"loss": 0.2744,
"step": 1320
},
{
"epoch": 0.8439086294416244,
"grad_norm": 1.5794973373413086,
"learning_rate": 9.034207950835527e-06,
"loss": 0.2803,
"step": 1330
},
{
"epoch": 0.850253807106599,
"grad_norm": 1.8375296592712402,
"learning_rate": 9.01228912014236e-06,
"loss": 0.2805,
"step": 1340
},
{
"epoch": 0.8565989847715736,
"grad_norm": 1.5420727729797363,
"learning_rate": 8.99015156825733e-06,
"loss": 0.2774,
"step": 1350
},
{
"epoch": 0.8629441624365483,
"grad_norm": 1.6844383478164673,
"learning_rate": 8.967796501960805e-06,
"loss": 0.2724,
"step": 1360
},
{
"epoch": 0.8692893401015228,
"grad_norm": 2.27237606048584,
"learning_rate": 8.945225139890468e-06,
"loss": 0.2514,
"step": 1370
},
{
"epoch": 0.8756345177664975,
"grad_norm": 1.6022717952728271,
"learning_rate": 8.92243871247491e-06,
"loss": 0.2675,
"step": 1380
},
{
"epoch": 0.881979695431472,
"grad_norm": 1.3979642391204834,
"learning_rate": 8.899438461866526e-06,
"loss": 0.2404,
"step": 1390
},
{
"epoch": 0.8883248730964467,
"grad_norm": 1.8629894256591797,
"learning_rate": 8.876225641873822e-06,
"loss": 0.2744,
"step": 1400
},
{
"epoch": 0.8946700507614214,
"grad_norm": 1.6122556924819946,
"learning_rate": 8.852801517893063e-06,
"loss": 0.2814,
"step": 1410
},
{
"epoch": 0.9010152284263959,
"grad_norm": 2.0331978797912598,
"learning_rate": 8.829167366839287e-06,
"loss": 0.2728,
"step": 1420
},
{
"epoch": 0.9073604060913706,
"grad_norm": 1.5905483961105347,
"learning_rate": 8.805324477076697e-06,
"loss": 0.2503,
"step": 1430
},
{
"epoch": 0.9137055837563451,
"grad_norm": 1.9675116539001465,
"learning_rate": 8.781274148348438e-06,
"loss": 0.2241,
"step": 1440
},
{
"epoch": 0.9200507614213198,
"grad_norm": 1.981604814529419,
"learning_rate": 8.757017691705732e-06,
"loss": 0.2789,
"step": 1450
},
{
"epoch": 0.9263959390862944,
"grad_norm": 1.6477928161621094,
"learning_rate": 8.732556429436419e-06,
"loss": 0.2442,
"step": 1460
},
{
"epoch": 0.932741116751269,
"grad_norm": 1.875747799873352,
"learning_rate": 8.70789169499287e-06,
"loss": 0.2372,
"step": 1470
},
{
"epoch": 0.9390862944162437,
"grad_norm": 1.9763504266738892,
"learning_rate": 8.683024832919295e-06,
"loss": 0.2493,
"step": 1480
},
{
"epoch": 0.9454314720812182,
"grad_norm": 2.166445016860962,
"learning_rate": 8.657957198778455e-06,
"loss": 0.2491,
"step": 1490
},
{
"epoch": 0.9517766497461929,
"grad_norm": 2.062021493911743,
"learning_rate": 8.632690159077758e-06,
"loss": 0.2611,
"step": 1500
},
{
"epoch": 0.9581218274111675,
"grad_norm": 1.5676127672195435,
"learning_rate": 8.60722509119478e-06,
"loss": 0.2475,
"step": 1510
},
{
"epoch": 0.9644670050761421,
"grad_norm": 1.734596610069275,
"learning_rate": 8.581563383302158e-06,
"loss": 0.2499,
"step": 1520
},
{
"epoch": 0.9708121827411168,
"grad_norm": 2.276888132095337,
"learning_rate": 8.555706434291944e-06,
"loss": 0.2052,
"step": 1530
},
{
"epoch": 0.9771573604060914,
"grad_norm": 1.5414533615112305,
"learning_rate": 8.529655653699323e-06,
"loss": 0.2008,
"step": 1540
},
{
"epoch": 0.983502538071066,
"grad_norm": 2.0116498470306396,
"learning_rate": 8.503412461625792e-06,
"loss": 0.2088,
"step": 1550
},
{
"epoch": 0.9898477157360406,
"grad_norm": 2.507782220840454,
"learning_rate": 8.47697828866174e-06,
"loss": 0.2212,
"step": 1560
},
{
"epoch": 0.9961928934010152,
"grad_norm": 1.5416207313537598,
"learning_rate": 8.450354575808463e-06,
"loss": 0.227,
"step": 1570
},
{
"epoch": 1.00253807106599,
"grad_norm": 1.7348345518112183,
"learning_rate": 8.423542774399606e-06,
"loss": 0.2192,
"step": 1580
},
{
"epoch": 1.0088832487309645,
"grad_norm": 1.8863823413848877,
"learning_rate": 8.396544346022055e-06,
"loss": 0.159,
"step": 1590
},
{
"epoch": 1.015228426395939,
"grad_norm": 1.3554282188415527,
"learning_rate": 8.36936076243626e-06,
"loss": 0.1519,
"step": 1600
},
{
"epoch": 1.0215736040609138,
"grad_norm": 1.915385127067566,
"learning_rate": 8.341993505496e-06,
"loss": 0.1667,
"step": 1610
},
{
"epoch": 1.0279187817258884,
"grad_norm": 2.683910369873047,
"learning_rate": 8.314444067067611e-06,
"loss": 0.1672,
"step": 1620
},
{
"epoch": 1.034263959390863,
"grad_norm": 3.2767446041107178,
"learning_rate": 8.286713948948646e-06,
"loss": 0.151,
"step": 1630
},
{
"epoch": 1.0406091370558375,
"grad_norm": 1.7172635793685913,
"learning_rate": 8.258804662786031e-06,
"loss": 0.1365,
"step": 1640
},
{
"epoch": 1.0469543147208122,
"grad_norm": 1.9492729902267456,
"learning_rate": 8.230717729993637e-06,
"loss": 0.1521,
"step": 1650
},
{
"epoch": 1.0532994923857868,
"grad_norm": 1.3974714279174805,
"learning_rate": 8.202454681669352e-06,
"loss": 0.1784,
"step": 1660
},
{
"epoch": 1.0596446700507614,
"grad_norm": 1.5528488159179688,
"learning_rate": 8.17401705851163e-06,
"loss": 0.145,
"step": 1670
},
{
"epoch": 1.0659898477157361,
"grad_norm": 4.622862815856934,
"learning_rate": 8.14540641073548e-06,
"loss": 0.149,
"step": 1680
},
{
"epoch": 1.0723350253807107,
"grad_norm": 1.4450290203094482,
"learning_rate": 8.116624297987973e-06,
"loss": 0.1354,
"step": 1690
},
{
"epoch": 1.0786802030456852,
"grad_norm": 1.5473392009735107,
"learning_rate": 8.087672289263228e-06,
"loss": 0.1355,
"step": 1700
},
{
"epoch": 1.0850253807106598,
"grad_norm": 1.55717134475708,
"learning_rate": 8.058551962816858e-06,
"loss": 0.1533,
"step": 1710
},
{
"epoch": 1.0913705583756346,
"grad_norm": 2.583096742630005,
"learning_rate": 8.029264906079962e-06,
"loss": 0.1498,
"step": 1720
},
{
"epoch": 1.0977157360406091,
"grad_norm": 3.534912109375,
"learning_rate": 7.99981271557257e-06,
"loss": 0.1653,
"step": 1730
},
{
"epoch": 1.1040609137055837,
"grad_norm": 1.350325345993042,
"learning_rate": 7.970196996816622e-06,
"loss": 0.1253,
"step": 1740
},
{
"epoch": 1.1104060913705585,
"grad_norm": 1.4373643398284912,
"learning_rate": 7.940419364248445e-06,
"loss": 0.1681,
"step": 1750
},
{
"epoch": 1.116751269035533,
"grad_norm": 2.416491985321045,
"learning_rate": 7.910481441130739e-06,
"loss": 0.1382,
"step": 1760
},
{
"epoch": 1.1230964467005076,
"grad_norm": 1.4168888330459595,
"learning_rate": 7.880384859464102e-06,
"loss": 0.1286,
"step": 1770
},
{
"epoch": 1.1294416243654823,
"grad_norm": 1.4525187015533447,
"learning_rate": 7.850131259898051e-06,
"loss": 0.1454,
"step": 1780
},
{
"epoch": 1.135786802030457,
"grad_norm": 2.431896448135376,
"learning_rate": 7.819722291641591e-06,
"loss": 0.159,
"step": 1790
},
{
"epoch": 1.1421319796954315,
"grad_norm": 1.982692837715149,
"learning_rate": 7.789159612373317e-06,
"loss": 0.1201,
"step": 1800
},
{
"epoch": 1.148477157360406,
"grad_norm": 1.786580204963684,
"learning_rate": 7.758444888151042e-06,
"loss": 0.1274,
"step": 1810
},
{
"epoch": 1.1548223350253808,
"grad_norm": 1.0583122968673706,
"learning_rate": 7.727579793320977e-06,
"loss": 0.1246,
"step": 1820
},
{
"epoch": 1.1611675126903553,
"grad_norm": 1.2649511098861694,
"learning_rate": 7.69656601042646e-06,
"loss": 0.1296,
"step": 1830
},
{
"epoch": 1.16751269035533,
"grad_norm": 1.5088468790054321,
"learning_rate": 7.665405230116232e-06,
"loss": 0.1549,
"step": 1840
},
{
"epoch": 1.1738578680203045,
"grad_norm": 1.6474385261535645,
"learning_rate": 7.634099151052283e-06,
"loss": 0.1114,
"step": 1850
},
{
"epoch": 1.1802030456852792,
"grad_norm": 1.665197730064392,
"learning_rate": 7.602649479817242e-06,
"loss": 0.119,
"step": 1860
},
{
"epoch": 1.1865482233502538,
"grad_norm": 1.6402256488800049,
"learning_rate": 7.5710579308213576e-06,
"loss": 0.105,
"step": 1870
},
{
"epoch": 1.1928934010152283,
"grad_norm": 1.4458770751953125,
"learning_rate": 7.539326226209032e-06,
"loss": 0.1574,
"step": 1880
},
{
"epoch": 1.1992385786802031,
"grad_norm": 1.4857584238052368,
"learning_rate": 7.507456095764942e-06,
"loss": 0.1265,
"step": 1890
},
{
"epoch": 1.2055837563451777,
"grad_norm": 1.7672957181930542,
"learning_rate": 7.475449276819753e-06,
"loss": 0.1152,
"step": 1900
},
{
"epoch": 1.2119289340101522,
"grad_norm": 1.756518006324768,
"learning_rate": 7.443307514155402e-06,
"loss": 0.1051,
"step": 1910
},
{
"epoch": 1.218274111675127,
"grad_norm": 2.3999290466308594,
"learning_rate": 7.411032559909991e-06,
"loss": 0.1249,
"step": 1920
},
{
"epoch": 1.2246192893401016,
"grad_norm": 2.726649522781372,
"learning_rate": 7.378626173482268e-06,
"loss": 0.1065,
"step": 1930
},
{
"epoch": 1.2309644670050761,
"grad_norm": 1.4104615449905396,
"learning_rate": 7.346090121435724e-06,
"loss": 0.0982,
"step": 1940
},
{
"epoch": 1.2373096446700507,
"grad_norm": 1.8831905126571655,
"learning_rate": 7.313426177402281e-06,
"loss": 0.1091,
"step": 1950
},
{
"epoch": 1.2436548223350254,
"grad_norm": 2.125528573989868,
"learning_rate": 7.2806361219856205e-06,
"loss": 0.1197,
"step": 1960
},
{
"epoch": 1.25,
"grad_norm": 1.8320462703704834,
"learning_rate": 7.24772174266411e-06,
"loss": 0.0979,
"step": 1970
},
{
"epoch": 1.2563451776649746,
"grad_norm": 1.6644319295883179,
"learning_rate": 7.214684833693362e-06,
"loss": 0.1451,
"step": 1980
},
{
"epoch": 1.262690355329949,
"grad_norm": 1.816611886024475,
"learning_rate": 7.181527196008424e-06,
"loss": 0.1111,
"step": 1990
},
{
"epoch": 1.2690355329949239,
"grad_norm": 2.8035154342651367,
"learning_rate": 7.148250637125611e-06,
"loss": 0.0894,
"step": 2000
},
{
"epoch": 1.2753807106598984,
"grad_norm": 1.8045902252197266,
"learning_rate": 7.114856971043963e-06,
"loss": 0.0931,
"step": 2010
},
{
"epoch": 1.281725888324873,
"grad_norm": 1.637097716331482,
"learning_rate": 7.081348018146367e-06,
"loss": 0.1572,
"step": 2020
},
{
"epoch": 1.2880710659898478,
"grad_norm": 1.4267776012420654,
"learning_rate": 7.047725605100317e-06,
"loss": 0.1071,
"step": 2030
},
{
"epoch": 1.2944162436548223,
"grad_norm": 2.571660280227661,
"learning_rate": 7.01399156475834e-06,
"loss": 0.1158,
"step": 2040
},
{
"epoch": 1.3007614213197969,
"grad_norm": 2.324598789215088,
"learning_rate": 6.980147736058083e-06,
"loss": 0.0959,
"step": 2050
},
{
"epoch": 1.3071065989847717,
"grad_norm": 1.4909052848815918,
"learning_rate": 6.946195963922064e-06,
"loss": 0.1202,
"step": 2060
},
{
"epoch": 1.3134517766497462,
"grad_norm": 1.6092907190322876,
"learning_rate": 6.9121380991571065e-06,
"loss": 0.0805,
"step": 2070
},
{
"epoch": 1.3197969543147208,
"grad_norm": 1.2184277772903442,
"learning_rate": 6.877975998353433e-06,
"loss": 0.1132,
"step": 2080
},
{
"epoch": 1.3261421319796955,
"grad_norm": 1.2614070177078247,
"learning_rate": 6.8437115237834765e-06,
"loss": 0.089,
"step": 2090
},
{
"epoch": 1.33248730964467,
"grad_norm": 1.7008192539215088,
"learning_rate": 6.809346543300346e-06,
"loss": 0.0787,
"step": 2100
},
{
"epoch": 1.3388324873096447,
"grad_norm": 1.3894529342651367,
"learning_rate": 6.774882930236015e-06,
"loss": 0.0962,
"step": 2110
},
{
"epoch": 1.3451776649746192,
"grad_norm": 1.7126891613006592,
"learning_rate": 6.740322563299195e-06,
"loss": 0.0952,
"step": 2120
},
{
"epoch": 1.351522842639594,
"grad_norm": 1.7561262845993042,
"learning_rate": 6.705667326472926e-06,
"loss": 0.0989,
"step": 2130
},
{
"epoch": 1.3578680203045685,
"grad_norm": 1.4162139892578125,
"learning_rate": 6.6709191089118685e-06,
"loss": 0.1046,
"step": 2140
},
{
"epoch": 1.364213197969543,
"grad_norm": 1.8884022235870361,
"learning_rate": 6.636079804839329e-06,
"loss": 0.0847,
"step": 2150
},
{
"epoch": 1.3705583756345177,
"grad_norm": 1.4617987871170044,
"learning_rate": 6.601151313443997e-06,
"loss": 0.0858,
"step": 2160
},
{
"epoch": 1.3769035532994924,
"grad_norm": 1.5476235151290894,
"learning_rate": 6.566135538776413e-06,
"loss": 0.0907,
"step": 2170
},
{
"epoch": 1.383248730964467,
"grad_norm": 1.8879975080490112,
"learning_rate": 6.531034389645175e-06,
"loss": 0.1255,
"step": 2180
},
{
"epoch": 1.3895939086294415,
"grad_norm": 1.563038945198059,
"learning_rate": 6.495849779512879e-06,
"loss": 0.084,
"step": 2190
},
{
"epoch": 1.3959390862944163,
"grad_norm": 2.6775851249694824,
"learning_rate": 6.460583626391827e-06,
"loss": 0.0957,
"step": 2200
},
{
"epoch": 1.4022842639593909,
"grad_norm": 5.497508525848389,
"learning_rate": 6.4252378527394475e-06,
"loss": 0.0882,
"step": 2210
},
{
"epoch": 1.4086294416243654,
"grad_norm": 2.2709615230560303,
"learning_rate": 6.3898143853535145e-06,
"loss": 0.1038,
"step": 2220
},
{
"epoch": 1.4149746192893402,
"grad_norm": 2.0166831016540527,
"learning_rate": 6.354315155267105e-06,
"loss": 0.0778,
"step": 2230
},
{
"epoch": 1.4213197969543148,
"grad_norm": 1.4909207820892334,
"learning_rate": 6.318742097643336e-06,
"loss": 0.1091,
"step": 2240
},
{
"epoch": 1.4276649746192893,
"grad_norm": 2.3677256107330322,
"learning_rate": 6.283097151669869e-06,
"loss": 0.1019,
"step": 2250
},
{
"epoch": 1.434010152284264,
"grad_norm": 3.072751045227051,
"learning_rate": 6.247382260453203e-06,
"loss": 0.1004,
"step": 2260
},
{
"epoch": 1.4403553299492386,
"grad_norm": 2.3845341205596924,
"learning_rate": 6.211599370912752e-06,
"loss": 0.0886,
"step": 2270
},
{
"epoch": 1.4467005076142132,
"grad_norm": 4.395678997039795,
"learning_rate": 6.175750433674708e-06,
"loss": 0.1095,
"step": 2280
},
{
"epoch": 1.4530456852791878,
"grad_norm": 1.326743721961975,
"learning_rate": 6.139837402965705e-06,
"loss": 0.1021,
"step": 2290
},
{
"epoch": 1.4593908629441623,
"grad_norm": 1.4270453453063965,
"learning_rate": 6.103862236506303e-06,
"loss": 0.0744,
"step": 2300
},
{
"epoch": 1.465736040609137,
"grad_norm": 1.5374149084091187,
"learning_rate": 6.067826895404249e-06,
"loss": 0.0757,
"step": 2310
},
{
"epoch": 1.4720812182741116,
"grad_norm": 1.5649033784866333,
"learning_rate": 6.031733344047581e-06,
"loss": 0.1023,
"step": 2320
},
{
"epoch": 1.4784263959390862,
"grad_norm": 1.169797420501709,
"learning_rate": 5.995583549997542e-06,
"loss": 0.0654,
"step": 2330
},
{
"epoch": 1.484771573604061,
"grad_norm": 1.8578475713729858,
"learning_rate": 5.959379483881327e-06,
"loss": 0.0819,
"step": 2340
},
{
"epoch": 1.4911167512690355,
"grad_norm": 1.6423859596252441,
"learning_rate": 5.923123119284646e-06,
"loss": 0.0663,
"step": 2350
},
{
"epoch": 1.49746192893401,
"grad_norm": 1.1731383800506592,
"learning_rate": 5.886816432644155e-06,
"loss": 0.0932,
"step": 2360
},
{
"epoch": 1.5038071065989849,
"grad_norm": 1.0412118434906006,
"learning_rate": 5.850461403139702e-06,
"loss": 0.0807,
"step": 2370
},
{
"epoch": 1.5101522842639594,
"grad_norm": 1.5270987749099731,
"learning_rate": 5.814060012586443e-06,
"loss": 0.0747,
"step": 2380
},
{
"epoch": 1.516497461928934,
"grad_norm": 1.9564098119735718,
"learning_rate": 5.777614245326802e-06,
"loss": 0.0715,
"step": 2390
},
{
"epoch": 1.5228426395939088,
"grad_norm": 1.6264362335205078,
"learning_rate": 5.7411260881223045e-06,
"loss": 0.0947,
"step": 2400
},
{
"epoch": 1.529187817258883,
"grad_norm": 1.0679928064346313,
"learning_rate": 5.704597530045272e-06,
"loss": 0.0669,
"step": 2410
},
{
"epoch": 1.5355329949238579,
"grad_norm": 1.393947720527649,
"learning_rate": 5.6680305623703926e-06,
"loss": 0.089,
"step": 2420
},
{
"epoch": 1.5418781725888326,
"grad_norm": 1.8824158906936646,
"learning_rate": 5.631427178466166e-06,
"loss": 0.071,
"step": 2430
},
{
"epoch": 1.548223350253807,
"grad_norm": 1.060774326324463,
"learning_rate": 5.594789373686247e-06,
"loss": 0.0747,
"step": 2440
},
{
"epoch": 1.5545685279187818,
"grad_norm": 1.935646891593933,
"learning_rate": 5.5581191452606664e-06,
"loss": 0.0671,
"step": 2450
},
{
"epoch": 1.5609137055837563,
"grad_norm": 1.2591124773025513,
"learning_rate": 5.521418492186962e-06,
"loss": 0.0796,
"step": 2460
},
{
"epoch": 1.5672588832487309,
"grad_norm": 2.050698757171631,
"learning_rate": 5.484689415121204e-06,
"loss": 0.0724,
"step": 2470
},
{
"epoch": 1.5736040609137056,
"grad_norm": 1.2225536108016968,
"learning_rate": 5.447933916268933e-06,
"loss": 0.0591,
"step": 2480
},
{
"epoch": 1.5799492385786802,
"grad_norm": 4.785628318786621,
"learning_rate": 5.411153999276016e-06,
"loss": 0.0873,
"step": 2490
},
{
"epoch": 1.5862944162436547,
"grad_norm": 2.2066152095794678,
"learning_rate": 5.374351669119425e-06,
"loss": 0.057,
"step": 2500
},
{
"epoch": 1.5926395939086295,
"grad_norm": 1.9447569847106934,
"learning_rate": 5.337528931997934e-06,
"loss": 0.0548,
"step": 2510
},
{
"epoch": 1.598984771573604,
"grad_norm": 2.1758713722229004,
"learning_rate": 5.3006877952227585e-06,
"loss": 0.0674,
"step": 2520
},
{
"epoch": 1.6053299492385786,
"grad_norm": 1.5067161321640015,
"learning_rate": 5.263830267108129e-06,
"loss": 0.0583,
"step": 2530
},
{
"epoch": 1.6116751269035534,
"grad_norm": 1.6991007328033447,
"learning_rate": 5.226958356861819e-06,
"loss": 0.0521,
"step": 2540
},
{
"epoch": 1.618020304568528,
"grad_norm": 1.2602826356887817,
"learning_rate": 5.190074074475606e-06,
"loss": 0.0674,
"step": 2550
},
{
"epoch": 1.6243654822335025,
"grad_norm": 2.1869382858276367,
"learning_rate": 5.153179430615716e-06,
"loss": 0.062,
"step": 2560
},
{
"epoch": 1.6307106598984773,
"grad_norm": 1.6224417686462402,
"learning_rate": 5.116276436513201e-06,
"loss": 0.0718,
"step": 2570
},
{
"epoch": 1.6370558375634516,
"grad_norm": 2.291430711746216,
"learning_rate": 5.079367103854311e-06,
"loss": 0.0722,
"step": 2580
},
{
"epoch": 1.6434010152284264,
"grad_norm": 1.0190826654434204,
"learning_rate": 5.042453444670829e-06,
"loss": 0.0612,
"step": 2590
},
{
"epoch": 1.649746192893401,
"grad_norm": 1.6983177661895752,
"learning_rate": 5.005537471230387e-06,
"loss": 0.06,
"step": 2600
},
{
"epoch": 1.6560913705583755,
"grad_norm": 1.5693427324295044,
"learning_rate": 4.968621195926779e-06,
"loss": 0.0674,
"step": 2610
},
{
"epoch": 1.6624365482233503,
"grad_norm": 1.4258981943130493,
"learning_rate": 4.931706631170246e-06,
"loss": 0.0602,
"step": 2620
},
{
"epoch": 1.6687817258883249,
"grad_norm": 1.9744484424591064,
"learning_rate": 4.894795789277789e-06,
"loss": 0.0657,
"step": 2630
},
{
"epoch": 1.6751269035532994,
"grad_norm": 1.0477792024612427,
"learning_rate": 4.857890682363461e-06,
"loss": 0.0643,
"step": 2640
},
{
"epoch": 1.6814720812182742,
"grad_norm": 1.2517801523208618,
"learning_rate": 4.820993322228691e-06,
"loss": 0.0574,
"step": 2650
},
{
"epoch": 1.6878172588832487,
"grad_norm": 1.339064359664917,
"learning_rate": 4.784105720252602e-06,
"loss": 0.0639,
"step": 2660
},
{
"epoch": 1.6941624365482233,
"grad_norm": 1.0788367986679077,
"learning_rate": 4.747229887282379e-06,
"loss": 0.044,
"step": 2670
},
{
"epoch": 1.700507614213198,
"grad_norm": 0.8012908697128296,
"learning_rate": 4.7103678335236395e-06,
"loss": 0.0642,
"step": 2680
},
{
"epoch": 1.7068527918781726,
"grad_norm": 1.975696086883545,
"learning_rate": 4.673521568430859e-06,
"loss": 0.0655,
"step": 2690
},
{
"epoch": 1.7131979695431472,
"grad_norm": 1.7474173307418823,
"learning_rate": 4.63669310059783e-06,
"loss": 0.0447,
"step": 2700
},
{
"epoch": 1.719543147208122,
"grad_norm": 0.9429912567138672,
"learning_rate": 4.5998844376481665e-06,
"loss": 0.0588,
"step": 2710
},
{
"epoch": 1.7258883248730963,
"grad_norm": 2.345489025115967,
"learning_rate": 4.5630975861258605e-06,
"loss": 0.0637,
"step": 2720
},
{
"epoch": 1.732233502538071,
"grad_norm": 0.8988242149353027,
"learning_rate": 4.526334551385902e-06,
"loss": 0.0613,
"step": 2730
},
{
"epoch": 1.7385786802030458,
"grad_norm": 2.0134191513061523,
"learning_rate": 4.489597337484961e-06,
"loss": 0.0533,
"step": 2740
},
{
"epoch": 1.7449238578680202,
"grad_norm": 1.8432866334915161,
"learning_rate": 4.452887947072142e-06,
"loss": 0.0684,
"step": 2750
},
{
"epoch": 1.751269035532995,
"grad_norm": 3.151284694671631,
"learning_rate": 4.416208381279812e-06,
"loss": 0.0556,
"step": 2760
},
{
"epoch": 1.7576142131979695,
"grad_norm": 1.051060676574707,
"learning_rate": 4.379560639614513e-06,
"loss": 0.0498,
"step": 2770
},
{
"epoch": 1.763959390862944,
"grad_norm": 1.5683525800704956,
"learning_rate": 4.3429467198479665e-06,
"loss": 0.0524,
"step": 2780
},
{
"epoch": 1.7703045685279188,
"grad_norm": 1.0461344718933105,
"learning_rate": 4.306368617908163e-06,
"loss": 0.0445,
"step": 2790
},
{
"epoch": 1.7766497461928934,
"grad_norm": 1.2296735048294067,
"learning_rate": 4.2698283277705655e-06,
"loss": 0.0464,
"step": 2800
},
{
"epoch": 1.782994923857868,
"grad_norm": 0.9869544506072998,
"learning_rate": 4.23332784134941e-06,
"loss": 0.0506,
"step": 2810
},
{
"epoch": 1.7893401015228427,
"grad_norm": 2.624345541000366,
"learning_rate": 4.196869148389114e-06,
"loss": 0.0455,
"step": 2820
},
{
"epoch": 1.7956852791878173,
"grad_norm": 2.0790648460388184,
"learning_rate": 4.160454236355822e-06,
"loss": 0.0465,
"step": 2830
},
{
"epoch": 1.8020304568527918,
"grad_norm": 1.0878472328186035,
"learning_rate": 4.124085090329056e-06,
"loss": 0.0354,
"step": 2840
},
{
"epoch": 1.8083756345177666,
"grad_norm": 1.4148125648498535,
"learning_rate": 4.087763692893498e-06,
"loss": 0.0378,
"step": 2850
},
{
"epoch": 1.8147208121827412,
"grad_norm": 0.8988755941390991,
"learning_rate": 4.051492024030925e-06,
"loss": 0.0421,
"step": 2860
},
{
"epoch": 1.8210659898477157,
"grad_norm": 2.1405270099639893,
"learning_rate": 4.015272061012271e-06,
"loss": 0.0647,
"step": 2870
},
{
"epoch": 1.8274111675126905,
"grad_norm": 0.8886227607727051,
"learning_rate": 3.979105778289832e-06,
"loss": 0.0547,
"step": 2880
},
{
"epoch": 1.8337563451776648,
"grad_norm": 1.402446985244751,
"learning_rate": 3.942995147389648e-06,
"loss": 0.0378,
"step": 2890
},
{
"epoch": 1.8401015228426396,
"grad_norm": 1.283605933189392,
"learning_rate": 3.9069421368040115e-06,
"loss": 0.0488,
"step": 2900
},
{
"epoch": 1.8464467005076142,
"grad_norm": 1.226680874824524,
"learning_rate": 3.870948711884178e-06,
"loss": 0.0382,
"step": 2910
},
{
"epoch": 1.8527918781725887,
"grad_norm": 1.871385097503662,
"learning_rate": 3.835016834733216e-06,
"loss": 0.0441,
"step": 2920
},
{
"epoch": 1.8591370558375635,
"grad_norm": 1.125570297241211,
"learning_rate": 3.7991484640990506e-06,
"loss": 0.0429,
"step": 2930
},
{
"epoch": 1.865482233502538,
"grad_norm": 1.131261944770813,
"learning_rate": 3.763345555267692e-06,
"loss": 0.0404,
"step": 2940
},
{
"epoch": 1.8718274111675126,
"grad_norm": 1.5131438970565796,
"learning_rate": 3.727610059956641e-06,
"loss": 0.0359,
"step": 2950
},
{
"epoch": 1.8781725888324874,
"grad_norm": 0.8379979133605957,
"learning_rate": 3.691943926208494e-06,
"loss": 0.0508,
"step": 2960
},
{
"epoch": 1.884517766497462,
"grad_norm": 1.1895625591278076,
"learning_rate": 3.6563490982847577e-06,
"loss": 0.034,
"step": 2970
},
{
"epoch": 1.8908629441624365,
"grad_norm": 0.7952091097831726,
"learning_rate": 3.620827516559854e-06,
"loss": 0.0494,
"step": 2980
},
{
"epoch": 1.8972081218274113,
"grad_norm": 1.2926766872406006,
"learning_rate": 3.58538111741535e-06,
"loss": 0.0483,
"step": 2990
},
{
"epoch": 1.9035532994923858,
"grad_norm": 1.165218472480774,
"learning_rate": 3.550011833134399e-06,
"loss": 0.0446,
"step": 3000
},
{
"epoch": 1.9098984771573604,
"grad_norm": 1.2693628072738647,
"learning_rate": 3.5147215917964037e-06,
"loss": 0.0296,
"step": 3010
},
{
"epoch": 1.9162436548223352,
"grad_norm": 0.7264485955238342,
"learning_rate": 3.4795123171719142e-06,
"loss": 0.0488,
"step": 3020
},
{
"epoch": 1.9225888324873095,
"grad_norm": 0.9121705889701843,
"learning_rate": 3.4443859286177545e-06,
"loss": 0.0299,
"step": 3030
},
{
"epoch": 1.9289340101522843,
"grad_norm": 1.2310829162597656,
"learning_rate": 3.4093443409723985e-06,
"loss": 0.0389,
"step": 3040
},
{
"epoch": 1.9352791878172588,
"grad_norm": 1.087215542793274,
"learning_rate": 3.374389464451583e-06,
"loss": 0.0367,
"step": 3050
},
{
"epoch": 1.9416243654822334,
"grad_norm": 1.1739871501922607,
"learning_rate": 3.339523204544176e-06,
"loss": 0.0407,
"step": 3060
},
{
"epoch": 1.9479695431472082,
"grad_norm": 0.9143801927566528,
"learning_rate": 3.3047474619083043e-06,
"loss": 0.0361,
"step": 3070
},
{
"epoch": 1.9543147208121827,
"grad_norm": 0.9468094706535339,
"learning_rate": 3.2700641322677405e-06,
"loss": 0.0309,
"step": 3080
},
{
"epoch": 1.9606598984771573,
"grad_norm": 1.2729860544204712,
"learning_rate": 3.235475106308569e-06,
"loss": 0.0194,
"step": 3090
},
{
"epoch": 1.967005076142132,
"grad_norm": 1.381415843963623,
"learning_rate": 3.200982269576111e-06,
"loss": 0.0495,
"step": 3100
},
{
"epoch": 1.9733502538071066,
"grad_norm": 1.4151417016983032,
"learning_rate": 3.1665875023721453e-06,
"loss": 0.0344,
"step": 3110
},
{
"epoch": 1.9796954314720812,
"grad_norm": 0.9717885851860046,
"learning_rate": 3.1322926796524016e-06,
"loss": 0.0376,
"step": 3120
},
{
"epoch": 1.986040609137056,
"grad_norm": 0.9146430492401123,
"learning_rate": 3.0980996709243517e-06,
"loss": 0.028,
"step": 3130
},
{
"epoch": 1.9923857868020305,
"grad_norm": 1.5948601961135864,
"learning_rate": 3.0640103401453035e-06,
"loss": 0.0511,
"step": 3140
},
{
"epoch": 1.998730964467005,
"grad_norm": 1.120682716369629,
"learning_rate": 3.030026545620787e-06,
"loss": 0.0411,
"step": 3150
},
{
"epoch": 2.00507614213198,
"grad_norm": 0.8402583003044128,
"learning_rate": 2.9961501399032546e-06,
"loss": 0.0272,
"step": 3160
},
{
"epoch": 2.011421319796954,
"grad_norm": 1.102598786354065,
"learning_rate": 2.9623829696910867e-06,
"loss": 0.0207,
"step": 3170
},
{
"epoch": 2.017766497461929,
"grad_norm": 0.9598972201347351,
"learning_rate": 2.928726875727937e-06,
"loss": 0.0197,
"step": 3180
},
{
"epoch": 2.0241116751269037,
"grad_norm": 0.8507049679756165,
"learning_rate": 2.8951836927023703e-06,
"loss": 0.0161,
"step": 3190
},
{
"epoch": 2.030456852791878,
"grad_norm": 0.9228895902633667,
"learning_rate": 2.861755249147862e-06,
"loss": 0.023,
"step": 3200
},
{
"epoch": 2.036802030456853,
"grad_norm": 0.8271005749702454,
"learning_rate": 2.828443367343119e-06,
"loss": 0.0148,
"step": 3210
},
{
"epoch": 2.0431472081218276,
"grad_norm": 1.2311136722564697,
"learning_rate": 2.7952498632127324e-06,
"loss": 0.0202,
"step": 3220
},
{
"epoch": 2.049492385786802,
"grad_norm": 1.3220641613006592,
"learning_rate": 2.762176546228198e-06,
"loss": 0.0235,
"step": 3230
},
{
"epoch": 2.0558375634517767,
"grad_norm": 1.2385421991348267,
"learning_rate": 2.7292252193092693e-06,
"loss": 0.0205,
"step": 3240
},
{
"epoch": 2.0621827411167515,
"grad_norm": 1.238295316696167,
"learning_rate": 2.6963976787256726e-06,
"loss": 0.0157,
"step": 3250
},
{
"epoch": 2.068527918781726,
"grad_norm": 0.7305588126182556,
"learning_rate": 2.6636957139992003e-06,
"loss": 0.0183,
"step": 3260
},
{
"epoch": 2.0748730964467006,
"grad_norm": 0.8512719869613647,
"learning_rate": 2.631121107806144e-06,
"loss": 0.0204,
"step": 3270
},
{
"epoch": 2.081218274111675,
"grad_norm": 0.8006191849708557,
"learning_rate": 2.598675635880129e-06,
"loss": 0.0223,
"step": 3280
},
{
"epoch": 2.0875634517766497,
"grad_norm": 1.4886091947555542,
"learning_rate": 2.5663610669153043e-06,
"loss": 0.0197,
"step": 3290
},
{
"epoch": 2.0939086294416245,
"grad_norm": 0.7531688213348389,
"learning_rate": 2.534179162469924e-06,
"loss": 0.0222,
"step": 3300
},
{
"epoch": 2.100253807106599,
"grad_norm": 0.6706914305686951,
"learning_rate": 2.502131676870335e-06,
"loss": 0.019,
"step": 3310
},
{
"epoch": 2.1065989847715736,
"grad_norm": 0.8195891380310059,
"learning_rate": 2.470220357115327e-06,
"loss": 0.0099,
"step": 3320
},
{
"epoch": 2.1129441624365484,
"grad_norm": 0.8743392825126648,
"learning_rate": 2.438446942780911e-06,
"loss": 0.0145,
"step": 3330
},
{
"epoch": 2.1192893401015227,
"grad_norm": 0.5079776048660278,
"learning_rate": 2.4068131659254803e-06,
"loss": 0.0164,
"step": 3340
},
{
"epoch": 2.1256345177664975,
"grad_norm": 0.512514054775238,
"learning_rate": 2.3753207509953963e-06,
"loss": 0.0287,
"step": 3350
},
{
"epoch": 2.1319796954314723,
"grad_norm": 0.7019079923629761,
"learning_rate": 2.3439714147309845e-06,
"loss": 0.0189,
"step": 3360
},
{
"epoch": 2.1383248730964466,
"grad_norm": 0.8089588284492493,
"learning_rate": 2.312766866072947e-06,
"loss": 0.0255,
"step": 3370
},
{
"epoch": 2.1446700507614214,
"grad_norm": 0.9173935651779175,
"learning_rate": 2.2817088060692094e-06,
"loss": 0.0149,
"step": 3380
},
{
"epoch": 2.151015228426396,
"grad_norm": 1.1662015914916992,
"learning_rate": 2.2507989277821847e-06,
"loss": 0.0201,
"step": 3390
},
{
"epoch": 2.1573604060913705,
"grad_norm": 0.5388917922973633,
"learning_rate": 2.2200389161964795e-06,
"loss": 0.0198,
"step": 3400
},
{
"epoch": 2.1637055837563453,
"grad_norm": 1.1195067167282104,
"learning_rate": 2.189430448127055e-06,
"loss": 0.0196,
"step": 3410
},
{
"epoch": 2.1700507614213196,
"grad_norm": 0.7136582732200623,
"learning_rate": 2.1589751921277925e-06,
"loss": 0.0188,
"step": 3420
},
{
"epoch": 2.1763959390862944,
"grad_norm": 0.773573100566864,
"learning_rate": 2.128674808400565e-06,
"loss": 0.0212,
"step": 3430
},
{
"epoch": 2.182741116751269,
"grad_norm": 0.7614580392837524,
"learning_rate": 2.098530948704714e-06,
"loss": 0.021,
"step": 3440
},
{
"epoch": 2.1890862944162435,
"grad_norm": 0.6622429490089417,
"learning_rate": 2.068545256267015e-06,
"loss": 0.0169,
"step": 3450
},
{
"epoch": 2.1954314720812182,
"grad_norm": 0.3882254660129547,
"learning_rate": 2.0387193656921063e-06,
"loss": 0.023,
"step": 3460
},
{
"epoch": 2.201776649746193,
"grad_norm": 1.2883610725402832,
"learning_rate": 2.0090549028733685e-06,
"loss": 0.0179,
"step": 3470
},
{
"epoch": 2.2081218274111674,
"grad_norm": 1.0185002088546753,
"learning_rate": 1.9795534849043054e-06,
"loss": 0.0206,
"step": 3480
},
{
"epoch": 2.214467005076142,
"grad_norm": 0.7340651154518127,
"learning_rate": 1.950216719990383e-06,
"loss": 0.0159,
"step": 3490
},
{
"epoch": 2.220812182741117,
"grad_norm": 0.8917669057846069,
"learning_rate": 1.921046207361365e-06,
"loss": 0.014,
"step": 3500
},
{
"epoch": 2.2271573604060912,
"grad_norm": 0.8342999815940857,
"learning_rate": 1.8920435371841394e-06,
"loss": 0.0168,
"step": 3510
},
{
"epoch": 2.233502538071066,
"grad_norm": 0.49451372027397156,
"learning_rate": 1.8632102904760241e-06,
"loss": 0.0202,
"step": 3520
},
{
"epoch": 2.239847715736041,
"grad_norm": 0.8475871086120605,
"learning_rate": 1.8345480390185865e-06,
"loss": 0.0228,
"step": 3530
},
{
"epoch": 2.246192893401015,
"grad_norm": 0.6851008534431458,
"learning_rate": 1.806058345271962e-06,
"loss": 0.016,
"step": 3540
},
{
"epoch": 2.25253807106599,
"grad_norm": 1.2128303050994873,
"learning_rate": 1.7777427622896764e-06,
"loss": 0.0183,
"step": 3550
},
{
"epoch": 2.2588832487309647,
"grad_norm": 0.3974970877170563,
"learning_rate": 1.749602833633992e-06,
"loss": 0.0221,
"step": 3560
},
{
"epoch": 2.265228426395939,
"grad_norm": 0.6373499631881714,
"learning_rate": 1.7216400932917544e-06,
"loss": 0.0184,
"step": 3570
},
{
"epoch": 2.271573604060914,
"grad_norm": 0.6473302245140076,
"learning_rate": 1.6938560655907743e-06,
"loss": 0.0156,
"step": 3580
},
{
"epoch": 2.277918781725888,
"grad_norm": 0.5753197073936462,
"learning_rate": 1.6662522651167345e-06,
"loss": 0.0137,
"step": 3590
},
{
"epoch": 2.284263959390863,
"grad_norm": 0.9094467759132385,
"learning_rate": 1.6388301966306215e-06,
"loss": 0.0147,
"step": 3600
},
{
"epoch": 2.2906091370558377,
"grad_norm": 0.5902413725852966,
"learning_rate": 1.6115913549867025e-06,
"loss": 0.0224,
"step": 3610
},
{
"epoch": 2.296954314720812,
"grad_norm": 0.875133752822876,
"learning_rate": 1.5845372250510287e-06,
"loss": 0.0232,
"step": 3620
},
{
"epoch": 2.303299492385787,
"grad_norm": 1.241910696029663,
"learning_rate": 1.557669281620497e-06,
"loss": 0.0099,
"step": 3630
},
{
"epoch": 2.3096446700507616,
"grad_norm": 0.6328564882278442,
"learning_rate": 1.5309889893424563e-06,
"loss": 0.0132,
"step": 3640
},
{
"epoch": 2.315989847715736,
"grad_norm": 0.5470057725906372,
"learning_rate": 1.5044978026348527e-06,
"loss": 0.0164,
"step": 3650
},
{
"epoch": 2.3223350253807107,
"grad_norm": 1.0264612436294556,
"learning_rate": 1.4781971656069665e-06,
"loss": 0.0203,
"step": 3660
},
{
"epoch": 2.3286802030456855,
"grad_norm": 0.6052107810974121,
"learning_rate": 1.4520885119806704e-06,
"loss": 0.026,
"step": 3670
},
{
"epoch": 2.33502538071066,
"grad_norm": 0.4180527329444885,
"learning_rate": 1.4261732650122795e-06,
"loss": 0.0204,
"step": 3680
},
{
"epoch": 2.3413705583756346,
"grad_norm": 0.6096001267433167,
"learning_rate": 1.4004528374149745e-06,
"loss": 0.0095,
"step": 3690
},
{
"epoch": 2.347715736040609,
"grad_norm": 0.5584781765937805,
"learning_rate": 1.3749286312817722e-06,
"loss": 0.0126,
"step": 3700
},
{
"epoch": 2.3540609137055837,
"grad_norm": 0.3657080829143524,
"learning_rate": 1.349602038009114e-06,
"loss": 0.0108,
"step": 3710
},
{
"epoch": 2.3604060913705585,
"grad_norm": 0.9728971719741821,
"learning_rate": 1.3244744382210017e-06,
"loss": 0.0104,
"step": 3720
},
{
"epoch": 2.3667512690355332,
"grad_norm": 0.8524286150932312,
"learning_rate": 1.2995472016937405e-06,
"loss": 0.0167,
"step": 3730
},
{
"epoch": 2.3730964467005076,
"grad_norm": 0.6725841164588928,
"learning_rate": 1.2748216872812747e-06,
"loss": 0.0131,
"step": 3740
},
{
"epoch": 2.3794416243654823,
"grad_norm": 0.8610649704933167,
"learning_rate": 1.2502992428411022e-06,
"loss": 0.018,
"step": 3750
},
{
"epoch": 2.3857868020304567,
"grad_norm": 0.4205199182033539,
"learning_rate": 1.2259812051608066e-06,
"loss": 0.0158,
"step": 3760
},
{
"epoch": 2.3921319796954315,
"grad_norm": 0.7805858850479126,
"learning_rate": 1.2018688998851802e-06,
"loss": 0.0203,
"step": 3770
},
{
"epoch": 2.3984771573604062,
"grad_norm": 0.2444067746400833,
"learning_rate": 1.1779636414439672e-06,
"loss": 0.0147,
"step": 3780
},
{
"epoch": 2.4048223350253806,
"grad_norm": 0.40047794580459595,
"learning_rate": 1.1542667329801998e-06,
"loss": 0.011,
"step": 3790
},
{
"epoch": 2.4111675126903553,
"grad_norm": 0.7459643483161926,
"learning_rate": 1.130779466279166e-06,
"loss": 0.0126,
"step": 3800
},
{
"epoch": 2.41751269035533,
"grad_norm": 0.6922224760055542,
"learning_rate": 1.107503121697997e-06,
"loss": 0.0163,
"step": 3810
},
{
"epoch": 2.4238578680203045,
"grad_norm": 1.863350749015808,
"learning_rate": 1.0844389680958533e-06,
"loss": 0.0194,
"step": 3820
},
{
"epoch": 2.4302030456852792,
"grad_norm": 0.29856589436531067,
"learning_rate": 1.0615882627647766e-06,
"loss": 0.0155,
"step": 3830
},
{
"epoch": 2.436548223350254,
"grad_norm": 0.377093642950058,
"learning_rate": 1.0389522513611372e-06,
"loss": 0.015,
"step": 3840
},
{
"epoch": 2.4428934010152283,
"grad_norm": 0.5333195924758911,
"learning_rate": 1.0165321678377332e-06,
"loss": 0.0137,
"step": 3850
},
{
"epoch": 2.449238578680203,
"grad_norm": 0.32329970598220825,
"learning_rate": 9.943292343765293e-07,
"loss": 0.0084,
"step": 3860
},
{
"epoch": 2.4555837563451774,
"grad_norm": 0.3231019377708435,
"learning_rate": 9.723446613220249e-07,
"loss": 0.0126,
"step": 3870
},
{
"epoch": 2.4619289340101522,
"grad_norm": 0.6870127320289612,
"learning_rate": 9.505796471152783e-07,
"loss": 0.0137,
"step": 3880
},
{
"epoch": 2.468274111675127,
"grad_norm": 0.6023297309875488,
"learning_rate": 9.290353782285766e-07,
"loss": 0.0148,
"step": 3890
},
{
"epoch": 2.4746192893401013,
"grad_norm": 0.46455860137939453,
"learning_rate": 9.077130291007553e-07,
"loss": 0.022,
"step": 3900
},
{
"epoch": 2.480964467005076,
"grad_norm": 0.5320664048194885,
"learning_rate": 8.86613762073183e-07,
"loss": 0.0096,
"step": 3910
},
{
"epoch": 2.487309644670051,
"grad_norm": 0.6012682914733887,
"learning_rate": 8.657387273263895e-07,
"loss": 0.0099,
"step": 3920
},
{
"epoch": 2.4936548223350252,
"grad_norm": 0.8949501514434814,
"learning_rate": 8.450890628173725e-07,
"loss": 0.0111,
"step": 3930
},
{
"epoch": 2.5,
"grad_norm": 0.8802683353424072,
"learning_rate": 8.246658942175611e-07,
"loss": 0.0143,
"step": 3940
},
{
"epoch": 2.5063451776649748,
"grad_norm": 0.9922573566436768,
"learning_rate": 8.04470334851456e-07,
"loss": 0.0234,
"step": 3950
},
{
"epoch": 2.512690355329949,
"grad_norm": 0.23940332233905792,
"learning_rate": 7.845034856359368e-07,
"loss": 0.011,
"step": 3960
},
{
"epoch": 2.519035532994924,
"grad_norm": 0.2019755095243454,
"learning_rate": 7.647664350202461e-07,
"loss": 0.0135,
"step": 3970
},
{
"epoch": 2.525380710659898,
"grad_norm": 0.17184686660766602,
"learning_rate": 7.452602589266583e-07,
"loss": 0.0074,
"step": 3980
},
{
"epoch": 2.531725888324873,
"grad_norm": 0.8647210597991943,
"learning_rate": 7.259860206918268e-07,
"loss": 0.0101,
"step": 3990
},
{
"epoch": 2.5380710659898478,
"grad_norm": 0.9781297445297241,
"learning_rate": 7.069447710088167e-07,
"loss": 0.0147,
"step": 4000
},
{
"epoch": 2.5444162436548226,
"grad_norm": 0.7230397462844849,
"learning_rate": 6.881375478698332e-07,
"loss": 0.0159,
"step": 4010
},
{
"epoch": 2.550761421319797,
"grad_norm": 1.1674317121505737,
"learning_rate": 6.695653765096327e-07,
"loss": 0.0125,
"step": 4020
},
{
"epoch": 2.5571065989847717,
"grad_norm": 0.38593119382858276,
"learning_rate": 6.512292693496353e-07,
"loss": 0.0071,
"step": 4030
},
{
"epoch": 2.563451776649746,
"grad_norm": 0.3000188171863556,
"learning_rate": 6.331302259427418e-07,
"loss": 0.0086,
"step": 4040
},
{
"epoch": 2.5697969543147208,
"grad_norm": 0.6724553108215332,
"learning_rate": 6.152692329188297e-07,
"loss": 0.0076,
"step": 4050
},
{
"epoch": 2.5761421319796955,
"grad_norm": 1.0246587991714478,
"learning_rate": 5.976472639309888e-07,
"loss": 0.02,
"step": 4060
},
{
"epoch": 2.5824873096446703,
"grad_norm": 0.5962472558021545,
"learning_rate": 5.802652796024294e-07,
"loss": 0.0208,
"step": 4070
},
{
"epoch": 2.5888324873096447,
"grad_norm": 0.44684454798698425,
"learning_rate": 5.631242274741211e-07,
"loss": 0.0179,
"step": 4080
},
{
"epoch": 2.5951776649746194,
"grad_norm": 0.446123331785202,
"learning_rate": 5.46225041953145e-07,
"loss": 0.0065,
"step": 4090
},
{
"epoch": 2.6015228426395938,
"grad_norm": 0.28516885638237,
"learning_rate": 5.295686442617442e-07,
"loss": 0.0084,
"step": 4100
},
{
"epoch": 2.6078680203045685,
"grad_norm": 0.42138996720314026,
"learning_rate": 5.131559423871191e-07,
"loss": 0.0119,
"step": 4110
},
{
"epoch": 2.6142131979695433,
"grad_norm": 0.857070803642273,
"learning_rate": 4.969878310319204e-07,
"loss": 0.0116,
"step": 4120
},
{
"epoch": 2.6205583756345177,
"grad_norm": 0.4262557327747345,
"learning_rate": 4.810651915654807e-07,
"loss": 0.013,
"step": 4130
},
{
"epoch": 2.6269035532994924,
"grad_norm": 0.08034439384937286,
"learning_rate": 4.6538889197576985e-07,
"loss": 0.0085,
"step": 4140
},
{
"epoch": 2.6332487309644668,
"grad_norm": 0.4999110698699951,
"learning_rate": 4.4995978682207396e-07,
"loss": 0.0104,
"step": 4150
},
{
"epoch": 2.6395939086294415,
"grad_norm": 0.47301802039146423,
"learning_rate": 4.347787171884149e-07,
"loss": 0.013,
"step": 4160
},
{
"epoch": 2.6459390862944163,
"grad_norm": 0.2837192416191101,
"learning_rate": 4.1984651063769864e-07,
"loss": 0.0123,
"step": 4170
},
{
"epoch": 2.652284263959391,
"grad_norm": 0.4908500611782074,
"learning_rate": 4.0516398116660196e-07,
"loss": 0.0137,
"step": 4180
},
{
"epoch": 2.6586294416243654,
"grad_norm": 0.38162919878959656,
"learning_rate": 3.907319291612027e-07,
"loss": 0.0108,
"step": 4190
},
{
"epoch": 2.66497461928934,
"grad_norm": 0.9448516368865967,
"learning_rate": 3.765511413533429e-07,
"loss": 0.0139,
"step": 4200
},
{
"epoch": 2.6713197969543145,
"grad_norm": 0.4047912359237671,
"learning_rate": 3.626223907777482e-07,
"loss": 0.0147,
"step": 4210
},
{
"epoch": 2.6776649746192893,
"grad_norm": 0.1890551596879959,
"learning_rate": 3.489464367298795e-07,
"loss": 0.0135,
"step": 4220
},
{
"epoch": 2.684010152284264,
"grad_norm": 0.3367404341697693,
"learning_rate": 3.3552402472454893e-07,
"loss": 0.017,
"step": 4230
},
{
"epoch": 2.6903553299492384,
"grad_norm": 0.5344458818435669,
"learning_rate": 3.2235588645527893e-07,
"loss": 0.0201,
"step": 4240
},
{
"epoch": 2.696700507614213,
"grad_norm": 0.8313795328140259,
"learning_rate": 3.094427397544103e-07,
"loss": 0.0162,
"step": 4250
},
{
"epoch": 2.703045685279188,
"grad_norm": 0.35280096530914307,
"learning_rate": 2.967852885539768e-07,
"loss": 0.0064,
"step": 4260
},
{
"epoch": 2.7093908629441623,
"grad_norm": 0.6538042426109314,
"learning_rate": 2.843842228473293e-07,
"loss": 0.0145,
"step": 4270
},
{
"epoch": 2.715736040609137,
"grad_norm": 0.6905611753463745,
"learning_rate": 2.7224021865151996e-07,
"loss": 0.0128,
"step": 4280
},
{
"epoch": 2.722081218274112,
"grad_norm": 0.5076076984405518,
"learning_rate": 2.603539379704567e-07,
"loss": 0.0171,
"step": 4290
},
{
"epoch": 2.728426395939086,
"grad_norm": 0.6590428352355957,
"learning_rate": 2.4872602875881004e-07,
"loss": 0.0077,
"step": 4300
},
{
"epoch": 2.734771573604061,
"grad_norm": 0.3470360338687897,
"learning_rate": 2.373571248866946e-07,
"loss": 0.0115,
"step": 4310
},
{
"epoch": 2.7411167512690353,
"grad_norm": 0.5780541896820068,
"learning_rate": 2.262478461051132e-07,
"loss": 0.0191,
"step": 4320
},
{
"epoch": 2.74746192893401,
"grad_norm": 1.4629708528518677,
"learning_rate": 2.153987980121719e-07,
"loss": 0.0189,
"step": 4330
},
{
"epoch": 2.753807106598985,
"grad_norm": 1.3563203811645508,
"learning_rate": 2.0481057202006992e-07,
"loss": 0.0116,
"step": 4340
},
{
"epoch": 2.7601522842639596,
"grad_norm": 0.4442911744117737,
"learning_rate": 1.9448374532285707e-07,
"loss": 0.0153,
"step": 4350
},
{
"epoch": 2.766497461928934,
"grad_norm": 0.26719120144844055,
"learning_rate": 1.8441888086497162e-07,
"loss": 0.0156,
"step": 4360
},
{
"epoch": 2.7728426395939088,
"grad_norm": 0.4203988015651703,
"learning_rate": 1.7461652731055157e-07,
"loss": 0.0162,
"step": 4370
},
{
"epoch": 2.779187817258883,
"grad_norm": 1.0901730060577393,
"learning_rate": 1.650772190135247e-07,
"loss": 0.0131,
"step": 4380
},
{
"epoch": 2.785532994923858,
"grad_norm": 0.3400239944458008,
"learning_rate": 1.5580147598848018e-07,
"loss": 0.0141,
"step": 4390
},
{
"epoch": 2.7918781725888326,
"grad_norm": 0.38450250029563904,
"learning_rate": 1.4678980388232233e-07,
"loss": 0.0099,
"step": 4400
},
{
"epoch": 2.798223350253807,
"grad_norm": 0.4401623606681824,
"learning_rate": 1.3804269394670388e-07,
"loss": 0.0166,
"step": 4410
},
{
"epoch": 2.8045685279187818,
"grad_norm": 0.765143871307373,
"learning_rate": 1.295606230112495e-07,
"loss": 0.015,
"step": 4420
},
{
"epoch": 2.810913705583756,
"grad_norm": 0.47553789615631104,
"learning_rate": 1.2134405345755773e-07,
"loss": 0.0104,
"step": 4430
},
{
"epoch": 2.817258883248731,
"grad_norm": 1.053678274154663,
"learning_rate": 1.1339343319400175e-07,
"loss": 0.0085,
"step": 4440
},
{
"epoch": 2.8236040609137056,
"grad_norm": 0.5694789290428162,
"learning_rate": 1.057091956313061e-07,
"loss": 0.0131,
"step": 4450
},
{
"epoch": 2.8299492385786804,
"grad_norm": 0.41042569279670715,
"learning_rate": 9.829175965892557e-08,
"loss": 0.0162,
"step": 4460
},
{
"epoch": 2.8362944162436547,
"grad_norm": 0.30753186345100403,
"learning_rate": 9.114152962220734e-08,
"loss": 0.0085,
"step": 4470
},
{
"epoch": 2.8426395939086295,
"grad_norm": 1.1423698663711548,
"learning_rate": 8.425889530034815e-08,
"loss": 0.0111,
"step": 4480
},
{
"epoch": 2.848984771573604,
"grad_norm": 0.9772459864616394,
"learning_rate": 7.764423188515058e-08,
"loss": 0.0137,
"step": 4490
},
{
"epoch": 2.8553299492385786,
"grad_norm": 0.2530859112739563,
"learning_rate": 7.129789996056568e-08,
"loss": 0.0148,
"step": 4500
}
],
"logging_steps": 10,
"max_steps": 4728,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 391740982493184.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}