{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 351,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002849002849002849,
"grad_norm": 1.4838141202926636,
"learning_rate": 1e-05,
"loss": 2.366,
"step": 1
},
{
"epoch": 0.005698005698005698,
"grad_norm": 1.4262256622314453,
"learning_rate": 9.971509971509972e-06,
"loss": 2.4139,
"step": 2
},
{
"epoch": 0.008547008547008548,
"grad_norm": 1.3603845834732056,
"learning_rate": 9.943019943019944e-06,
"loss": 2.2811,
"step": 3
},
{
"epoch": 0.011396011396011397,
"grad_norm": 1.307405710220337,
"learning_rate": 9.914529914529915e-06,
"loss": 2.2546,
"step": 4
},
{
"epoch": 0.014245014245014245,
"grad_norm": 1.2675021886825562,
"learning_rate": 9.886039886039887e-06,
"loss": 2.2322,
"step": 5
},
{
"epoch": 0.017094017094017096,
"grad_norm": 1.2081060409545898,
"learning_rate": 9.857549857549858e-06,
"loss": 2.2255,
"step": 6
},
{
"epoch": 0.019943019943019943,
"grad_norm": 1.104535698890686,
"learning_rate": 9.82905982905983e-06,
"loss": 2.2139,
"step": 7
},
{
"epoch": 0.022792022792022793,
"grad_norm": 1.0799970626831055,
"learning_rate": 9.800569800569801e-06,
"loss": 2.2049,
"step": 8
},
{
"epoch": 0.02564102564102564,
"grad_norm": 0.9630372524261475,
"learning_rate": 9.772079772079773e-06,
"loss": 2.1045,
"step": 9
},
{
"epoch": 0.02849002849002849,
"grad_norm": 0.9740710854530334,
"learning_rate": 9.743589743589744e-06,
"loss": 2.1669,
"step": 10
},
{
"epoch": 0.03133903133903134,
"grad_norm": 0.9871430397033691,
"learning_rate": 9.715099715099716e-06,
"loss": 2.1666,
"step": 11
},
{
"epoch": 0.03418803418803419,
"grad_norm": 0.9979017376899719,
"learning_rate": 9.686609686609687e-06,
"loss": 2.2347,
"step": 12
},
{
"epoch": 0.037037037037037035,
"grad_norm": 0.921946108341217,
"learning_rate": 9.658119658119659e-06,
"loss": 2.1309,
"step": 13
},
{
"epoch": 0.039886039886039885,
"grad_norm": 0.9126842617988586,
"learning_rate": 9.62962962962963e-06,
"loss": 2.0519,
"step": 14
},
{
"epoch": 0.042735042735042736,
"grad_norm": 0.8587276935577393,
"learning_rate": 9.601139601139601e-06,
"loss": 2.0816,
"step": 15
},
{
"epoch": 0.045584045584045586,
"grad_norm": 0.8564528822898865,
"learning_rate": 9.572649572649575e-06,
"loss": 2.0918,
"step": 16
},
{
"epoch": 0.04843304843304843,
"grad_norm": 0.8116742968559265,
"learning_rate": 9.544159544159544e-06,
"loss": 1.9883,
"step": 17
},
{
"epoch": 0.05128205128205128,
"grad_norm": 0.7653638124465942,
"learning_rate": 9.515669515669516e-06,
"loss": 2.008,
"step": 18
},
{
"epoch": 0.05413105413105413,
"grad_norm": 0.758541464805603,
"learning_rate": 9.487179487179487e-06,
"loss": 2.0232,
"step": 19
},
{
"epoch": 0.05698005698005698,
"grad_norm": 0.7756889462471008,
"learning_rate": 9.458689458689459e-06,
"loss": 2.0479,
"step": 20
},
{
"epoch": 0.05982905982905983,
"grad_norm": 0.8094788789749146,
"learning_rate": 9.430199430199432e-06,
"loss": 2.0904,
"step": 21
},
{
"epoch": 0.06267806267806268,
"grad_norm": 0.6886956691741943,
"learning_rate": 9.401709401709402e-06,
"loss": 1.9566,
"step": 22
},
{
"epoch": 0.06552706552706553,
"grad_norm": 0.6763948798179626,
"learning_rate": 9.373219373219375e-06,
"loss": 1.9283,
"step": 23
},
{
"epoch": 0.06837606837606838,
"grad_norm": 0.6754049062728882,
"learning_rate": 9.344729344729345e-06,
"loss": 1.9353,
"step": 24
},
{
"epoch": 0.07122507122507123,
"grad_norm": 0.6518625617027283,
"learning_rate": 9.316239316239318e-06,
"loss": 1.9165,
"step": 25
},
{
"epoch": 0.07407407407407407,
"grad_norm": 0.6429179906845093,
"learning_rate": 9.287749287749288e-06,
"loss": 1.9261,
"step": 26
},
{
"epoch": 0.07692307692307693,
"grad_norm": 0.6808933019638062,
"learning_rate": 9.25925925925926e-06,
"loss": 1.9784,
"step": 27
},
{
"epoch": 0.07977207977207977,
"grad_norm": 0.7099093198776245,
"learning_rate": 9.230769230769232e-06,
"loss": 1.9884,
"step": 28
},
{
"epoch": 0.08262108262108261,
"grad_norm": 0.6004197597503662,
"learning_rate": 9.202279202279202e-06,
"loss": 1.8912,
"step": 29
},
{
"epoch": 0.08547008547008547,
"grad_norm": 0.5550093650817871,
"learning_rate": 9.173789173789175e-06,
"loss": 1.7948,
"step": 30
},
{
"epoch": 0.08831908831908832,
"grad_norm": 0.6177744269371033,
"learning_rate": 9.145299145299145e-06,
"loss": 1.9004,
"step": 31
},
{
"epoch": 0.09116809116809117,
"grad_norm": 0.5736203789710999,
"learning_rate": 9.116809116809118e-06,
"loss": 1.8579,
"step": 32
},
{
"epoch": 0.09401709401709402,
"grad_norm": 0.5455344915390015,
"learning_rate": 9.088319088319088e-06,
"loss": 1.8232,
"step": 33
},
{
"epoch": 0.09686609686609686,
"grad_norm": 0.5457695126533508,
"learning_rate": 9.059829059829061e-06,
"loss": 1.8387,
"step": 34
},
{
"epoch": 0.09971509971509972,
"grad_norm": 0.6495256423950195,
"learning_rate": 9.031339031339033e-06,
"loss": 1.9007,
"step": 35
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.553978443145752,
"learning_rate": 9.002849002849004e-06,
"loss": 1.7967,
"step": 36
},
{
"epoch": 0.10541310541310542,
"grad_norm": 0.6648301482200623,
"learning_rate": 8.974358974358976e-06,
"loss": 1.8203,
"step": 37
},
{
"epoch": 0.10826210826210826,
"grad_norm": 0.604141354560852,
"learning_rate": 8.945868945868947e-06,
"loss": 1.8569,
"step": 38
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.5134737491607666,
"learning_rate": 8.917378917378919e-06,
"loss": 1.7601,
"step": 39
},
{
"epoch": 0.11396011396011396,
"grad_norm": 0.5309232473373413,
"learning_rate": 8.888888888888888e-06,
"loss": 1.8382,
"step": 40
},
{
"epoch": 0.1168091168091168,
"grad_norm": 0.5077832937240601,
"learning_rate": 8.860398860398861e-06,
"loss": 1.7595,
"step": 41
},
{
"epoch": 0.11965811965811966,
"grad_norm": 0.511060357093811,
"learning_rate": 8.831908831908833e-06,
"loss": 1.7981,
"step": 42
},
{
"epoch": 0.1225071225071225,
"grad_norm": 0.48027244210243225,
"learning_rate": 8.803418803418804e-06,
"loss": 1.726,
"step": 43
},
{
"epoch": 0.12535612535612536,
"grad_norm": 0.4738457202911377,
"learning_rate": 8.774928774928776e-06,
"loss": 1.7552,
"step": 44
},
{
"epoch": 0.1282051282051282,
"grad_norm": 0.4702482223510742,
"learning_rate": 8.746438746438747e-06,
"loss": 1.7324,
"step": 45
},
{
"epoch": 0.13105413105413105,
"grad_norm": 0.48187750577926636,
"learning_rate": 8.717948717948719e-06,
"loss": 1.7393,
"step": 46
},
{
"epoch": 0.1339031339031339,
"grad_norm": 0.46382951736450195,
"learning_rate": 8.68945868945869e-06,
"loss": 1.7103,
"step": 47
},
{
"epoch": 0.13675213675213677,
"grad_norm": 0.5777999758720398,
"learning_rate": 8.660968660968662e-06,
"loss": 1.7991,
"step": 48
},
{
"epoch": 0.1396011396011396,
"grad_norm": 0.46543341875076294,
"learning_rate": 8.632478632478633e-06,
"loss": 1.7483,
"step": 49
},
{
"epoch": 0.14245014245014245,
"grad_norm": 0.5707411766052246,
"learning_rate": 8.603988603988605e-06,
"loss": 1.7243,
"step": 50
},
{
"epoch": 0.1452991452991453,
"grad_norm": 0.5121602416038513,
"learning_rate": 8.575498575498576e-06,
"loss": 1.7487,
"step": 51
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.45368504524230957,
"learning_rate": 8.547008547008548e-06,
"loss": 1.7515,
"step": 52
},
{
"epoch": 0.150997150997151,
"grad_norm": 0.44115832448005676,
"learning_rate": 8.518518518518519e-06,
"loss": 1.7165,
"step": 53
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.43293115496635437,
"learning_rate": 8.49002849002849e-06,
"loss": 1.7247,
"step": 54
},
{
"epoch": 0.15669515669515668,
"grad_norm": 0.4369884431362152,
"learning_rate": 8.461538461538462e-06,
"loss": 1.7046,
"step": 55
},
{
"epoch": 0.15954415954415954,
"grad_norm": 0.44155994057655334,
"learning_rate": 8.433048433048434e-06,
"loss": 1.7397,
"step": 56
},
{
"epoch": 0.1623931623931624,
"grad_norm": 0.4158068895339966,
"learning_rate": 8.404558404558405e-06,
"loss": 1.7038,
"step": 57
},
{
"epoch": 0.16524216524216523,
"grad_norm": 0.4057186245918274,
"learning_rate": 8.376068376068377e-06,
"loss": 1.6994,
"step": 58
},
{
"epoch": 0.16809116809116809,
"grad_norm": 0.4907611906528473,
"learning_rate": 8.347578347578348e-06,
"loss": 1.7246,
"step": 59
},
{
"epoch": 0.17094017094017094,
"grad_norm": 0.4695189595222473,
"learning_rate": 8.31908831908832e-06,
"loss": 1.7315,
"step": 60
},
{
"epoch": 0.1737891737891738,
"grad_norm": 0.40382280945777893,
"learning_rate": 8.290598290598293e-06,
"loss": 1.6904,
"step": 61
},
{
"epoch": 0.17663817663817663,
"grad_norm": 0.42537087202072144,
"learning_rate": 8.262108262108262e-06,
"loss": 1.7413,
"step": 62
},
{
"epoch": 0.1794871794871795,
"grad_norm": 0.45500096678733826,
"learning_rate": 8.233618233618234e-06,
"loss": 1.687,
"step": 63
},
{
"epoch": 0.18233618233618235,
"grad_norm": 0.5165032148361206,
"learning_rate": 8.205128205128205e-06,
"loss": 1.6565,
"step": 64
},
{
"epoch": 0.18518518518518517,
"grad_norm": 0.4045052230358124,
"learning_rate": 8.176638176638177e-06,
"loss": 1.679,
"step": 65
},
{
"epoch": 0.18803418803418803,
"grad_norm": 0.5608129501342773,
"learning_rate": 8.148148148148148e-06,
"loss": 1.6782,
"step": 66
},
{
"epoch": 0.1908831908831909,
"grad_norm": 0.42527124285697937,
"learning_rate": 8.11965811965812e-06,
"loss": 1.6164,
"step": 67
},
{
"epoch": 0.19373219373219372,
"grad_norm": 0.39863091707229614,
"learning_rate": 8.091168091168093e-06,
"loss": 1.6564,
"step": 68
},
{
"epoch": 0.19658119658119658,
"grad_norm": 0.40516364574432373,
"learning_rate": 8.062678062678063e-06,
"loss": 1.5941,
"step": 69
},
{
"epoch": 0.19943019943019943,
"grad_norm": 0.42938536405563354,
"learning_rate": 8.034188034188036e-06,
"loss": 1.6471,
"step": 70
},
{
"epoch": 0.2022792022792023,
"grad_norm": 0.3754700720310211,
"learning_rate": 8.005698005698006e-06,
"loss": 1.637,
"step": 71
},
{
"epoch": 0.20512820512820512,
"grad_norm": 0.4259706735610962,
"learning_rate": 7.977207977207979e-06,
"loss": 1.6063,
"step": 72
},
{
"epoch": 0.20797720797720798,
"grad_norm": 0.41146427392959595,
"learning_rate": 7.948717948717949e-06,
"loss": 1.6559,
"step": 73
},
{
"epoch": 0.21082621082621084,
"grad_norm": 0.3858882486820221,
"learning_rate": 7.92022792022792e-06,
"loss": 1.6355,
"step": 74
},
{
"epoch": 0.21367521367521367,
"grad_norm": 0.46363890171051025,
"learning_rate": 7.891737891737893e-06,
"loss": 1.7049,
"step": 75
},
{
"epoch": 0.21652421652421652,
"grad_norm": 0.40698277950286865,
"learning_rate": 7.863247863247863e-06,
"loss": 1.6436,
"step": 76
},
{
"epoch": 0.21937321937321938,
"grad_norm": 0.3834919035434723,
"learning_rate": 7.834757834757836e-06,
"loss": 1.5902,
"step": 77
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.3849916160106659,
"learning_rate": 7.806267806267806e-06,
"loss": 1.6127,
"step": 78
},
{
"epoch": 0.22507122507122507,
"grad_norm": 0.6278889179229736,
"learning_rate": 7.77777777777778e-06,
"loss": 1.6641,
"step": 79
},
{
"epoch": 0.22792022792022792,
"grad_norm": 0.4905427396297455,
"learning_rate": 7.749287749287749e-06,
"loss": 1.5825,
"step": 80
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.4097338318824768,
"learning_rate": 7.720797720797722e-06,
"loss": 1.5964,
"step": 81
},
{
"epoch": 0.2336182336182336,
"grad_norm": 0.3952528238296509,
"learning_rate": 7.692307692307694e-06,
"loss": 1.6584,
"step": 82
},
{
"epoch": 0.23646723646723647,
"grad_norm": 0.38913223147392273,
"learning_rate": 7.663817663817665e-06,
"loss": 1.5694,
"step": 83
},
{
"epoch": 0.23931623931623933,
"grad_norm": 0.391777902841568,
"learning_rate": 7.635327635327637e-06,
"loss": 1.6378,
"step": 84
},
{
"epoch": 0.24216524216524216,
"grad_norm": 0.41954416036605835,
"learning_rate": 7.606837606837607e-06,
"loss": 1.6073,
"step": 85
},
{
"epoch": 0.245014245014245,
"grad_norm": 0.3974544107913971,
"learning_rate": 7.578347578347579e-06,
"loss": 1.6005,
"step": 86
},
{
"epoch": 0.24786324786324787,
"grad_norm": 0.43366730213165283,
"learning_rate": 7.54985754985755e-06,
"loss": 1.5905,
"step": 87
},
{
"epoch": 0.25071225071225073,
"grad_norm": 0.37673377990722656,
"learning_rate": 7.521367521367522e-06,
"loss": 1.562,
"step": 88
},
{
"epoch": 0.2535612535612536,
"grad_norm": 0.48865458369255066,
"learning_rate": 7.492877492877494e-06,
"loss": 1.5934,
"step": 89
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.38269999623298645,
"learning_rate": 7.4643874643874645e-06,
"loss": 1.6024,
"step": 90
},
{
"epoch": 0.25925925925925924,
"grad_norm": 0.40311211347579956,
"learning_rate": 7.435897435897437e-06,
"loss": 1.6263,
"step": 91
},
{
"epoch": 0.2621082621082621,
"grad_norm": 0.3799367845058441,
"learning_rate": 7.4074074074074075e-06,
"loss": 1.599,
"step": 92
},
{
"epoch": 0.26495726495726496,
"grad_norm": 0.39559420943260193,
"learning_rate": 7.37891737891738e-06,
"loss": 1.6103,
"step": 93
},
{
"epoch": 0.2678062678062678,
"grad_norm": 0.37981730699539185,
"learning_rate": 7.350427350427351e-06,
"loss": 1.598,
"step": 94
},
{
"epoch": 0.2706552706552707,
"grad_norm": 0.3881866931915283,
"learning_rate": 7.321937321937323e-06,
"loss": 1.5843,
"step": 95
},
{
"epoch": 0.27350427350427353,
"grad_norm": 0.3740154504776001,
"learning_rate": 7.293447293447294e-06,
"loss": 1.6069,
"step": 96
},
{
"epoch": 0.27635327635327633,
"grad_norm": 0.3980708718299866,
"learning_rate": 7.264957264957266e-06,
"loss": 1.5667,
"step": 97
},
{
"epoch": 0.2792022792022792,
"grad_norm": 0.37536391615867615,
"learning_rate": 7.236467236467237e-06,
"loss": 1.5926,
"step": 98
},
{
"epoch": 0.28205128205128205,
"grad_norm": 0.4172308146953583,
"learning_rate": 7.207977207977208e-06,
"loss": 1.5371,
"step": 99
},
{
"epoch": 0.2849002849002849,
"grad_norm": 0.39715775847435,
"learning_rate": 7.17948717948718e-06,
"loss": 1.5931,
"step": 100
},
{
"epoch": 0.28774928774928776,
"grad_norm": 0.4845562279224396,
"learning_rate": 7.1509971509971524e-06,
"loss": 1.6267,
"step": 101
},
{
"epoch": 0.2905982905982906,
"grad_norm": 0.38772156834602356,
"learning_rate": 7.122507122507123e-06,
"loss": 1.5949,
"step": 102
},
{
"epoch": 0.2934472934472934,
"grad_norm": 0.3815441429615021,
"learning_rate": 7.0940170940170945e-06,
"loss": 1.5758,
"step": 103
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.4964717626571655,
"learning_rate": 7.065527065527066e-06,
"loss": 1.5653,
"step": 104
},
{
"epoch": 0.29914529914529914,
"grad_norm": 0.378212571144104,
"learning_rate": 7.0370370370370375e-06,
"loss": 1.536,
"step": 105
},
{
"epoch": 0.301994301994302,
"grad_norm": 0.36918291449546814,
"learning_rate": 7.008547008547009e-06,
"loss": 1.555,
"step": 106
},
{
"epoch": 0.30484330484330485,
"grad_norm": 0.39171653985977173,
"learning_rate": 6.9800569800569804e-06,
"loss": 1.6057,
"step": 107
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.5356259942054749,
"learning_rate": 6.951566951566953e-06,
"loss": 1.5825,
"step": 108
},
{
"epoch": 0.31054131054131057,
"grad_norm": 0.40925300121307373,
"learning_rate": 6.923076923076923e-06,
"loss": 1.6084,
"step": 109
},
{
"epoch": 0.31339031339031337,
"grad_norm": 0.3943912386894226,
"learning_rate": 6.894586894586896e-06,
"loss": 1.5231,
"step": 110
},
{
"epoch": 0.3162393162393162,
"grad_norm": 0.40087035298347473,
"learning_rate": 6.866096866096866e-06,
"loss": 1.5833,
"step": 111
},
{
"epoch": 0.3190883190883191,
"grad_norm": 0.3822116553783417,
"learning_rate": 6.837606837606839e-06,
"loss": 1.5477,
"step": 112
},
{
"epoch": 0.32193732193732194,
"grad_norm": 0.39919513463974,
"learning_rate": 6.809116809116809e-06,
"loss": 1.555,
"step": 113
},
{
"epoch": 0.3247863247863248,
"grad_norm": 0.39128148555755615,
"learning_rate": 6.780626780626781e-06,
"loss": 1.5886,
"step": 114
},
{
"epoch": 0.32763532763532766,
"grad_norm": 0.3694957196712494,
"learning_rate": 6.752136752136753e-06,
"loss": 1.4937,
"step": 115
},
{
"epoch": 0.33048433048433046,
"grad_norm": 0.4147852659225464,
"learning_rate": 6.723646723646724e-06,
"loss": 1.5697,
"step": 116
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.4091155230998993,
"learning_rate": 6.695156695156696e-06,
"loss": 1.511,
"step": 117
},
{
"epoch": 0.33618233618233617,
"grad_norm": 0.3905634582042694,
"learning_rate": 6.666666666666667e-06,
"loss": 1.5462,
"step": 118
},
{
"epoch": 0.33903133903133903,
"grad_norm": 0.4323817491531372,
"learning_rate": 6.638176638176639e-06,
"loss": 1.5459,
"step": 119
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.38668230175971985,
"learning_rate": 6.60968660968661e-06,
"loss": 1.5664,
"step": 120
},
{
"epoch": 0.34472934472934474,
"grad_norm": 0.4649519622325897,
"learning_rate": 6.581196581196582e-06,
"loss": 1.5827,
"step": 121
},
{
"epoch": 0.3475783475783476,
"grad_norm": 0.4004313051700592,
"learning_rate": 6.552706552706553e-06,
"loss": 1.4653,
"step": 122
},
{
"epoch": 0.3504273504273504,
"grad_norm": 0.3949541449546814,
"learning_rate": 6.524216524216525e-06,
"loss": 1.5285,
"step": 123
},
{
"epoch": 0.35327635327635326,
"grad_norm": 0.6077877283096313,
"learning_rate": 6.495726495726496e-06,
"loss": 1.5648,
"step": 124
},
{
"epoch": 0.3561253561253561,
"grad_norm": 0.5344558358192444,
"learning_rate": 6.467236467236467e-06,
"loss": 1.5311,
"step": 125
},
{
"epoch": 0.358974358974359,
"grad_norm": 0.38816729187965393,
"learning_rate": 6.438746438746439e-06,
"loss": 1.5139,
"step": 126
},
{
"epoch": 0.36182336182336183,
"grad_norm": 0.3926841914653778,
"learning_rate": 6.410256410256412e-06,
"loss": 1.5277,
"step": 127
},
{
"epoch": 0.3646723646723647,
"grad_norm": 0.40280261635780334,
"learning_rate": 6.381766381766382e-06,
"loss": 1.553,
"step": 128
},
{
"epoch": 0.36752136752136755,
"grad_norm": 0.38559049367904663,
"learning_rate": 6.3532763532763546e-06,
"loss": 1.5269,
"step": 129
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.38594579696655273,
"learning_rate": 6.324786324786325e-06,
"loss": 1.5185,
"step": 130
},
{
"epoch": 0.3732193732193732,
"grad_norm": 0.372689425945282,
"learning_rate": 6.296296296296297e-06,
"loss": 1.5058,
"step": 131
},
{
"epoch": 0.37606837606837606,
"grad_norm": 0.3884972333908081,
"learning_rate": 6.267806267806268e-06,
"loss": 1.5255,
"step": 132
},
{
"epoch": 0.3789173789173789,
"grad_norm": 0.40464359521865845,
"learning_rate": 6.23931623931624e-06,
"loss": 1.5212,
"step": 133
},
{
"epoch": 0.3817663817663818,
"grad_norm": 0.4075316786766052,
"learning_rate": 6.210826210826212e-06,
"loss": 1.4987,
"step": 134
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.41846784949302673,
"learning_rate": 6.1823361823361825e-06,
"loss": 1.5409,
"step": 135
},
{
"epoch": 0.38746438746438744,
"grad_norm": 0.4159785509109497,
"learning_rate": 6.153846153846155e-06,
"loss": 1.5393,
"step": 136
},
{
"epoch": 0.3903133903133903,
"grad_norm": 0.3839842975139618,
"learning_rate": 6.1253561253561255e-06,
"loss": 1.5139,
"step": 137
},
{
"epoch": 0.39316239316239315,
"grad_norm": 0.5279687643051147,
"learning_rate": 6.096866096866098e-06,
"loss": 1.5747,
"step": 138
},
{
"epoch": 0.396011396011396,
"grad_norm": 0.40492990612983704,
"learning_rate": 6.0683760683760684e-06,
"loss": 1.453,
"step": 139
},
{
"epoch": 0.39886039886039887,
"grad_norm": 0.41720351576805115,
"learning_rate": 6.039886039886041e-06,
"loss": 1.4864,
"step": 140
},
{
"epoch": 0.4017094017094017,
"grad_norm": 0.3866989016532898,
"learning_rate": 6.011396011396012e-06,
"loss": 1.4723,
"step": 141
},
{
"epoch": 0.4045584045584046,
"grad_norm": 0.38849347829818726,
"learning_rate": 5.982905982905983e-06,
"loss": 1.4842,
"step": 142
},
{
"epoch": 0.4074074074074074,
"grad_norm": 0.5428235530853271,
"learning_rate": 5.954415954415955e-06,
"loss": 1.5338,
"step": 143
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.3945627808570862,
"learning_rate": 5.925925925925926e-06,
"loss": 1.528,
"step": 144
},
{
"epoch": 0.4131054131054131,
"grad_norm": 0.3996782898902893,
"learning_rate": 5.897435897435898e-06,
"loss": 1.5212,
"step": 145
},
{
"epoch": 0.41595441595441596,
"grad_norm": 0.4091893136501312,
"learning_rate": 5.868945868945869e-06,
"loss": 1.5419,
"step": 146
},
{
"epoch": 0.4188034188034188,
"grad_norm": 0.3839370906352997,
"learning_rate": 5.840455840455841e-06,
"loss": 1.4778,
"step": 147
},
{
"epoch": 0.42165242165242167,
"grad_norm": 0.3939463496208191,
"learning_rate": 5.8119658119658126e-06,
"loss": 1.4912,
"step": 148
},
{
"epoch": 0.42450142450142453,
"grad_norm": 0.5488878488540649,
"learning_rate": 5.783475783475784e-06,
"loss": 1.459,
"step": 149
},
{
"epoch": 0.42735042735042733,
"grad_norm": 0.6062666773796082,
"learning_rate": 5.7549857549857555e-06,
"loss": 1.4166,
"step": 150
},
{
"epoch": 0.4301994301994302,
"grad_norm": 0.5629584193229675,
"learning_rate": 5.726495726495727e-06,
"loss": 1.4818,
"step": 151
},
{
"epoch": 0.43304843304843305,
"grad_norm": 0.41644972562789917,
"learning_rate": 5.6980056980056985e-06,
"loss": 1.4625,
"step": 152
},
{
"epoch": 0.4358974358974359,
"grad_norm": 0.4007890820503235,
"learning_rate": 5.669515669515669e-06,
"loss": 1.4898,
"step": 153
},
{
"epoch": 0.43874643874643876,
"grad_norm": 0.5906901359558105,
"learning_rate": 5.641025641025641e-06,
"loss": 1.5235,
"step": 154
},
{
"epoch": 0.4415954415954416,
"grad_norm": 0.5607777237892151,
"learning_rate": 5.612535612535614e-06,
"loss": 1.5234,
"step": 155
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.3959032893180847,
"learning_rate": 5.584045584045584e-06,
"loss": 1.4788,
"step": 156
},
{
"epoch": 0.4472934472934473,
"grad_norm": 0.4064564108848572,
"learning_rate": 5.555555555555557e-06,
"loss": 1.503,
"step": 157
},
{
"epoch": 0.45014245014245013,
"grad_norm": 0.39798179268836975,
"learning_rate": 5.527065527065527e-06,
"loss": 1.5001,
"step": 158
},
{
"epoch": 0.452991452991453,
"grad_norm": 0.45741236209869385,
"learning_rate": 5.498575498575499e-06,
"loss": 1.5012,
"step": 159
},
{
"epoch": 0.45584045584045585,
"grad_norm": 0.45142683386802673,
"learning_rate": 5.470085470085471e-06,
"loss": 1.5039,
"step": 160
},
{
"epoch": 0.4586894586894587,
"grad_norm": 0.39934027194976807,
"learning_rate": 5.441595441595442e-06,
"loss": 1.4824,
"step": 161
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.3966750502586365,
"learning_rate": 5.413105413105414e-06,
"loss": 1.4791,
"step": 162
},
{
"epoch": 0.46438746438746437,
"grad_norm": 0.4393257200717926,
"learning_rate": 5.384615384615385e-06,
"loss": 1.449,
"step": 163
},
{
"epoch": 0.4672364672364672,
"grad_norm": 0.42632415890693665,
"learning_rate": 5.356125356125357e-06,
"loss": 1.5248,
"step": 164
},
{
"epoch": 0.4700854700854701,
"grad_norm": 0.41508087515830994,
"learning_rate": 5.327635327635328e-06,
"loss": 1.4873,
"step": 165
},
{
"epoch": 0.47293447293447294,
"grad_norm": 0.4311036467552185,
"learning_rate": 5.2991452991453e-06,
"loss": 1.4981,
"step": 166
},
{
"epoch": 0.4757834757834758,
"grad_norm": 0.39872288703918457,
"learning_rate": 5.270655270655271e-06,
"loss": 1.4905,
"step": 167
},
{
"epoch": 0.47863247863247865,
"grad_norm": 0.412751168012619,
"learning_rate": 5.242165242165243e-06,
"loss": 1.4728,
"step": 168
},
{
"epoch": 0.48148148148148145,
"grad_norm": 0.40860670804977417,
"learning_rate": 5.213675213675214e-06,
"loss": 1.4986,
"step": 169
},
{
"epoch": 0.4843304843304843,
"grad_norm": 0.4355701208114624,
"learning_rate": 5.185185185185185e-06,
"loss": 1.5109,
"step": 170
},
{
"epoch": 0.48717948717948717,
"grad_norm": 0.43395113945007324,
"learning_rate": 5.156695156695157e-06,
"loss": 1.4995,
"step": 171
},
{
"epoch": 0.49002849002849,
"grad_norm": 0.43208786845207214,
"learning_rate": 5.128205128205128e-06,
"loss": 1.4399,
"step": 172
},
{
"epoch": 0.4928774928774929,
"grad_norm": 0.40610820055007935,
"learning_rate": 5.0997150997151e-06,
"loss": 1.4794,
"step": 173
},
{
"epoch": 0.49572649572649574,
"grad_norm": 0.40242278575897217,
"learning_rate": 5.071225071225072e-06,
"loss": 1.4634,
"step": 174
},
{
"epoch": 0.4985754985754986,
"grad_norm": 0.39585167169570923,
"learning_rate": 5.042735042735043e-06,
"loss": 1.4701,
"step": 175
},
{
"epoch": 0.5014245014245015,
"grad_norm": 0.43933385610580444,
"learning_rate": 5.014245014245015e-06,
"loss": 1.4759,
"step": 176
},
{
"epoch": 0.5042735042735043,
"grad_norm": 0.5048877000808716,
"learning_rate": 4.985754985754986e-06,
"loss": 1.4405,
"step": 177
},
{
"epoch": 0.5071225071225072,
"grad_norm": 0.45279544591903687,
"learning_rate": 4.957264957264958e-06,
"loss": 1.5182,
"step": 178
},
{
"epoch": 0.50997150997151,
"grad_norm": 0.40896686911582947,
"learning_rate": 4.928774928774929e-06,
"loss": 1.4857,
"step": 179
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.6420154571533203,
"learning_rate": 4.9002849002849006e-06,
"loss": 1.4331,
"step": 180
},
{
"epoch": 0.5156695156695157,
"grad_norm": 0.45687025785446167,
"learning_rate": 4.871794871794872e-06,
"loss": 1.4716,
"step": 181
},
{
"epoch": 0.5185185185185185,
"grad_norm": 0.4174126088619232,
"learning_rate": 4.8433048433048435e-06,
"loss": 1.4636,
"step": 182
},
{
"epoch": 0.5213675213675214,
"grad_norm": 0.3912286162376404,
"learning_rate": 4.814814814814815e-06,
"loss": 1.4534,
"step": 183
},
{
"epoch": 0.5242165242165242,
"grad_norm": 0.44232121109962463,
"learning_rate": 4.786324786324787e-06,
"loss": 1.4286,
"step": 184
},
{
"epoch": 0.5270655270655271,
"grad_norm": 0.4259029030799866,
"learning_rate": 4.757834757834758e-06,
"loss": 1.5174,
"step": 185
},
{
"epoch": 0.5299145299145299,
"grad_norm": 0.39745402336120605,
"learning_rate": 4.729344729344729e-06,
"loss": 1.4393,
"step": 186
},
{
"epoch": 0.5327635327635327,
"grad_norm": 0.7201390266418457,
"learning_rate": 4.700854700854701e-06,
"loss": 1.5721,
"step": 187
},
{
"epoch": 0.5356125356125356,
"grad_norm": 0.42101916670799255,
"learning_rate": 4.672364672364672e-06,
"loss": 1.4847,
"step": 188
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.4132574498653412,
"learning_rate": 4.643874643874644e-06,
"loss": 1.4632,
"step": 189
},
{
"epoch": 0.5413105413105413,
"grad_norm": 0.44261249899864197,
"learning_rate": 4.615384615384616e-06,
"loss": 1.4767,
"step": 190
},
{
"epoch": 0.5441595441595442,
"grad_norm": 0.4636523723602295,
"learning_rate": 4.586894586894588e-06,
"loss": 1.4868,
"step": 191
},
{
"epoch": 0.5470085470085471,
"grad_norm": 0.4402620792388916,
"learning_rate": 4.558404558404559e-06,
"loss": 1.5096,
"step": 192
},
{
"epoch": 0.5498575498575499,
"grad_norm": 0.46384042501449585,
"learning_rate": 4.5299145299145306e-06,
"loss": 1.5022,
"step": 193
},
{
"epoch": 0.5527065527065527,
"grad_norm": 0.4248226583003998,
"learning_rate": 4.501424501424502e-06,
"loss": 1.4968,
"step": 194
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.41844654083251953,
"learning_rate": 4.4729344729344735e-06,
"loss": 1.4441,
"step": 195
},
{
"epoch": 0.5584045584045584,
"grad_norm": 0.4129433035850525,
"learning_rate": 4.444444444444444e-06,
"loss": 1.4598,
"step": 196
},
{
"epoch": 0.5612535612535613,
"grad_norm": 0.4882029891014099,
"learning_rate": 4.4159544159544165e-06,
"loss": 1.5211,
"step": 197
},
{
"epoch": 0.5641025641025641,
"grad_norm": 0.4571973979473114,
"learning_rate": 4.387464387464388e-06,
"loss": 1.4964,
"step": 198
},
{
"epoch": 0.5669515669515669,
"grad_norm": 0.4153326451778412,
"learning_rate": 4.358974358974359e-06,
"loss": 1.4912,
"step": 199
},
{
"epoch": 0.5698005698005698,
"grad_norm": 0.41810521483421326,
"learning_rate": 4.330484330484331e-06,
"loss": 1.4881,
"step": 200
},
{
"epoch": 0.5726495726495726,
"grad_norm": 0.43121734261512756,
"learning_rate": 4.301994301994302e-06,
"loss": 1.4489,
"step": 201
},
{
"epoch": 0.5754985754985755,
"grad_norm": 0.39392393827438354,
"learning_rate": 4.273504273504274e-06,
"loss": 1.4354,
"step": 202
},
{
"epoch": 0.5783475783475783,
"grad_norm": 0.4206382632255554,
"learning_rate": 4.245014245014245e-06,
"loss": 1.4294,
"step": 203
},
{
"epoch": 0.5811965811965812,
"grad_norm": 0.7128792405128479,
"learning_rate": 4.216524216524217e-06,
"loss": 1.4796,
"step": 204
},
{
"epoch": 0.584045584045584,
"grad_norm": 0.42449796199798584,
"learning_rate": 4.188034188034188e-06,
"loss": 1.44,
"step": 205
},
{
"epoch": 0.5868945868945868,
"grad_norm": 0.40819981694221497,
"learning_rate": 4.15954415954416e-06,
"loss": 1.4674,
"step": 206
},
{
"epoch": 0.5897435897435898,
"grad_norm": 0.4191708564758301,
"learning_rate": 4.131054131054131e-06,
"loss": 1.4231,
"step": 207
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.4241287410259247,
"learning_rate": 4.102564102564103e-06,
"loss": 1.4841,
"step": 208
},
{
"epoch": 0.5954415954415955,
"grad_norm": 0.4283653795719147,
"learning_rate": 4.074074074074074e-06,
"loss": 1.4251,
"step": 209
},
{
"epoch": 0.5982905982905983,
"grad_norm": 0.41446876525878906,
"learning_rate": 4.0455840455840465e-06,
"loss": 1.4496,
"step": 210
},
{
"epoch": 0.6011396011396012,
"grad_norm": 0.4163020849227905,
"learning_rate": 4.017094017094018e-06,
"loss": 1.4273,
"step": 211
},
{
"epoch": 0.603988603988604,
"grad_norm": 0.42851346731185913,
"learning_rate": 3.9886039886039894e-06,
"loss": 1.4727,
"step": 212
},
{
"epoch": 0.6068376068376068,
"grad_norm": 0.4239060878753662,
"learning_rate": 3.96011396011396e-06,
"loss": 1.4318,
"step": 213
},
{
"epoch": 0.6096866096866097,
"grad_norm": 0.40873628854751587,
"learning_rate": 3.9316239316239315e-06,
"loss": 1.4548,
"step": 214
},
{
"epoch": 0.6125356125356125,
"grad_norm": 0.45280134677886963,
"learning_rate": 3.903133903133903e-06,
"loss": 1.4932,
"step": 215
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.6247657537460327,
"learning_rate": 3.8746438746438745e-06,
"loss": 1.4499,
"step": 216
},
{
"epoch": 0.6182336182336182,
"grad_norm": 0.4122682511806488,
"learning_rate": 3.846153846153847e-06,
"loss": 1.4218,
"step": 217
},
{
"epoch": 0.6210826210826211,
"grad_norm": 0.40727391839027405,
"learning_rate": 3.817663817663818e-06,
"loss": 1.4726,
"step": 218
},
{
"epoch": 0.6239316239316239,
"grad_norm": 0.4725242555141449,
"learning_rate": 3.7891737891737893e-06,
"loss": 1.4214,
"step": 219
},
{
"epoch": 0.6267806267806267,
"grad_norm": 0.45712363719940186,
"learning_rate": 3.760683760683761e-06,
"loss": 1.4518,
"step": 220
},
{
"epoch": 0.6296296296296297,
"grad_norm": 0.40573611855506897,
"learning_rate": 3.7321937321937323e-06,
"loss": 1.459,
"step": 221
},
{
"epoch": 0.6324786324786325,
"grad_norm": 0.4086320400238037,
"learning_rate": 3.7037037037037037e-06,
"loss": 1.4395,
"step": 222
},
{
"epoch": 0.6353276353276354,
"grad_norm": 0.4158555567264557,
"learning_rate": 3.6752136752136756e-06,
"loss": 1.4436,
"step": 223
},
{
"epoch": 0.6381766381766382,
"grad_norm": 0.5216575264930725,
"learning_rate": 3.646723646723647e-06,
"loss": 1.4659,
"step": 224
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.394228994846344,
"learning_rate": 3.6182336182336186e-06,
"loss": 1.4637,
"step": 225
},
{
"epoch": 0.6438746438746439,
"grad_norm": 0.41643351316452026,
"learning_rate": 3.58974358974359e-06,
"loss": 1.4298,
"step": 226
},
{
"epoch": 0.6467236467236467,
"grad_norm": 0.407087117433548,
"learning_rate": 3.5612535612535615e-06,
"loss": 1.4426,
"step": 227
},
{
"epoch": 0.6495726495726496,
"grad_norm": 0.47986599802970886,
"learning_rate": 3.532763532763533e-06,
"loss": 1.5079,
"step": 228
},
{
"epoch": 0.6524216524216524,
"grad_norm": 0.42481309175491333,
"learning_rate": 3.5042735042735045e-06,
"loss": 1.4422,
"step": 229
},
{
"epoch": 0.6552706552706553,
"grad_norm": 0.43366938829421997,
"learning_rate": 3.4757834757834764e-06,
"loss": 1.467,
"step": 230
},
{
"epoch": 0.6581196581196581,
"grad_norm": 0.5313072204589844,
"learning_rate": 3.447293447293448e-06,
"loss": 1.4382,
"step": 231
},
{
"epoch": 0.6609686609686609,
"grad_norm": 0.40050390362739563,
"learning_rate": 3.4188034188034193e-06,
"loss": 1.4024,
"step": 232
},
{
"epoch": 0.6638176638176638,
"grad_norm": 0.42196667194366455,
"learning_rate": 3.3903133903133904e-06,
"loss": 1.4825,
"step": 233
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.4109940826892853,
"learning_rate": 3.361823361823362e-06,
"loss": 1.4036,
"step": 234
},
{
"epoch": 0.6695156695156695,
"grad_norm": 0.41641300916671753,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.4409,
"step": 235
},
{
"epoch": 0.6723646723646723,
"grad_norm": 0.4459202289581299,
"learning_rate": 3.304843304843305e-06,
"loss": 1.4422,
"step": 236
},
{
"epoch": 0.6752136752136753,
"grad_norm": 0.40903767943382263,
"learning_rate": 3.2763532763532767e-06,
"loss": 1.4375,
"step": 237
},
{
"epoch": 0.6780626780626781,
"grad_norm": 0.40536248683929443,
"learning_rate": 3.247863247863248e-06,
"loss": 1.4357,
"step": 238
},
{
"epoch": 0.6809116809116809,
"grad_norm": 0.43088406324386597,
"learning_rate": 3.2193732193732196e-06,
"loss": 1.4428,
"step": 239
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.43017005920410156,
"learning_rate": 3.190883190883191e-06,
"loss": 1.4213,
"step": 240
},
{
"epoch": 0.6866096866096866,
"grad_norm": 0.43592897057533264,
"learning_rate": 3.1623931623931626e-06,
"loss": 1.5107,
"step": 241
},
{
"epoch": 0.6894586894586895,
"grad_norm": 0.6451869606971741,
"learning_rate": 3.133903133903134e-06,
"loss": 1.4993,
"step": 242
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.45624542236328125,
"learning_rate": 3.105413105413106e-06,
"loss": 1.4297,
"step": 243
},
{
"epoch": 0.6951566951566952,
"grad_norm": 0.4131554067134857,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.4272,
"step": 244
},
{
"epoch": 0.698005698005698,
"grad_norm": 0.49703848361968994,
"learning_rate": 3.048433048433049e-06,
"loss": 1.4175,
"step": 245
},
{
"epoch": 0.7008547008547008,
"grad_norm": 0.4367448091506958,
"learning_rate": 3.0199430199430204e-06,
"loss": 1.4585,
"step": 246
},
{
"epoch": 0.7037037037037037,
"grad_norm": 0.44849011301994324,
"learning_rate": 2.9914529914529914e-06,
"loss": 1.4596,
"step": 247
},
{
"epoch": 0.7065527065527065,
"grad_norm": 0.42930400371551514,
"learning_rate": 2.962962962962963e-06,
"loss": 1.4335,
"step": 248
},
{
"epoch": 0.7094017094017094,
"grad_norm": 0.4332965612411499,
"learning_rate": 2.9344729344729344e-06,
"loss": 1.4509,
"step": 249
},
{
"epoch": 0.7122507122507122,
"grad_norm": 0.44173556566238403,
"learning_rate": 2.9059829059829063e-06,
"loss": 1.4596,
"step": 250
},
{
"epoch": 0.7150997150997151,
"grad_norm": 0.40930160880088806,
"learning_rate": 2.8774928774928778e-06,
"loss": 1.4327,
"step": 251
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.4137099087238312,
"learning_rate": 2.8490028490028492e-06,
"loss": 1.4119,
"step": 252
},
{
"epoch": 0.7207977207977208,
"grad_norm": 0.43292713165283203,
"learning_rate": 2.8205128205128207e-06,
"loss": 1.4352,
"step": 253
},
{
"epoch": 0.7236467236467237,
"grad_norm": 0.6853729486465454,
"learning_rate": 2.792022792022792e-06,
"loss": 1.4859,
"step": 254
},
{
"epoch": 0.7264957264957265,
"grad_norm": 0.4223368465900421,
"learning_rate": 2.7635327635327636e-06,
"loss": 1.4189,
"step": 255
},
{
"epoch": 0.7293447293447294,
"grad_norm": 0.4098432958126068,
"learning_rate": 2.7350427350427355e-06,
"loss": 1.4474,
"step": 256
},
{
"epoch": 0.7321937321937322,
"grad_norm": 0.42546141147613525,
"learning_rate": 2.706552706552707e-06,
"loss": 1.4447,
"step": 257
},
{
"epoch": 0.7350427350427351,
"grad_norm": 0.434319406747818,
"learning_rate": 2.6780626780626785e-06,
"loss": 1.4559,
"step": 258
},
{
"epoch": 0.7378917378917379,
"grad_norm": 0.5959000587463379,
"learning_rate": 2.64957264957265e-06,
"loss": 1.3711,
"step": 259
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.6558396220207214,
"learning_rate": 2.6210826210826214e-06,
"loss": 1.3735,
"step": 260
},
{
"epoch": 0.7435897435897436,
"grad_norm": 0.4049711525440216,
"learning_rate": 2.5925925925925925e-06,
"loss": 1.4327,
"step": 261
},
{
"epoch": 0.7464387464387464,
"grad_norm": 0.4057099223136902,
"learning_rate": 2.564102564102564e-06,
"loss": 1.4173,
"step": 262
},
{
"epoch": 0.7492877492877493,
"grad_norm": 0.44100022315979004,
"learning_rate": 2.535612535612536e-06,
"loss": 1.4568,
"step": 263
},
{
"epoch": 0.7521367521367521,
"grad_norm": 0.4259463846683502,
"learning_rate": 2.5071225071225073e-06,
"loss": 1.4473,
"step": 264
},
{
"epoch": 0.7549857549857549,
"grad_norm": 0.47139763832092285,
"learning_rate": 2.478632478632479e-06,
"loss": 1.4467,
"step": 265
},
{
"epoch": 0.7578347578347578,
"grad_norm": 0.4066116511821747,
"learning_rate": 2.4501424501424503e-06,
"loss": 1.4148,
"step": 266
},
{
"epoch": 0.7606837606837606,
"grad_norm": 0.4442392587661743,
"learning_rate": 2.4216524216524218e-06,
"loss": 1.4166,
"step": 267
},
{
"epoch": 0.7635327635327636,
"grad_norm": 0.4146524667739868,
"learning_rate": 2.3931623931623937e-06,
"loss": 1.4214,
"step": 268
},
{
"epoch": 0.7663817663817664,
"grad_norm": 0.4352812170982361,
"learning_rate": 2.3646723646723647e-06,
"loss": 1.4268,
"step": 269
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.4416466951370239,
"learning_rate": 2.336182336182336e-06,
"loss": 1.3947,
"step": 270
},
{
"epoch": 0.7720797720797721,
"grad_norm": 0.4440385699272156,
"learning_rate": 2.307692307692308e-06,
"loss": 1.4114,
"step": 271
},
{
"epoch": 0.7749287749287749,
"grad_norm": 0.42091333866119385,
"learning_rate": 2.2792022792022796e-06,
"loss": 1.4343,
"step": 272
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.39965999126434326,
"learning_rate": 2.250712250712251e-06,
"loss": 1.4401,
"step": 273
},
{
"epoch": 0.7806267806267806,
"grad_norm": 0.4088633060455322,
"learning_rate": 2.222222222222222e-06,
"loss": 1.3808,
"step": 274
},
{
"epoch": 0.7834757834757835,
"grad_norm": 0.42541617155075073,
"learning_rate": 2.193732193732194e-06,
"loss": 1.45,
"step": 275
},
{
"epoch": 0.7863247863247863,
"grad_norm": 0.42558950185775757,
"learning_rate": 2.1652421652421654e-06,
"loss": 1.4317,
"step": 276
},
{
"epoch": 0.7891737891737892,
"grad_norm": 0.4297507703304291,
"learning_rate": 2.136752136752137e-06,
"loss": 1.4493,
"step": 277
},
{
"epoch": 0.792022792022792,
"grad_norm": 0.42826247215270996,
"learning_rate": 2.1082621082621084e-06,
"loss": 1.4665,
"step": 278
},
{
"epoch": 0.7948717948717948,
"grad_norm": 0.4104038178920746,
"learning_rate": 2.07977207977208e-06,
"loss": 1.3966,
"step": 279
},
{
"epoch": 0.7977207977207977,
"grad_norm": 0.5832846164703369,
"learning_rate": 2.0512820512820513e-06,
"loss": 1.409,
"step": 280
},
{
"epoch": 0.8005698005698005,
"grad_norm": 0.4132280647754669,
"learning_rate": 2.0227920227920232e-06,
"loss": 1.421,
"step": 281
},
{
"epoch": 0.8034188034188035,
"grad_norm": 0.5175873637199402,
"learning_rate": 1.9943019943019947e-06,
"loss": 1.4251,
"step": 282
},
{
"epoch": 0.8062678062678063,
"grad_norm": 0.3983429968357086,
"learning_rate": 1.9658119658119658e-06,
"loss": 1.4305,
"step": 283
},
{
"epoch": 0.8091168091168092,
"grad_norm": 0.4195236563682556,
"learning_rate": 1.9373219373219372e-06,
"loss": 1.3955,
"step": 284
},
{
"epoch": 0.811965811965812,
"grad_norm": 0.44437727332115173,
"learning_rate": 1.908831908831909e-06,
"loss": 1.3945,
"step": 285
},
{
"epoch": 0.8148148148148148,
"grad_norm": 0.4069578945636749,
"learning_rate": 1.8803418803418804e-06,
"loss": 1.3872,
"step": 286
},
{
"epoch": 0.8176638176638177,
"grad_norm": 0.4366849660873413,
"learning_rate": 1.8518518518518519e-06,
"loss": 1.4303,
"step": 287
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.42686140537261963,
"learning_rate": 1.8233618233618236e-06,
"loss": 1.4297,
"step": 288
},
{
"epoch": 0.8233618233618234,
"grad_norm": 0.4372996687889099,
"learning_rate": 1.794871794871795e-06,
"loss": 1.4205,
"step": 289
},
{
"epoch": 0.8262108262108262,
"grad_norm": 0.5185275077819824,
"learning_rate": 1.7663817663817665e-06,
"loss": 1.4072,
"step": 290
},
{
"epoch": 0.8290598290598291,
"grad_norm": 0.4375689625740051,
"learning_rate": 1.7378917378917382e-06,
"loss": 1.4093,
"step": 291
},
{
"epoch": 0.8319088319088319,
"grad_norm": 0.6223400235176086,
"learning_rate": 1.7094017094017097e-06,
"loss": 1.4038,
"step": 292
},
{
"epoch": 0.8347578347578347,
"grad_norm": 0.49658337235450745,
"learning_rate": 1.680911680911681e-06,
"loss": 1.4587,
"step": 293
},
{
"epoch": 0.8376068376068376,
"grad_norm": 0.48749840259552,
"learning_rate": 1.6524216524216524e-06,
"loss": 1.4573,
"step": 294
},
{
"epoch": 0.8404558404558404,
"grad_norm": 0.4375877380371094,
"learning_rate": 1.623931623931624e-06,
"loss": 1.4126,
"step": 295
},
{
"epoch": 0.8433048433048433,
"grad_norm": 0.5864587426185608,
"learning_rate": 1.5954415954415956e-06,
"loss": 1.3915,
"step": 296
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.4243745803833008,
"learning_rate": 1.566951566951567e-06,
"loss": 1.4475,
"step": 297
},
{
"epoch": 0.8490028490028491,
"grad_norm": 0.5398270487785339,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.3658,
"step": 298
},
{
"epoch": 0.8518518518518519,
"grad_norm": 0.4248296916484833,
"learning_rate": 1.5099715099715102e-06,
"loss": 1.3898,
"step": 299
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.4054194986820221,
"learning_rate": 1.4814814814814815e-06,
"loss": 1.3806,
"step": 300
},
{
"epoch": 0.8575498575498576,
"grad_norm": 0.4230331778526306,
"learning_rate": 1.4529914529914531e-06,
"loss": 1.431,
"step": 301
},
{
"epoch": 0.8603988603988604,
"grad_norm": 0.42785853147506714,
"learning_rate": 1.4245014245014246e-06,
"loss": 1.3905,
"step": 302
},
{
"epoch": 0.8632478632478633,
"grad_norm": 0.6043952703475952,
"learning_rate": 1.396011396011396e-06,
"loss": 1.444,
"step": 303
},
{
"epoch": 0.8660968660968661,
"grad_norm": 0.41546547412872314,
"learning_rate": 1.3675213675213678e-06,
"loss": 1.3876,
"step": 304
},
{
"epoch": 0.8689458689458689,
"grad_norm": 0.5535686612129211,
"learning_rate": 1.3390313390313392e-06,
"loss": 1.3663,
"step": 305
},
{
"epoch": 0.8717948717948718,
"grad_norm": 0.43172240257263184,
"learning_rate": 1.3105413105413107e-06,
"loss": 1.4281,
"step": 306
},
{
"epoch": 0.8746438746438746,
"grad_norm": 0.4234292209148407,
"learning_rate": 1.282051282051282e-06,
"loss": 1.4105,
"step": 307
},
{
"epoch": 0.8774928774928775,
"grad_norm": 0.4184323847293854,
"learning_rate": 1.2535612535612537e-06,
"loss": 1.3755,
"step": 308
},
{
"epoch": 0.8803418803418803,
"grad_norm": 0.6069676876068115,
"learning_rate": 1.2250712250712251e-06,
"loss": 1.3666,
"step": 309
},
{
"epoch": 0.8831908831908832,
"grad_norm": 0.4531959891319275,
"learning_rate": 1.1965811965811968e-06,
"loss": 1.4109,
"step": 310
},
{
"epoch": 0.886039886039886,
"grad_norm": 0.49059048295021057,
"learning_rate": 1.168091168091168e-06,
"loss": 1.4259,
"step": 311
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.4053284823894501,
"learning_rate": 1.1396011396011398e-06,
"loss": 1.4173,
"step": 312
},
{
"epoch": 0.8917378917378918,
"grad_norm": 0.4258776307106018,
"learning_rate": 1.111111111111111e-06,
"loss": 1.4079,
"step": 313
},
{
"epoch": 0.8945868945868946,
"grad_norm": 0.4315298795700073,
"learning_rate": 1.0826210826210827e-06,
"loss": 1.3791,
"step": 314
},
{
"epoch": 0.8974358974358975,
"grad_norm": 0.48497509956359863,
"learning_rate": 1.0541310541310542e-06,
"loss": 1.4389,
"step": 315
},
{
"epoch": 0.9002849002849003,
"grad_norm": 0.4596964716911316,
"learning_rate": 1.0256410256410257e-06,
"loss": 1.4253,
"step": 316
},
{
"epoch": 0.9031339031339032,
"grad_norm": 0.43682560324668884,
"learning_rate": 9.971509971509974e-07,
"loss": 1.4358,
"step": 317
},
{
"epoch": 0.905982905982906,
"grad_norm": 0.5284684896469116,
"learning_rate": 9.686609686609686e-07,
"loss": 1.3974,
"step": 318
},
{
"epoch": 0.9088319088319088,
"grad_norm": 0.444614440202713,
"learning_rate": 9.401709401709402e-07,
"loss": 1.4258,
"step": 319
},
{
"epoch": 0.9116809116809117,
"grad_norm": 0.41446149349212646,
"learning_rate": 9.116809116809118e-07,
"loss": 1.4093,
"step": 320
},
{
"epoch": 0.9145299145299145,
"grad_norm": 0.505181074142456,
"learning_rate": 8.831908831908833e-07,
"loss": 1.4355,
"step": 321
},
{
"epoch": 0.9173789173789174,
"grad_norm": 0.41858991980552673,
"learning_rate": 8.547008547008548e-07,
"loss": 1.4259,
"step": 322
},
{
"epoch": 0.9202279202279202,
"grad_norm": 0.6958276033401489,
"learning_rate": 8.262108262108262e-07,
"loss": 1.4456,
"step": 323
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.4824206829071045,
"learning_rate": 7.977207977207978e-07,
"loss": 1.4341,
"step": 324
},
{
"epoch": 0.9259259259259259,
"grad_norm": 0.4208286702632904,
"learning_rate": 7.692307692307694e-07,
"loss": 1.4401,
"step": 325
},
{
"epoch": 0.9287749287749287,
"grad_norm": 0.43090713024139404,
"learning_rate": 7.407407407407407e-07,
"loss": 1.4295,
"step": 326
},
{
"epoch": 0.9316239316239316,
"grad_norm": 0.4124811589717865,
"learning_rate": 7.122507122507123e-07,
"loss": 1.4234,
"step": 327
},
{
"epoch": 0.9344729344729344,
"grad_norm": 0.4865758419036865,
"learning_rate": 6.837606837606839e-07,
"loss": 1.4787,
"step": 328
},
{
"epoch": 0.9373219373219374,
"grad_norm": 0.4624764323234558,
"learning_rate": 6.552706552706554e-07,
"loss": 1.3913,
"step": 329
},
{
"epoch": 0.9401709401709402,
"grad_norm": 0.4168078899383545,
"learning_rate": 6.267806267806268e-07,
"loss": 1.3954,
"step": 330
},
{
"epoch": 0.9430199430199431,
"grad_norm": 0.43121403455734253,
"learning_rate": 5.982905982905984e-07,
"loss": 1.4046,
"step": 331
},
{
"epoch": 0.9458689458689459,
"grad_norm": 0.43017080426216125,
"learning_rate": 5.698005698005699e-07,
"loss": 1.4471,
"step": 332
},
{
"epoch": 0.9487179487179487,
"grad_norm": 0.41371017694473267,
"learning_rate": 5.413105413105414e-07,
"loss": 1.3891,
"step": 333
},
{
"epoch": 0.9515669515669516,
"grad_norm": 0.42624595761299133,
"learning_rate": 5.128205128205128e-07,
"loss": 1.4431,
"step": 334
},
{
"epoch": 0.9544159544159544,
"grad_norm": 0.4311563968658447,
"learning_rate": 4.843304843304843e-07,
"loss": 1.3985,
"step": 335
},
{
"epoch": 0.9572649572649573,
"grad_norm": 0.42693498730659485,
"learning_rate": 4.558404558404559e-07,
"loss": 1.3818,
"step": 336
},
{
"epoch": 0.9601139601139601,
"grad_norm": 0.7170986533164978,
"learning_rate": 4.273504273504274e-07,
"loss": 1.4704,
"step": 337
},
{
"epoch": 0.9629629629629629,
"grad_norm": 0.42342740297317505,
"learning_rate": 3.988603988603989e-07,
"loss": 1.4172,
"step": 338
},
{
"epoch": 0.9658119658119658,
"grad_norm": 0.5637214183807373,
"learning_rate": 3.7037037037037036e-07,
"loss": 1.3729,
"step": 339
},
{
"epoch": 0.9686609686609686,
"grad_norm": 0.42340558767318726,
"learning_rate": 3.4188034188034194e-07,
"loss": 1.3958,
"step": 340
},
{
"epoch": 0.9715099715099715,
"grad_norm": 0.4184475541114807,
"learning_rate": 3.133903133903134e-07,
"loss": 1.4015,
"step": 341
},
{
"epoch": 0.9743589743589743,
"grad_norm": 0.42320722341537476,
"learning_rate": 2.8490028490028494e-07,
"loss": 1.396,
"step": 342
},
{
"epoch": 0.9772079772079773,
"grad_norm": 0.4045957624912262,
"learning_rate": 2.564102564102564e-07,
"loss": 1.4237,
"step": 343
},
{
"epoch": 0.98005698005698,
"grad_norm": 0.4371383488178253,
"learning_rate": 2.2792022792022794e-07,
"loss": 1.4127,
"step": 344
},
{
"epoch": 0.9829059829059829,
"grad_norm": 0.5121440291404724,
"learning_rate": 1.9943019943019944e-07,
"loss": 1.4109,
"step": 345
},
{
"epoch": 0.9857549857549858,
"grad_norm": 0.42042669653892517,
"learning_rate": 1.7094017094017097e-07,
"loss": 1.352,
"step": 346
},
{
"epoch": 0.9886039886039886,
"grad_norm": 0.7115257382392883,
"learning_rate": 1.4245014245014247e-07,
"loss": 1.404,
"step": 347
},
{
"epoch": 0.9914529914529915,
"grad_norm": 0.5735996961593628,
"learning_rate": 1.1396011396011397e-07,
"loss": 1.4461,
"step": 348
},
{
"epoch": 0.9943019943019943,
"grad_norm": 0.4333067834377289,
"learning_rate": 8.547008547008549e-08,
"loss": 1.403,
"step": 349
},
{
"epoch": 0.9971509971509972,
"grad_norm": 0.4068621098995209,
"learning_rate": 5.6980056980056986e-08,
"loss": 1.4357,
"step": 350
},
{
"epoch": 1.0,
"grad_norm": 0.6922910809516907,
"learning_rate": 2.8490028490028493e-08,
"loss": 1.4253,
"step": 351
}
],
"logging_steps": 1.0,
"max_steps": 351,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6380890003275776e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}