VeriThoughts-Reasoning-32B / trainer_state.json
wilyub's picture
End of training
019bb1f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1010,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0049504950495049506,
"grad_norm": 3.599481981308616,
"learning_rate": 0.0,
"loss": 0.8893,
"step": 1
},
{
"epoch": 0.009900990099009901,
"grad_norm": 4.571227545210381,
"learning_rate": 7.920792079207921e-07,
"loss": 0.9924,
"step": 2
},
{
"epoch": 0.01485148514851485,
"grad_norm": 4.290318174261695,
"learning_rate": 1.5841584158415842e-06,
"loss": 0.9284,
"step": 3
},
{
"epoch": 0.019801980198019802,
"grad_norm": 4.063887886041896,
"learning_rate": 2.3762376237623762e-06,
"loss": 0.9605,
"step": 4
},
{
"epoch": 0.024752475247524754,
"grad_norm": 3.0457401787892744,
"learning_rate": 3.1683168316831685e-06,
"loss": 0.8544,
"step": 5
},
{
"epoch": 0.0297029702970297,
"grad_norm": 2.5802449546715653,
"learning_rate": 3.960396039603961e-06,
"loss": 0.896,
"step": 6
},
{
"epoch": 0.034653465346534656,
"grad_norm": 2.27569894817667,
"learning_rate": 4.7524752475247525e-06,
"loss": 0.8519,
"step": 7
},
{
"epoch": 0.039603960396039604,
"grad_norm": 2.128661648062913,
"learning_rate": 5.544554455445545e-06,
"loss": 0.8402,
"step": 8
},
{
"epoch": 0.04455445544554455,
"grad_norm": 1.762558880637763,
"learning_rate": 6.336633663366337e-06,
"loss": 0.8403,
"step": 9
},
{
"epoch": 0.04950495049504951,
"grad_norm": 1.4450553165390079,
"learning_rate": 7.128712871287129e-06,
"loss": 0.774,
"step": 10
},
{
"epoch": 0.054455445544554455,
"grad_norm": 1.7583923085545512,
"learning_rate": 7.920792079207921e-06,
"loss": 0.7121,
"step": 11
},
{
"epoch": 0.0594059405940594,
"grad_norm": 1.5091044617389875,
"learning_rate": 8.712871287128714e-06,
"loss": 0.711,
"step": 12
},
{
"epoch": 0.06435643564356436,
"grad_norm": 1.4672138042959566,
"learning_rate": 9.504950495049505e-06,
"loss": 0.6526,
"step": 13
},
{
"epoch": 0.06930693069306931,
"grad_norm": 1.1729096644060144,
"learning_rate": 1.0297029702970298e-05,
"loss": 0.6353,
"step": 14
},
{
"epoch": 0.07425742574257425,
"grad_norm": 1.2698139994056772,
"learning_rate": 1.108910891089109e-05,
"loss": 0.6655,
"step": 15
},
{
"epoch": 0.07920792079207921,
"grad_norm": 1.4992771430082272,
"learning_rate": 1.1881188118811881e-05,
"loss": 0.6069,
"step": 16
},
{
"epoch": 0.08415841584158416,
"grad_norm": 1.2827914835014287,
"learning_rate": 1.2673267326732674e-05,
"loss": 0.6052,
"step": 17
},
{
"epoch": 0.0891089108910891,
"grad_norm": 1.1528589297349217,
"learning_rate": 1.3465346534653467e-05,
"loss": 0.6348,
"step": 18
},
{
"epoch": 0.09405940594059406,
"grad_norm": 0.9829400287103841,
"learning_rate": 1.4257425742574257e-05,
"loss": 0.6186,
"step": 19
},
{
"epoch": 0.09900990099009901,
"grad_norm": 0.8874280395940564,
"learning_rate": 1.504950495049505e-05,
"loss": 0.6159,
"step": 20
},
{
"epoch": 0.10396039603960396,
"grad_norm": 0.9732224407691762,
"learning_rate": 1.5841584158415843e-05,
"loss": 0.5948,
"step": 21
},
{
"epoch": 0.10891089108910891,
"grad_norm": 0.9422404230915372,
"learning_rate": 1.6633663366336635e-05,
"loss": 0.6051,
"step": 22
},
{
"epoch": 0.11386138613861387,
"grad_norm": 0.909292277442357,
"learning_rate": 1.7425742574257428e-05,
"loss": 0.6207,
"step": 23
},
{
"epoch": 0.1188118811881188,
"grad_norm": 0.9645884004296726,
"learning_rate": 1.821782178217822e-05,
"loss": 0.6,
"step": 24
},
{
"epoch": 0.12376237623762376,
"grad_norm": 0.8781009838578941,
"learning_rate": 1.900990099009901e-05,
"loss": 0.5568,
"step": 25
},
{
"epoch": 0.12871287128712872,
"grad_norm": 0.7687131742426503,
"learning_rate": 1.9801980198019803e-05,
"loss": 0.5689,
"step": 26
},
{
"epoch": 0.13366336633663367,
"grad_norm": 45.68743449097319,
"learning_rate": 2.0594059405940595e-05,
"loss": 0.6056,
"step": 27
},
{
"epoch": 0.13861386138613863,
"grad_norm": 1.42976250276645,
"learning_rate": 2.1386138613861388e-05,
"loss": 0.5793,
"step": 28
},
{
"epoch": 0.14356435643564355,
"grad_norm": 1.3124345694625106,
"learning_rate": 2.217821782178218e-05,
"loss": 0.5673,
"step": 29
},
{
"epoch": 0.1485148514851485,
"grad_norm": 0.8946826085599571,
"learning_rate": 2.297029702970297e-05,
"loss": 0.6001,
"step": 30
},
{
"epoch": 0.15346534653465346,
"grad_norm": 1.0396117670628082,
"learning_rate": 2.3762376237623762e-05,
"loss": 0.5742,
"step": 31
},
{
"epoch": 0.15841584158415842,
"grad_norm": 1.1197910610686859,
"learning_rate": 2.4554455445544555e-05,
"loss": 0.5714,
"step": 32
},
{
"epoch": 0.16336633663366337,
"grad_norm": 0.7961887362378622,
"learning_rate": 2.5346534653465348e-05,
"loss": 0.5404,
"step": 33
},
{
"epoch": 0.16831683168316833,
"grad_norm": 0.8216770935724286,
"learning_rate": 2.613861386138614e-05,
"loss": 0.5441,
"step": 34
},
{
"epoch": 0.17326732673267325,
"grad_norm": 0.8374910782342574,
"learning_rate": 2.6930693069306933e-05,
"loss": 0.5727,
"step": 35
},
{
"epoch": 0.1782178217821782,
"grad_norm": 0.8723043518197049,
"learning_rate": 2.7722772277227722e-05,
"loss": 0.53,
"step": 36
},
{
"epoch": 0.18316831683168316,
"grad_norm": 0.7368093656466949,
"learning_rate": 2.8514851485148515e-05,
"loss": 0.5627,
"step": 37
},
{
"epoch": 0.18811881188118812,
"grad_norm": 0.839263691011532,
"learning_rate": 2.9306930693069308e-05,
"loss": 0.5832,
"step": 38
},
{
"epoch": 0.19306930693069307,
"grad_norm": 0.7426574483260017,
"learning_rate": 3.00990099009901e-05,
"loss": 0.5631,
"step": 39
},
{
"epoch": 0.19801980198019803,
"grad_norm": 0.7651839720480437,
"learning_rate": 3.0891089108910896e-05,
"loss": 0.551,
"step": 40
},
{
"epoch": 0.20297029702970298,
"grad_norm": 0.769095073737041,
"learning_rate": 3.1683168316831686e-05,
"loss": 0.5618,
"step": 41
},
{
"epoch": 0.2079207920792079,
"grad_norm": 0.8259803329688946,
"learning_rate": 3.247524752475248e-05,
"loss": 0.562,
"step": 42
},
{
"epoch": 0.21287128712871287,
"grad_norm": 0.6691753174649648,
"learning_rate": 3.326732673267327e-05,
"loss": 0.5324,
"step": 43
},
{
"epoch": 0.21782178217821782,
"grad_norm": 0.7851088248969034,
"learning_rate": 3.405940594059406e-05,
"loss": 0.5459,
"step": 44
},
{
"epoch": 0.22277227722772278,
"grad_norm": 0.7186552394181012,
"learning_rate": 3.4851485148514856e-05,
"loss": 0.5497,
"step": 45
},
{
"epoch": 0.22772277227722773,
"grad_norm": 0.8608611817434151,
"learning_rate": 3.5643564356435645e-05,
"loss": 0.51,
"step": 46
},
{
"epoch": 0.23267326732673269,
"grad_norm": 0.750298380499875,
"learning_rate": 3.643564356435644e-05,
"loss": 0.5151,
"step": 47
},
{
"epoch": 0.2376237623762376,
"grad_norm": 0.8882569861170021,
"learning_rate": 3.722772277227723e-05,
"loss": 0.5313,
"step": 48
},
{
"epoch": 0.24257425742574257,
"grad_norm": 0.7184108588660996,
"learning_rate": 3.801980198019802e-05,
"loss": 0.5365,
"step": 49
},
{
"epoch": 0.24752475247524752,
"grad_norm": 0.6639444774645417,
"learning_rate": 3.8811881188118816e-05,
"loss": 0.525,
"step": 50
},
{
"epoch": 0.2524752475247525,
"grad_norm": 0.5990381795095598,
"learning_rate": 3.9603960396039605e-05,
"loss": 0.5231,
"step": 51
},
{
"epoch": 0.25742574257425743,
"grad_norm": 0.7385397760086568,
"learning_rate": 4.03960396039604e-05,
"loss": 0.5334,
"step": 52
},
{
"epoch": 0.2623762376237624,
"grad_norm": 0.6825332922011768,
"learning_rate": 4.118811881188119e-05,
"loss": 0.5506,
"step": 53
},
{
"epoch": 0.26732673267326734,
"grad_norm": 0.66432008262571,
"learning_rate": 4.1980198019801987e-05,
"loss": 0.5592,
"step": 54
},
{
"epoch": 0.2722772277227723,
"grad_norm": 0.7417922067610243,
"learning_rate": 4.2772277227722776e-05,
"loss": 0.5465,
"step": 55
},
{
"epoch": 0.27722772277227725,
"grad_norm": 0.7400946921357555,
"learning_rate": 4.356435643564357e-05,
"loss": 0.5414,
"step": 56
},
{
"epoch": 0.28217821782178215,
"grad_norm": 0.868329268810697,
"learning_rate": 4.435643564356436e-05,
"loss": 0.5417,
"step": 57
},
{
"epoch": 0.2871287128712871,
"grad_norm": 0.6825435572314523,
"learning_rate": 4.514851485148515e-05,
"loss": 0.5321,
"step": 58
},
{
"epoch": 0.29207920792079206,
"grad_norm": 0.9429162079814348,
"learning_rate": 4.594059405940594e-05,
"loss": 0.547,
"step": 59
},
{
"epoch": 0.297029702970297,
"grad_norm": 0.6941868595008334,
"learning_rate": 4.6732673267326736e-05,
"loss": 0.5133,
"step": 60
},
{
"epoch": 0.30198019801980197,
"grad_norm": 0.7090210319685202,
"learning_rate": 4.7524752475247525e-05,
"loss": 0.5441,
"step": 61
},
{
"epoch": 0.3069306930693069,
"grad_norm": 0.639944106128008,
"learning_rate": 4.831683168316832e-05,
"loss": 0.5201,
"step": 62
},
{
"epoch": 0.3118811881188119,
"grad_norm": 0.6675816599082344,
"learning_rate": 4.910891089108911e-05,
"loss": 0.544,
"step": 63
},
{
"epoch": 0.31683168316831684,
"grad_norm": 0.7225958900836226,
"learning_rate": 4.9900990099009906e-05,
"loss": 0.5325,
"step": 64
},
{
"epoch": 0.3217821782178218,
"grad_norm": 0.7971869756828545,
"learning_rate": 5.0693069306930696e-05,
"loss": 0.5432,
"step": 65
},
{
"epoch": 0.32673267326732675,
"grad_norm": 0.807211676486099,
"learning_rate": 5.148514851485149e-05,
"loss": 0.5794,
"step": 66
},
{
"epoch": 0.3316831683168317,
"grad_norm": 0.6526247620861514,
"learning_rate": 5.227722772277228e-05,
"loss": 0.5329,
"step": 67
},
{
"epoch": 0.33663366336633666,
"grad_norm": 0.8932900521352153,
"learning_rate": 5.306930693069308e-05,
"loss": 0.5554,
"step": 68
},
{
"epoch": 0.3415841584158416,
"grad_norm": 0.7332294643374914,
"learning_rate": 5.3861386138613866e-05,
"loss": 0.5269,
"step": 69
},
{
"epoch": 0.3465346534653465,
"grad_norm": 0.6513725822741345,
"learning_rate": 5.465346534653466e-05,
"loss": 0.4927,
"step": 70
},
{
"epoch": 0.35148514851485146,
"grad_norm": 0.7096869239144933,
"learning_rate": 5.5445544554455445e-05,
"loss": 0.5569,
"step": 71
},
{
"epoch": 0.3564356435643564,
"grad_norm": 0.7323268829393851,
"learning_rate": 5.623762376237624e-05,
"loss": 0.5339,
"step": 72
},
{
"epoch": 0.3613861386138614,
"grad_norm": 0.7885619745669775,
"learning_rate": 5.702970297029703e-05,
"loss": 0.5826,
"step": 73
},
{
"epoch": 0.36633663366336633,
"grad_norm": 0.6749931221787999,
"learning_rate": 5.7821782178217826e-05,
"loss": 0.5406,
"step": 74
},
{
"epoch": 0.3712871287128713,
"grad_norm": 0.648576213983254,
"learning_rate": 5.8613861386138615e-05,
"loss": 0.5524,
"step": 75
},
{
"epoch": 0.37623762376237624,
"grad_norm": 0.6598593089642592,
"learning_rate": 5.940594059405941e-05,
"loss": 0.5379,
"step": 76
},
{
"epoch": 0.3811881188118812,
"grad_norm": 0.6650261826397315,
"learning_rate": 6.01980198019802e-05,
"loss": 0.5585,
"step": 77
},
{
"epoch": 0.38613861386138615,
"grad_norm": 0.7596923180855565,
"learning_rate": 6.0990099009900997e-05,
"loss": 0.5641,
"step": 78
},
{
"epoch": 0.3910891089108911,
"grad_norm": 0.8173143487769239,
"learning_rate": 6.178217821782179e-05,
"loss": 0.5204,
"step": 79
},
{
"epoch": 0.39603960396039606,
"grad_norm": 0.6116291549339957,
"learning_rate": 6.257425742574258e-05,
"loss": 0.5199,
"step": 80
},
{
"epoch": 0.400990099009901,
"grad_norm": 1.0510502278437106,
"learning_rate": 6.336633663366337e-05,
"loss": 0.5561,
"step": 81
},
{
"epoch": 0.40594059405940597,
"grad_norm": 0.6263814677240027,
"learning_rate": 6.415841584158417e-05,
"loss": 0.5466,
"step": 82
},
{
"epoch": 0.41089108910891087,
"grad_norm": 0.781111899709593,
"learning_rate": 6.495049504950496e-05,
"loss": 0.5258,
"step": 83
},
{
"epoch": 0.4158415841584158,
"grad_norm": 0.7123210147687922,
"learning_rate": 6.574257425742575e-05,
"loss": 0.5298,
"step": 84
},
{
"epoch": 0.4207920792079208,
"grad_norm": 0.6399105090314179,
"learning_rate": 6.653465346534654e-05,
"loss": 0.5167,
"step": 85
},
{
"epoch": 0.42574257425742573,
"grad_norm": 0.5881868516455735,
"learning_rate": 6.732673267326732e-05,
"loss": 0.5229,
"step": 86
},
{
"epoch": 0.4306930693069307,
"grad_norm": 0.7851328238490848,
"learning_rate": 6.811881188118812e-05,
"loss": 0.5647,
"step": 87
},
{
"epoch": 0.43564356435643564,
"grad_norm": 0.6557279603019258,
"learning_rate": 6.891089108910892e-05,
"loss": 0.5172,
"step": 88
},
{
"epoch": 0.4405940594059406,
"grad_norm": 0.9157001843216616,
"learning_rate": 6.970297029702971e-05,
"loss": 0.5464,
"step": 89
},
{
"epoch": 0.44554455445544555,
"grad_norm": 0.623110108758698,
"learning_rate": 7.04950495049505e-05,
"loss": 0.5309,
"step": 90
},
{
"epoch": 0.4504950495049505,
"grad_norm": 0.841500456640877,
"learning_rate": 7.128712871287129e-05,
"loss": 0.5638,
"step": 91
},
{
"epoch": 0.45544554455445546,
"grad_norm": 0.6925204556841734,
"learning_rate": 7.207920792079209e-05,
"loss": 0.5315,
"step": 92
},
{
"epoch": 0.4603960396039604,
"grad_norm": 0.711526620654385,
"learning_rate": 7.287128712871288e-05,
"loss": 0.5524,
"step": 93
},
{
"epoch": 0.46534653465346537,
"grad_norm": 0.5348898586960058,
"learning_rate": 7.366336633663368e-05,
"loss": 0.5404,
"step": 94
},
{
"epoch": 0.47029702970297027,
"grad_norm": 0.5821672966348954,
"learning_rate": 7.445544554455446e-05,
"loss": 0.5238,
"step": 95
},
{
"epoch": 0.4752475247524752,
"grad_norm": 0.5908396910544158,
"learning_rate": 7.524752475247524e-05,
"loss": 0.5346,
"step": 96
},
{
"epoch": 0.4801980198019802,
"grad_norm": 0.6146273285955356,
"learning_rate": 7.603960396039604e-05,
"loss": 0.5413,
"step": 97
},
{
"epoch": 0.48514851485148514,
"grad_norm": 0.6908121451411222,
"learning_rate": 7.683168316831684e-05,
"loss": 0.5497,
"step": 98
},
{
"epoch": 0.4900990099009901,
"grad_norm": 0.6453437707858504,
"learning_rate": 7.762376237623763e-05,
"loss": 0.5552,
"step": 99
},
{
"epoch": 0.49504950495049505,
"grad_norm": 0.573659554028698,
"learning_rate": 7.841584158415841e-05,
"loss": 0.5534,
"step": 100
},
{
"epoch": 0.5,
"grad_norm": 0.5585111590390315,
"learning_rate": 7.920792079207921e-05,
"loss": 0.5153,
"step": 101
},
{
"epoch": 0.504950495049505,
"grad_norm": 0.581544572278104,
"learning_rate": 8e-05,
"loss": 0.5515,
"step": 102
},
{
"epoch": 0.5099009900990099,
"grad_norm": 0.5785865019861642,
"learning_rate": 7.999976110803523e-05,
"loss": 0.5468,
"step": 103
},
{
"epoch": 0.5148514851485149,
"grad_norm": 0.6439240386695959,
"learning_rate": 7.99990444349944e-05,
"loss": 0.548,
"step": 104
},
{
"epoch": 0.5198019801980198,
"grad_norm": 0.5927533216543163,
"learning_rate": 7.999784998943787e-05,
"loss": 0.5408,
"step": 105
},
{
"epoch": 0.5247524752475248,
"grad_norm": 0.732845460948312,
"learning_rate": 7.999617778563281e-05,
"loss": 0.542,
"step": 106
},
{
"epoch": 0.5297029702970297,
"grad_norm": 0.6609414974116621,
"learning_rate": 7.999402784355303e-05,
"loss": 0.5355,
"step": 107
},
{
"epoch": 0.5346534653465347,
"grad_norm": 0.6867640106151846,
"learning_rate": 7.999140018887873e-05,
"loss": 0.5316,
"step": 108
},
{
"epoch": 0.5396039603960396,
"grad_norm": 0.6243092054092657,
"learning_rate": 7.998829485299617e-05,
"loss": 0.5477,
"step": 109
},
{
"epoch": 0.5445544554455446,
"grad_norm": 0.6368046072823754,
"learning_rate": 7.998471187299734e-05,
"loss": 0.5462,
"step": 110
},
{
"epoch": 0.5495049504950495,
"grad_norm": 0.5805646759685726,
"learning_rate": 7.998065129167953e-05,
"loss": 0.5604,
"step": 111
},
{
"epoch": 0.5544554455445545,
"grad_norm": 0.7048046209267858,
"learning_rate": 7.997611315754472e-05,
"loss": 0.5173,
"step": 112
},
{
"epoch": 0.5594059405940595,
"grad_norm": 0.5756024122810762,
"learning_rate": 7.997109752479912e-05,
"loss": 0.5383,
"step": 113
},
{
"epoch": 0.5643564356435643,
"grad_norm": 0.7065257004750126,
"learning_rate": 7.996560445335241e-05,
"loss": 0.5529,
"step": 114
},
{
"epoch": 0.5693069306930693,
"grad_norm": 0.6026162793901009,
"learning_rate": 7.995963400881718e-05,
"loss": 0.5811,
"step": 115
},
{
"epoch": 0.5742574257425742,
"grad_norm": 0.7729487641935177,
"learning_rate": 7.995318626250795e-05,
"loss": 0.5472,
"step": 116
},
{
"epoch": 0.5792079207920792,
"grad_norm": 0.6627604460681927,
"learning_rate": 7.994626129144047e-05,
"loss": 0.5503,
"step": 117
},
{
"epoch": 0.5841584158415841,
"grad_norm": 0.6085367060729467,
"learning_rate": 7.993885917833073e-05,
"loss": 0.5374,
"step": 118
},
{
"epoch": 0.5891089108910891,
"grad_norm": 0.7573576072226863,
"learning_rate": 7.9930980011594e-05,
"loss": 0.5671,
"step": 119
},
{
"epoch": 0.594059405940594,
"grad_norm": 0.665798782251616,
"learning_rate": 7.992262388534378e-05,
"loss": 0.537,
"step": 120
},
{
"epoch": 0.599009900990099,
"grad_norm": 0.6447582523979625,
"learning_rate": 7.991379089939062e-05,
"loss": 0.5561,
"step": 121
},
{
"epoch": 0.6039603960396039,
"grad_norm": 0.5575875836074239,
"learning_rate": 7.990448115924099e-05,
"loss": 0.5592,
"step": 122
},
{
"epoch": 0.6089108910891089,
"grad_norm": 0.6509795039700941,
"learning_rate": 7.989469477609601e-05,
"loss": 0.5773,
"step": 123
},
{
"epoch": 0.6138613861386139,
"grad_norm": 0.559853876791952,
"learning_rate": 7.988443186685007e-05,
"loss": 0.53,
"step": 124
},
{
"epoch": 0.6188118811881188,
"grad_norm": 0.6644580599424751,
"learning_rate": 7.987369255408953e-05,
"loss": 0.5213,
"step": 125
},
{
"epoch": 0.6237623762376238,
"grad_norm": 0.4897883042416322,
"learning_rate": 7.986247696609112e-05,
"loss": 0.518,
"step": 126
},
{
"epoch": 0.6287128712871287,
"grad_norm": 0.6117080399483223,
"learning_rate": 7.985078523682058e-05,
"loss": 0.53,
"step": 127
},
{
"epoch": 0.6336633663366337,
"grad_norm": 0.7164160487652417,
"learning_rate": 7.983861750593091e-05,
"loss": 0.5658,
"step": 128
},
{
"epoch": 0.6386138613861386,
"grad_norm": 0.5380131972085676,
"learning_rate": 7.982597391876076e-05,
"loss": 0.5347,
"step": 129
},
{
"epoch": 0.6435643564356436,
"grad_norm": 0.6064956049519001,
"learning_rate": 7.981285462633268e-05,
"loss": 0.5705,
"step": 130
},
{
"epoch": 0.6485148514851485,
"grad_norm": 0.5824503230131746,
"learning_rate": 7.979925978535137e-05,
"loss": 0.5531,
"step": 131
},
{
"epoch": 0.6534653465346535,
"grad_norm": 0.6801101571631817,
"learning_rate": 7.978518955820173e-05,
"loss": 0.5209,
"step": 132
},
{
"epoch": 0.6584158415841584,
"grad_norm": 0.5343478838243892,
"learning_rate": 7.977064411294698e-05,
"loss": 0.5401,
"step": 133
},
{
"epoch": 0.6633663366336634,
"grad_norm": 0.5071209298396296,
"learning_rate": 7.975562362332663e-05,
"loss": 0.5593,
"step": 134
},
{
"epoch": 0.6683168316831684,
"grad_norm": 0.516283465511997,
"learning_rate": 7.974012826875436e-05,
"loss": 0.4954,
"step": 135
},
{
"epoch": 0.6732673267326733,
"grad_norm": 0.5655188759595408,
"learning_rate": 7.972415823431599e-05,
"loss": 0.5538,
"step": 136
},
{
"epoch": 0.6782178217821783,
"grad_norm": 0.6549665904978103,
"learning_rate": 7.970771371076715e-05,
"loss": 0.5451,
"step": 137
},
{
"epoch": 0.6831683168316832,
"grad_norm": 0.46554827804728877,
"learning_rate": 7.969079489453107e-05,
"loss": 0.5309,
"step": 138
},
{
"epoch": 0.6881188118811881,
"grad_norm": 0.5540469253338054,
"learning_rate": 7.96734019876962e-05,
"loss": 0.537,
"step": 139
},
{
"epoch": 0.693069306930693,
"grad_norm": 0.5218858858543225,
"learning_rate": 7.965553519801385e-05,
"loss": 0.5064,
"step": 140
},
{
"epoch": 0.698019801980198,
"grad_norm": 0.5966172911852246,
"learning_rate": 7.963719473889562e-05,
"loss": 0.5241,
"step": 141
},
{
"epoch": 0.7029702970297029,
"grad_norm": 0.5341923603639215,
"learning_rate": 7.961838082941094e-05,
"loss": 0.5499,
"step": 142
},
{
"epoch": 0.7079207920792079,
"grad_norm": 0.5767031106953009,
"learning_rate": 7.959909369428441e-05,
"loss": 0.5624,
"step": 143
},
{
"epoch": 0.7128712871287128,
"grad_norm": 0.5458407996686117,
"learning_rate": 7.957933356389306e-05,
"loss": 0.5397,
"step": 144
},
{
"epoch": 0.7178217821782178,
"grad_norm": 0.5028390484866985,
"learning_rate": 7.955910067426377e-05,
"loss": 0.5,
"step": 145
},
{
"epoch": 0.7227722772277227,
"grad_norm": 0.5488835039202323,
"learning_rate": 7.953839526707025e-05,
"loss": 0.5259,
"step": 146
},
{
"epoch": 0.7277227722772277,
"grad_norm": 0.5494910122993174,
"learning_rate": 7.951721758963028e-05,
"loss": 0.5549,
"step": 147
},
{
"epoch": 0.7326732673267327,
"grad_norm": 0.8378055939091611,
"learning_rate": 7.949556789490269e-05,
"loss": 0.5371,
"step": 148
},
{
"epoch": 0.7376237623762376,
"grad_norm": 0.4873797556981422,
"learning_rate": 7.94734464414844e-05,
"loss": 0.5204,
"step": 149
},
{
"epoch": 0.7425742574257426,
"grad_norm": 0.4388553826856479,
"learning_rate": 7.945085349360728e-05,
"loss": 0.5087,
"step": 150
},
{
"epoch": 0.7475247524752475,
"grad_norm": 0.5626439990620604,
"learning_rate": 7.942778932113501e-05,
"loss": 0.5139,
"step": 151
},
{
"epoch": 0.7524752475247525,
"grad_norm": 0.4858172017271616,
"learning_rate": 7.940425419955988e-05,
"loss": 0.5931,
"step": 152
},
{
"epoch": 0.7574257425742574,
"grad_norm": 0.49689698880553657,
"learning_rate": 7.938024840999944e-05,
"loss": 0.4846,
"step": 153
},
{
"epoch": 0.7623762376237624,
"grad_norm": 0.4735043212865626,
"learning_rate": 7.935577223919322e-05,
"loss": 0.532,
"step": 154
},
{
"epoch": 0.7673267326732673,
"grad_norm": 0.4823087172850099,
"learning_rate": 7.933082597949925e-05,
"loss": 0.5474,
"step": 155
},
{
"epoch": 0.7722772277227723,
"grad_norm": 0.48575109641325953,
"learning_rate": 7.930540992889056e-05,
"loss": 0.5352,
"step": 156
},
{
"epoch": 0.7772277227722773,
"grad_norm": 0.45696382635128335,
"learning_rate": 7.927952439095167e-05,
"loss": 0.5574,
"step": 157
},
{
"epoch": 0.7821782178217822,
"grad_norm": 0.552745262863526,
"learning_rate": 7.925316967487493e-05,
"loss": 0.5778,
"step": 158
},
{
"epoch": 0.7871287128712872,
"grad_norm": 0.5833245097810845,
"learning_rate": 7.922634609545685e-05,
"loss": 0.5551,
"step": 159
},
{
"epoch": 0.7920792079207921,
"grad_norm": 0.5844763601599386,
"learning_rate": 7.919905397309429e-05,
"loss": 0.5079,
"step": 160
},
{
"epoch": 0.7970297029702971,
"grad_norm": 0.6049939616394847,
"learning_rate": 7.917129363378069e-05,
"loss": 0.5453,
"step": 161
},
{
"epoch": 0.801980198019802,
"grad_norm": 0.5257417214567123,
"learning_rate": 7.914306540910216e-05,
"loss": 0.5367,
"step": 162
},
{
"epoch": 0.806930693069307,
"grad_norm": 0.559359106257865,
"learning_rate": 7.91143696362335e-05,
"loss": 0.5275,
"step": 163
},
{
"epoch": 0.8118811881188119,
"grad_norm": 0.5429558850215297,
"learning_rate": 7.908520665793419e-05,
"loss": 0.5386,
"step": 164
},
{
"epoch": 0.8168316831683168,
"grad_norm": 0.44832679740257586,
"learning_rate": 7.905557682254429e-05,
"loss": 0.4974,
"step": 165
},
{
"epoch": 0.8217821782178217,
"grad_norm": 0.5580345366264446,
"learning_rate": 7.902548048398028e-05,
"loss": 0.5619,
"step": 166
},
{
"epoch": 0.8267326732673267,
"grad_norm": 0.5546502635861509,
"learning_rate": 7.89949180017308e-05,
"loss": 0.5476,
"step": 167
},
{
"epoch": 0.8316831683168316,
"grad_norm": 0.6024862192365625,
"learning_rate": 7.896388974085246e-05,
"loss": 0.5169,
"step": 168
},
{
"epoch": 0.8366336633663366,
"grad_norm": 0.5722278989041788,
"learning_rate": 7.893239607196537e-05,
"loss": 0.5199,
"step": 169
},
{
"epoch": 0.8415841584158416,
"grad_norm": 0.6539747541763438,
"learning_rate": 7.890043737124872e-05,
"loss": 0.5129,
"step": 170
},
{
"epoch": 0.8465346534653465,
"grad_norm": 0.46081078789593266,
"learning_rate": 7.886801402043639e-05,
"loss": 0.524,
"step": 171
},
{
"epoch": 0.8514851485148515,
"grad_norm": 0.686485354309828,
"learning_rate": 7.883512640681226e-05,
"loss": 0.5066,
"step": 172
},
{
"epoch": 0.8564356435643564,
"grad_norm": 0.5444571843193269,
"learning_rate": 7.880177492320565e-05,
"loss": 0.4786,
"step": 173
},
{
"epoch": 0.8613861386138614,
"grad_norm": 0.5012758064453766,
"learning_rate": 7.876795996798665e-05,
"loss": 0.5324,
"step": 174
},
{
"epoch": 0.8663366336633663,
"grad_norm": 0.5757321243521072,
"learning_rate": 7.873368194506131e-05,
"loss": 0.5004,
"step": 175
},
{
"epoch": 0.8712871287128713,
"grad_norm": 0.45816342076746885,
"learning_rate": 7.869894126386684e-05,
"loss": 0.53,
"step": 176
},
{
"epoch": 0.8762376237623762,
"grad_norm": 0.6096807187009159,
"learning_rate": 7.866373833936673e-05,
"loss": 0.5656,
"step": 177
},
{
"epoch": 0.8811881188118812,
"grad_norm": 0.5343072779664616,
"learning_rate": 7.862807359204574e-05,
"loss": 0.5194,
"step": 178
},
{
"epoch": 0.8861386138613861,
"grad_norm": 0.545896110746946,
"learning_rate": 7.859194744790498e-05,
"loss": 0.5209,
"step": 179
},
{
"epoch": 0.8910891089108911,
"grad_norm": 0.636163192425731,
"learning_rate": 7.855536033845673e-05,
"loss": 0.5522,
"step": 180
},
{
"epoch": 0.8960396039603961,
"grad_norm": 0.5038436060871251,
"learning_rate": 7.851831270071929e-05,
"loss": 0.5448,
"step": 181
},
{
"epoch": 0.900990099009901,
"grad_norm": 0.6024576745455106,
"learning_rate": 7.848080497721181e-05,
"loss": 0.4903,
"step": 182
},
{
"epoch": 0.905940594059406,
"grad_norm": 0.6076356159222136,
"learning_rate": 7.844283761594899e-05,
"loss": 0.5739,
"step": 183
},
{
"epoch": 0.9108910891089109,
"grad_norm": 0.6349903742004732,
"learning_rate": 7.84044110704357e-05,
"loss": 0.5619,
"step": 184
},
{
"epoch": 0.9158415841584159,
"grad_norm": 0.5558020572859232,
"learning_rate": 7.83655257996616e-05,
"loss": 0.5208,
"step": 185
},
{
"epoch": 0.9207920792079208,
"grad_norm": 0.43756054193606436,
"learning_rate": 7.83261822680956e-05,
"loss": 0.5377,
"step": 186
},
{
"epoch": 0.9257425742574258,
"grad_norm": 0.4905324957197573,
"learning_rate": 7.828638094568041e-05,
"loss": 0.5466,
"step": 187
},
{
"epoch": 0.9306930693069307,
"grad_norm": 0.4780144225156857,
"learning_rate": 7.824612230782681e-05,
"loss": 0.546,
"step": 188
},
{
"epoch": 0.9356435643564357,
"grad_norm": 0.538508122143981,
"learning_rate": 7.820540683540808e-05,
"loss": 0.5027,
"step": 189
},
{
"epoch": 0.9405940594059405,
"grad_norm": 0.49556896212553525,
"learning_rate": 7.816423501475415e-05,
"loss": 0.5166,
"step": 190
},
{
"epoch": 0.9455445544554455,
"grad_norm": 0.4279890375269027,
"learning_rate": 7.812260733764591e-05,
"loss": 0.5356,
"step": 191
},
{
"epoch": 0.9504950495049505,
"grad_norm": 0.4943909611747402,
"learning_rate": 7.80805243013092e-05,
"loss": 0.5294,
"step": 192
},
{
"epoch": 0.9554455445544554,
"grad_norm": 0.4663007752860832,
"learning_rate": 7.803798640840901e-05,
"loss": 0.5154,
"step": 193
},
{
"epoch": 0.9603960396039604,
"grad_norm": 0.3908137583747954,
"learning_rate": 7.799499416704338e-05,
"loss": 0.4997,
"step": 194
},
{
"epoch": 0.9653465346534653,
"grad_norm": 0.5228603752159334,
"learning_rate": 7.795154809073735e-05,
"loss": 0.5262,
"step": 195
},
{
"epoch": 0.9702970297029703,
"grad_norm": 0.42838683826087115,
"learning_rate": 7.790764869843684e-05,
"loss": 0.4861,
"step": 196
},
{
"epoch": 0.9752475247524752,
"grad_norm": 0.4055948196662326,
"learning_rate": 7.786329651450248e-05,
"loss": 0.4859,
"step": 197
},
{
"epoch": 0.9801980198019802,
"grad_norm": 0.5966678290575969,
"learning_rate": 7.781849206870325e-05,
"loss": 0.5226,
"step": 198
},
{
"epoch": 0.9851485148514851,
"grad_norm": 1.880711225126043,
"learning_rate": 7.77732358962103e-05,
"loss": 0.5851,
"step": 199
},
{
"epoch": 0.9900990099009901,
"grad_norm": 0.6614925497720111,
"learning_rate": 7.772752853759039e-05,
"loss": 0.5411,
"step": 200
},
{
"epoch": 0.995049504950495,
"grad_norm": 0.46056442318357677,
"learning_rate": 7.768137053879957e-05,
"loss": 0.5579,
"step": 201
},
{
"epoch": 1.0,
"grad_norm": 0.5087524233313512,
"learning_rate": 7.763476245117659e-05,
"loss": 0.5128,
"step": 202
},
{
"epoch": 1.004950495049505,
"grad_norm": 0.6518551254154236,
"learning_rate": 7.758770483143634e-05,
"loss": 0.4381,
"step": 203
},
{
"epoch": 1.00990099009901,
"grad_norm": 0.4785460623442812,
"learning_rate": 7.754019824166318e-05,
"loss": 0.4398,
"step": 204
},
{
"epoch": 1.0148514851485149,
"grad_norm": 0.6034904750730884,
"learning_rate": 7.749224324930421e-05,
"loss": 0.4319,
"step": 205
},
{
"epoch": 1.0198019801980198,
"grad_norm": 0.6008863564030689,
"learning_rate": 7.744384042716258e-05,
"loss": 0.437,
"step": 206
},
{
"epoch": 1.0247524752475248,
"grad_norm": 1.030655464186979,
"learning_rate": 7.739499035339055e-05,
"loss": 0.4395,
"step": 207
},
{
"epoch": 1.0297029702970297,
"grad_norm": 0.7109993890828257,
"learning_rate": 7.734569361148262e-05,
"loss": 0.3941,
"step": 208
},
{
"epoch": 1.0346534653465347,
"grad_norm": 0.6473203544118956,
"learning_rate": 7.729595079026856e-05,
"loss": 0.3682,
"step": 209
},
{
"epoch": 1.0396039603960396,
"grad_norm": 0.5274771133610835,
"learning_rate": 7.724576248390639e-05,
"loss": 0.4189,
"step": 210
},
{
"epoch": 1.0445544554455446,
"grad_norm": 0.575369977811731,
"learning_rate": 7.719512929187527e-05,
"loss": 0.3941,
"step": 211
},
{
"epoch": 1.0495049504950495,
"grad_norm": 0.5032659995015568,
"learning_rate": 7.714405181896831e-05,
"loss": 0.4178,
"step": 212
},
{
"epoch": 1.0544554455445545,
"grad_norm": 0.542411447489868,
"learning_rate": 7.709253067528545e-05,
"loss": 0.4335,
"step": 213
},
{
"epoch": 1.0594059405940595,
"grad_norm": 0.4825982906183251,
"learning_rate": 7.704056647622603e-05,
"loss": 0.4404,
"step": 214
},
{
"epoch": 1.0643564356435644,
"grad_norm": 0.8410004042826386,
"learning_rate": 7.698815984248152e-05,
"loss": 0.4403,
"step": 215
},
{
"epoch": 1.0693069306930694,
"grad_norm": 0.4351215223126758,
"learning_rate": 7.693531140002811e-05,
"loss": 0.3886,
"step": 216
},
{
"epoch": 1.0742574257425743,
"grad_norm": 0.4904375867707524,
"learning_rate": 7.688202178011921e-05,
"loss": 0.447,
"step": 217
},
{
"epoch": 1.0792079207920793,
"grad_norm": 0.5298422562122228,
"learning_rate": 7.682829161927794e-05,
"loss": 0.4621,
"step": 218
},
{
"epoch": 1.0841584158415842,
"grad_norm": 0.48790093011894936,
"learning_rate": 7.677412155928946e-05,
"loss": 0.4223,
"step": 219
},
{
"epoch": 1.0891089108910892,
"grad_norm": 0.5015700826758351,
"learning_rate": 7.671951224719339e-05,
"loss": 0.4163,
"step": 220
},
{
"epoch": 1.0940594059405941,
"grad_norm": 0.9272870286113828,
"learning_rate": 7.666446433527601e-05,
"loss": 0.4401,
"step": 221
},
{
"epoch": 1.099009900990099,
"grad_norm": 0.4499978436086613,
"learning_rate": 7.660897848106251e-05,
"loss": 0.3956,
"step": 222
},
{
"epoch": 1.103960396039604,
"grad_norm": 0.6270774879370736,
"learning_rate": 7.655305534730916e-05,
"loss": 0.4191,
"step": 223
},
{
"epoch": 1.108910891089109,
"grad_norm": 0.47140106769279483,
"learning_rate": 7.649669560199528e-05,
"loss": 0.4275,
"step": 224
},
{
"epoch": 1.113861386138614,
"grad_norm": 0.4632505115641616,
"learning_rate": 7.643989991831541e-05,
"loss": 0.4198,
"step": 225
},
{
"epoch": 1.118811881188119,
"grad_norm": 0.5386755164319672,
"learning_rate": 7.638266897467117e-05,
"loss": 0.4001,
"step": 226
},
{
"epoch": 1.1237623762376239,
"grad_norm": 0.7288819939926411,
"learning_rate": 7.632500345466318e-05,
"loss": 0.4044,
"step": 227
},
{
"epoch": 1.1287128712871288,
"grad_norm": 0.6778386664812662,
"learning_rate": 7.62669040470829e-05,
"loss": 0.3975,
"step": 228
},
{
"epoch": 1.1336633663366338,
"grad_norm": 0.5286983261610211,
"learning_rate": 7.620837144590444e-05,
"loss": 0.4159,
"step": 229
},
{
"epoch": 1.1386138613861387,
"grad_norm": 2.652372059772227,
"learning_rate": 7.61494063502762e-05,
"loss": 0.4325,
"step": 230
},
{
"epoch": 1.1435643564356435,
"grad_norm": 4.809909490200282,
"learning_rate": 7.609000946451255e-05,
"loss": 0.4619,
"step": 231
},
{
"epoch": 1.1485148514851484,
"grad_norm": 0.8647324013296295,
"learning_rate": 7.603018149808542e-05,
"loss": 0.407,
"step": 232
},
{
"epoch": 1.1534653465346534,
"grad_norm": 0.5195039166414599,
"learning_rate": 7.596992316561583e-05,
"loss": 0.4496,
"step": 233
},
{
"epoch": 1.1584158415841583,
"grad_norm": 0.8504141877412958,
"learning_rate": 7.590923518686537e-05,
"loss": 0.4621,
"step": 234
},
{
"epoch": 1.1633663366336633,
"grad_norm": 0.7696908561259985,
"learning_rate": 7.584811828672755e-05,
"loss": 0.4744,
"step": 235
},
{
"epoch": 1.1683168316831682,
"grad_norm": 0.9598562979473423,
"learning_rate": 7.578657319521918e-05,
"loss": 0.4069,
"step": 236
},
{
"epoch": 1.1732673267326732,
"grad_norm": 0.5710619856339312,
"learning_rate": 7.572460064747167e-05,
"loss": 0.403,
"step": 237
},
{
"epoch": 1.1782178217821782,
"grad_norm": 0.4005092217182952,
"learning_rate": 7.56622013837222e-05,
"loss": 0.3622,
"step": 238
},
{
"epoch": 1.183168316831683,
"grad_norm": 0.48090434654331593,
"learning_rate": 7.55993761493049e-05,
"loss": 0.4021,
"step": 239
},
{
"epoch": 1.188118811881188,
"grad_norm": 0.5703687588411889,
"learning_rate": 7.553612569464197e-05,
"loss": 0.4375,
"step": 240
},
{
"epoch": 1.193069306930693,
"grad_norm": 0.681277315988626,
"learning_rate": 7.547245077523466e-05,
"loss": 0.4241,
"step": 241
},
{
"epoch": 1.198019801980198,
"grad_norm": 1.0094469420963084,
"learning_rate": 7.540835215165431e-05,
"loss": 0.4057,
"step": 242
},
{
"epoch": 1.202970297029703,
"grad_norm": 0.46952644922814796,
"learning_rate": 7.534383058953321e-05,
"loss": 0.4154,
"step": 243
},
{
"epoch": 1.2079207920792079,
"grad_norm": 0.459477309180552,
"learning_rate": 7.527888685955551e-05,
"loss": 0.3915,
"step": 244
},
{
"epoch": 1.2128712871287128,
"grad_norm": 0.5196788210280033,
"learning_rate": 7.5213521737448e-05,
"loss": 0.4354,
"step": 245
},
{
"epoch": 1.2178217821782178,
"grad_norm": 0.44785808199518773,
"learning_rate": 7.514773600397076e-05,
"loss": 0.3803,
"step": 246
},
{
"epoch": 1.2227722772277227,
"grad_norm": 0.4475790218783314,
"learning_rate": 7.508153044490796e-05,
"loss": 0.3788,
"step": 247
},
{
"epoch": 1.2277227722772277,
"grad_norm": 0.42812642794099415,
"learning_rate": 7.50149058510584e-05,
"loss": 0.4033,
"step": 248
},
{
"epoch": 1.2326732673267327,
"grad_norm": 0.46772116937732844,
"learning_rate": 7.494786301822611e-05,
"loss": 0.4613,
"step": 249
},
{
"epoch": 1.2376237623762376,
"grad_norm": 0.42095730344467575,
"learning_rate": 7.488040274721077e-05,
"loss": 0.4129,
"step": 250
},
{
"epoch": 1.2425742574257426,
"grad_norm": 0.5026705098166492,
"learning_rate": 7.481252584379822e-05,
"loss": 0.4333,
"step": 251
},
{
"epoch": 1.2475247524752475,
"grad_norm": 0.5379197167928285,
"learning_rate": 7.47442331187508e-05,
"loss": 0.4521,
"step": 252
},
{
"epoch": 1.2524752475247525,
"grad_norm": 0.4761452900609588,
"learning_rate": 7.467552538779768e-05,
"loss": 0.3855,
"step": 253
},
{
"epoch": 1.2574257425742574,
"grad_norm": 0.42059836224797353,
"learning_rate": 7.460640347162508e-05,
"loss": 0.4074,
"step": 254
},
{
"epoch": 1.2623762376237624,
"grad_norm": 0.4919545295388046,
"learning_rate": 7.453686819586655e-05,
"loss": 0.4517,
"step": 255
},
{
"epoch": 1.2673267326732673,
"grad_norm": 0.5104604218051465,
"learning_rate": 7.4466920391093e-05,
"loss": 0.4044,
"step": 256
},
{
"epoch": 1.2722772277227723,
"grad_norm": 0.3973603962169888,
"learning_rate": 7.439656089280286e-05,
"loss": 0.3884,
"step": 257
},
{
"epoch": 1.2772277227722773,
"grad_norm": 0.616310935251882,
"learning_rate": 7.432579054141208e-05,
"loss": 0.4877,
"step": 258
},
{
"epoch": 1.2821782178217822,
"grad_norm": 0.4153672619602748,
"learning_rate": 7.425461018224406e-05,
"loss": 0.4104,
"step": 259
},
{
"epoch": 1.2871287128712872,
"grad_norm": 0.39941726347863327,
"learning_rate": 7.418302066551959e-05,
"loss": 0.4158,
"step": 260
},
{
"epoch": 1.2920792079207921,
"grad_norm": 0.48616898482849413,
"learning_rate": 7.411102284634672e-05,
"loss": 0.4134,
"step": 261
},
{
"epoch": 1.297029702970297,
"grad_norm": 0.5691458690856808,
"learning_rate": 7.403861758471043e-05,
"loss": 0.437,
"step": 262
},
{
"epoch": 1.301980198019802,
"grad_norm": 0.4525002043690617,
"learning_rate": 7.396580574546251e-05,
"loss": 0.4251,
"step": 263
},
{
"epoch": 1.306930693069307,
"grad_norm": 0.41152996338869813,
"learning_rate": 7.38925881983111e-05,
"loss": 0.4301,
"step": 264
},
{
"epoch": 1.311881188118812,
"grad_norm": 0.5199728322754111,
"learning_rate": 7.381896581781042e-05,
"loss": 0.4614,
"step": 265
},
{
"epoch": 1.316831683168317,
"grad_norm": 0.5732853732578159,
"learning_rate": 7.37449394833502e-05,
"loss": 0.4662,
"step": 266
},
{
"epoch": 1.3217821782178218,
"grad_norm": 0.386991910628394,
"learning_rate": 7.367051007914527e-05,
"loss": 0.4306,
"step": 267
},
{
"epoch": 1.3267326732673268,
"grad_norm": 0.3939310664362404,
"learning_rate": 7.359567849422496e-05,
"loss": 0.4192,
"step": 268
},
{
"epoch": 1.3316831683168318,
"grad_norm": 0.4972811754620147,
"learning_rate": 7.352044562242248e-05,
"loss": 0.4362,
"step": 269
},
{
"epoch": 1.3366336633663367,
"grad_norm": 0.3929729126966092,
"learning_rate": 7.344481236236428e-05,
"loss": 0.3945,
"step": 270
},
{
"epoch": 1.3415841584158417,
"grad_norm": 0.44752443075284315,
"learning_rate": 7.336877961745926e-05,
"loss": 0.3867,
"step": 271
},
{
"epoch": 1.3465346534653464,
"grad_norm": 0.5877124955257983,
"learning_rate": 7.329234829588798e-05,
"loss": 0.44,
"step": 272
},
{
"epoch": 1.3514851485148514,
"grad_norm": 0.40997637387949215,
"learning_rate": 7.321551931059191e-05,
"loss": 0.3722,
"step": 273
},
{
"epoch": 1.3564356435643563,
"grad_norm": 0.5657538530990273,
"learning_rate": 7.313829357926238e-05,
"loss": 0.4267,
"step": 274
},
{
"epoch": 1.3613861386138613,
"grad_norm": 0.3756317827004364,
"learning_rate": 7.306067202432976e-05,
"loss": 0.3908,
"step": 275
},
{
"epoch": 1.3663366336633662,
"grad_norm": 0.5546385290550994,
"learning_rate": 7.29826555729523e-05,
"loss": 0.4374,
"step": 276
},
{
"epoch": 1.3712871287128712,
"grad_norm": 0.4499102203305975,
"learning_rate": 7.290424515700519e-05,
"loss": 0.4315,
"step": 277
},
{
"epoch": 1.3762376237623761,
"grad_norm": 0.4163376224541207,
"learning_rate": 7.282544171306933e-05,
"loss": 0.384,
"step": 278
},
{
"epoch": 1.381188118811881,
"grad_norm": 0.495665339007046,
"learning_rate": 7.274624618242022e-05,
"loss": 0.4333,
"step": 279
},
{
"epoch": 1.386138613861386,
"grad_norm": 1.2421659532566276,
"learning_rate": 7.266665951101664e-05,
"loss": 0.3935,
"step": 280
},
{
"epoch": 1.391089108910891,
"grad_norm": 0.45672463011046993,
"learning_rate": 7.258668264948941e-05,
"loss": 0.3962,
"step": 281
},
{
"epoch": 1.396039603960396,
"grad_norm": 0.5235819000103409,
"learning_rate": 7.250631655313001e-05,
"loss": 0.4424,
"step": 282
},
{
"epoch": 1.400990099009901,
"grad_norm": 0.6014413984892761,
"learning_rate": 7.242556218187919e-05,
"loss": 0.4075,
"step": 283
},
{
"epoch": 1.4059405940594059,
"grad_norm": 0.5122752409469894,
"learning_rate": 7.234442050031543e-05,
"loss": 0.4478,
"step": 284
},
{
"epoch": 1.4108910891089108,
"grad_norm": 0.707055920708386,
"learning_rate": 7.226289247764354e-05,
"loss": 0.4558,
"step": 285
},
{
"epoch": 1.4158415841584158,
"grad_norm": 0.4096719374369129,
"learning_rate": 7.2180979087683e-05,
"loss": 0.3839,
"step": 286
},
{
"epoch": 1.4207920792079207,
"grad_norm": 0.4493556023975073,
"learning_rate": 7.209868130885634e-05,
"loss": 0.3971,
"step": 287
},
{
"epoch": 1.4257425742574257,
"grad_norm": 0.40448122098989603,
"learning_rate": 7.201600012417745e-05,
"loss": 0.405,
"step": 288
},
{
"epoch": 1.4306930693069306,
"grad_norm": 0.559370368504956,
"learning_rate": 7.193293652123989e-05,
"loss": 0.5013,
"step": 289
},
{
"epoch": 1.4356435643564356,
"grad_norm": 0.48070543113884184,
"learning_rate": 7.1849491492205e-05,
"loss": 0.4181,
"step": 290
},
{
"epoch": 1.4405940594059405,
"grad_norm": 0.4710653868134633,
"learning_rate": 7.176566603379015e-05,
"loss": 0.4373,
"step": 291
},
{
"epoch": 1.4455445544554455,
"grad_norm": 0.39331471032696247,
"learning_rate": 7.168146114725673e-05,
"loss": 0.4568,
"step": 292
},
{
"epoch": 1.4504950495049505,
"grad_norm": 0.4248344484638374,
"learning_rate": 7.159687783839832e-05,
"loss": 0.4448,
"step": 293
},
{
"epoch": 1.4554455445544554,
"grad_norm": 0.43034402237228425,
"learning_rate": 7.151191711752854e-05,
"loss": 0.4144,
"step": 294
},
{
"epoch": 1.4603960396039604,
"grad_norm": 0.40716737449552437,
"learning_rate": 7.142657999946906e-05,
"loss": 0.4052,
"step": 295
},
{
"epoch": 1.4653465346534653,
"grad_norm": 0.4458812208157574,
"learning_rate": 7.134086750353747e-05,
"loss": 0.4181,
"step": 296
},
{
"epoch": 1.4702970297029703,
"grad_norm": 0.5814322067144533,
"learning_rate": 7.125478065353512e-05,
"loss": 0.4153,
"step": 297
},
{
"epoch": 1.4752475247524752,
"grad_norm": 0.3957086261289198,
"learning_rate": 7.116832047773484e-05,
"loss": 0.4201,
"step": 298
},
{
"epoch": 1.4801980198019802,
"grad_norm": 0.4666211020909172,
"learning_rate": 7.108148800886869e-05,
"loss": 0.4236,
"step": 299
},
{
"epoch": 1.4851485148514851,
"grad_norm": 0.4369532098868193,
"learning_rate": 7.09942842841156e-05,
"loss": 0.425,
"step": 300
},
{
"epoch": 1.49009900990099,
"grad_norm": 0.4127291956437661,
"learning_rate": 7.090671034508905e-05,
"loss": 0.383,
"step": 301
},
{
"epoch": 1.495049504950495,
"grad_norm": 0.38608337390531794,
"learning_rate": 7.081876723782457e-05,
"loss": 0.3782,
"step": 302
},
{
"epoch": 1.5,
"grad_norm": 0.3999354505178847,
"learning_rate": 7.073045601276723e-05,
"loss": 0.4006,
"step": 303
},
{
"epoch": 1.504950495049505,
"grad_norm": 0.4460616369287887,
"learning_rate": 7.064177772475912e-05,
"loss": 0.4248,
"step": 304
},
{
"epoch": 1.50990099009901,
"grad_norm": 1.617368168957565,
"learning_rate": 7.05527334330268e-05,
"loss": 0.4058,
"step": 305
},
{
"epoch": 1.5148514851485149,
"grad_norm": 0.4205841237338601,
"learning_rate": 7.046332420116852e-05,
"loss": 0.4019,
"step": 306
},
{
"epoch": 1.5198019801980198,
"grad_norm": 0.4590401658758068,
"learning_rate": 7.037355109714165e-05,
"loss": 0.4375,
"step": 307
},
{
"epoch": 1.5247524752475248,
"grad_norm": 0.369456284577221,
"learning_rate": 7.028341519324985e-05,
"loss": 0.3981,
"step": 308
},
{
"epoch": 1.5297029702970297,
"grad_norm": 0.45818633341019355,
"learning_rate": 7.019291756613029e-05,
"loss": 0.4329,
"step": 309
},
{
"epoch": 1.5346534653465347,
"grad_norm": 0.41006878674759506,
"learning_rate": 7.010205929674075e-05,
"loss": 0.4358,
"step": 310
},
{
"epoch": 1.5396039603960396,
"grad_norm": 0.3660852415579547,
"learning_rate": 7.001084147034676e-05,
"loss": 0.3925,
"step": 311
},
{
"epoch": 1.5445544554455446,
"grad_norm": 0.49311989334696543,
"learning_rate": 6.99192651765086e-05,
"loss": 0.4045,
"step": 312
},
{
"epoch": 1.5495049504950495,
"grad_norm": 0.4224653494426318,
"learning_rate": 6.982733150906833e-05,
"loss": 0.4549,
"step": 313
},
{
"epoch": 1.5544554455445545,
"grad_norm": 0.441676447696254,
"learning_rate": 6.973504156613666e-05,
"loss": 0.4186,
"step": 314
},
{
"epoch": 1.5594059405940595,
"grad_norm": 0.39314186157633607,
"learning_rate": 6.964239645007989e-05,
"loss": 0.3917,
"step": 315
},
{
"epoch": 1.5643564356435644,
"grad_norm": 0.36195381366820006,
"learning_rate": 6.954939726750667e-05,
"loss": 0.3886,
"step": 316
},
{
"epoch": 1.5693069306930694,
"grad_norm": 0.38026650238359444,
"learning_rate": 6.945604512925493e-05,
"loss": 0.3953,
"step": 317
},
{
"epoch": 1.5742574257425743,
"grad_norm": 0.4727277048957753,
"learning_rate": 6.936234115037842e-05,
"loss": 0.4081,
"step": 318
},
{
"epoch": 1.5792079207920793,
"grad_norm": 0.4122641931550936,
"learning_rate": 6.926828645013353e-05,
"loss": 0.4096,
"step": 319
},
{
"epoch": 1.5841584158415842,
"grad_norm": 0.41972741676629705,
"learning_rate": 6.917388215196585e-05,
"loss": 0.4325,
"step": 320
},
{
"epoch": 1.5891089108910892,
"grad_norm": 0.37213910332961514,
"learning_rate": 6.907912938349682e-05,
"loss": 0.408,
"step": 321
},
{
"epoch": 1.5940594059405941,
"grad_norm": 0.4079206922373049,
"learning_rate": 6.898402927651019e-05,
"loss": 0.3932,
"step": 322
},
{
"epoch": 1.599009900990099,
"grad_norm": 0.5391239274082024,
"learning_rate": 6.88885829669385e-05,
"loss": 0.4413,
"step": 323
},
{
"epoch": 1.603960396039604,
"grad_norm": 0.3304796112354852,
"learning_rate": 6.879279159484961e-05,
"loss": 0.3793,
"step": 324
},
{
"epoch": 1.608910891089109,
"grad_norm": 0.4565949481665514,
"learning_rate": 6.869665630443295e-05,
"loss": 0.4088,
"step": 325
},
{
"epoch": 1.613861386138614,
"grad_norm": 0.43115710841031746,
"learning_rate": 6.860017824398595e-05,
"loss": 0.4024,
"step": 326
},
{
"epoch": 1.618811881188119,
"grad_norm": 0.4473410186927358,
"learning_rate": 6.85033585659003e-05,
"loss": 0.4,
"step": 327
},
{
"epoch": 1.6237623762376239,
"grad_norm": 0.34319850357607024,
"learning_rate": 6.84061984266481e-05,
"loss": 0.3749,
"step": 328
},
{
"epoch": 1.6287128712871288,
"grad_norm": 0.3991717496121638,
"learning_rate": 6.830869898676822e-05,
"loss": 0.4249,
"step": 329
},
{
"epoch": 1.6336633663366338,
"grad_norm": 0.3669421848461611,
"learning_rate": 6.82108614108523e-05,
"loss": 0.3747,
"step": 330
},
{
"epoch": 1.6386138613861387,
"grad_norm": 0.40366481969746204,
"learning_rate": 6.811268686753086e-05,
"loss": 0.4194,
"step": 331
},
{
"epoch": 1.6435643564356437,
"grad_norm": 0.33856718283042686,
"learning_rate": 6.801417652945939e-05,
"loss": 0.4048,
"step": 332
},
{
"epoch": 1.6485148514851486,
"grad_norm": 0.38286010896252787,
"learning_rate": 6.79153315733043e-05,
"loss": 0.4078,
"step": 333
},
{
"epoch": 1.6534653465346536,
"grad_norm": 0.41697274751788355,
"learning_rate": 6.781615317972886e-05,
"loss": 0.4177,
"step": 334
},
{
"epoch": 1.6584158415841586,
"grad_norm": 0.5184797797560337,
"learning_rate": 6.771664253337916e-05,
"loss": 0.4306,
"step": 335
},
{
"epoch": 1.6633663366336635,
"grad_norm": 0.38783387993392643,
"learning_rate": 6.761680082286988e-05,
"loss": 0.394,
"step": 336
},
{
"epoch": 1.6683168316831685,
"grad_norm": 0.3428733325169249,
"learning_rate": 6.751662924077015e-05,
"loss": 0.3672,
"step": 337
},
{
"epoch": 1.6732673267326734,
"grad_norm": 0.38619372406045765,
"learning_rate": 6.741612898358924e-05,
"loss": 0.4151,
"step": 338
},
{
"epoch": 1.6782178217821784,
"grad_norm": 0.5825910251609712,
"learning_rate": 6.731530125176237e-05,
"loss": 0.3999,
"step": 339
},
{
"epoch": 1.6831683168316833,
"grad_norm": 0.38484297742645873,
"learning_rate": 6.721414724963631e-05,
"loss": 0.4128,
"step": 340
},
{
"epoch": 1.688118811881188,
"grad_norm": 0.3374983247266092,
"learning_rate": 6.711266818545494e-05,
"loss": 0.4031,
"step": 341
},
{
"epoch": 1.693069306930693,
"grad_norm": 0.3826217100514408,
"learning_rate": 6.701086527134491e-05,
"loss": 0.4009,
"step": 342
},
{
"epoch": 1.698019801980198,
"grad_norm": 0.4591363006682774,
"learning_rate": 6.690873972330116e-05,
"loss": 0.4228,
"step": 343
},
{
"epoch": 1.702970297029703,
"grad_norm": 0.40260910474197376,
"learning_rate": 6.68062927611723e-05,
"loss": 0.3947,
"step": 344
},
{
"epoch": 1.7079207920792079,
"grad_norm": 0.4397536768863476,
"learning_rate": 6.670352560864615e-05,
"loss": 0.3908,
"step": 345
},
{
"epoch": 1.7128712871287128,
"grad_norm": 0.43329899477194356,
"learning_rate": 6.660043949323505e-05,
"loss": 0.4372,
"step": 346
},
{
"epoch": 1.7178217821782178,
"grad_norm": 0.4602501332408058,
"learning_rate": 6.649703564626125e-05,
"loss": 0.3841,
"step": 347
},
{
"epoch": 1.7227722772277227,
"grad_norm": 0.35344976361552355,
"learning_rate": 6.639331530284214e-05,
"loss": 0.3727,
"step": 348
},
{
"epoch": 1.7277227722772277,
"grad_norm": 0.45197956712151793,
"learning_rate": 6.628927970187557e-05,
"loss": 0.3986,
"step": 349
},
{
"epoch": 1.7326732673267327,
"grad_norm": 0.3319696461521242,
"learning_rate": 6.618493008602496e-05,
"loss": 0.416,
"step": 350
},
{
"epoch": 1.7376237623762376,
"grad_norm": 0.4240346650785551,
"learning_rate": 6.608026770170459e-05,
"loss": 0.4162,
"step": 351
},
{
"epoch": 1.7425742574257426,
"grad_norm": 0.37060686199685133,
"learning_rate": 6.597529379906455e-05,
"loss": 0.4134,
"step": 352
},
{
"epoch": 1.7475247524752475,
"grad_norm": 0.36029614953276784,
"learning_rate": 6.587000963197598e-05,
"loss": 0.4135,
"step": 353
},
{
"epoch": 1.7524752475247525,
"grad_norm": 0.3512525066622723,
"learning_rate": 6.576441645801592e-05,
"loss": 0.3958,
"step": 354
},
{
"epoch": 1.7574257425742574,
"grad_norm": 0.426444933184236,
"learning_rate": 6.565851553845242e-05,
"loss": 0.4306,
"step": 355
},
{
"epoch": 1.7623762376237624,
"grad_norm": 0.45810053308481247,
"learning_rate": 6.555230813822942e-05,
"loss": 0.4515,
"step": 356
},
{
"epoch": 1.7673267326732673,
"grad_norm": 0.3257359743366414,
"learning_rate": 6.544579552595165e-05,
"loss": 0.3882,
"step": 357
},
{
"epoch": 1.7722772277227723,
"grad_norm": 0.5120549397179133,
"learning_rate": 6.533897897386946e-05,
"loss": 0.3977,
"step": 358
},
{
"epoch": 1.7772277227722773,
"grad_norm": 0.3741740274690065,
"learning_rate": 6.523185975786366e-05,
"loss": 0.389,
"step": 359
},
{
"epoch": 1.7821782178217822,
"grad_norm": 0.42787211118613316,
"learning_rate": 6.512443915743024e-05,
"loss": 0.4786,
"step": 360
},
{
"epoch": 1.7871287128712872,
"grad_norm": 0.42024261733771173,
"learning_rate": 6.501671845566512e-05,
"loss": 0.4744,
"step": 361
},
{
"epoch": 1.7920792079207921,
"grad_norm": 0.33419863088736074,
"learning_rate": 6.49086989392488e-05,
"loss": 0.4185,
"step": 362
},
{
"epoch": 1.797029702970297,
"grad_norm": 0.35002307084669404,
"learning_rate": 6.480038189843101e-05,
"loss": 0.4146,
"step": 363
},
{
"epoch": 1.801980198019802,
"grad_norm": 0.35584559302997615,
"learning_rate": 6.469176862701529e-05,
"loss": 0.3884,
"step": 364
},
{
"epoch": 1.806930693069307,
"grad_norm": 0.3811447477456138,
"learning_rate": 6.458286042234352e-05,
"loss": 0.416,
"step": 365
},
{
"epoch": 1.811881188118812,
"grad_norm": 0.3853982786626955,
"learning_rate": 6.447365858528046e-05,
"loss": 0.4125,
"step": 366
},
{
"epoch": 1.8168316831683167,
"grad_norm": 0.3630198314521477,
"learning_rate": 6.436416442019817e-05,
"loss": 0.4156,
"step": 367
},
{
"epoch": 1.8217821782178216,
"grad_norm": 0.39036383937421515,
"learning_rate": 6.425437923496045e-05,
"loss": 0.4063,
"step": 368
},
{
"epoch": 1.8267326732673266,
"grad_norm": 0.34303444284998674,
"learning_rate": 6.414430434090725e-05,
"loss": 0.3907,
"step": 369
},
{
"epoch": 1.8316831683168315,
"grad_norm": 0.3721425083007295,
"learning_rate": 6.403394105283897e-05,
"loss": 0.3844,
"step": 370
},
{
"epoch": 1.8366336633663365,
"grad_norm": 0.3728755734239106,
"learning_rate": 6.392329068900072e-05,
"loss": 0.3786,
"step": 371
},
{
"epoch": 1.8415841584158414,
"grad_norm": 0.42539329737431825,
"learning_rate": 6.381235457106664e-05,
"loss": 0.4059,
"step": 372
},
{
"epoch": 1.8465346534653464,
"grad_norm": 0.3516158152148282,
"learning_rate": 6.370113402412412e-05,
"loss": 0.3877,
"step": 373
},
{
"epoch": 1.8514851485148514,
"grad_norm": 0.4715333524372133,
"learning_rate": 6.358963037665787e-05,
"loss": 0.408,
"step": 374
},
{
"epoch": 1.8564356435643563,
"grad_norm": 0.41249812553139303,
"learning_rate": 6.347784496053416e-05,
"loss": 0.38,
"step": 375
},
{
"epoch": 1.8613861386138613,
"grad_norm": 0.46013637363793264,
"learning_rate": 6.336577911098493e-05,
"loss": 0.3771,
"step": 376
},
{
"epoch": 1.8663366336633662,
"grad_norm": 0.4205308152403992,
"learning_rate": 6.325343416659166e-05,
"loss": 0.4055,
"step": 377
},
{
"epoch": 1.8712871287128712,
"grad_norm": 0.3766714102667099,
"learning_rate": 6.314081146926964e-05,
"loss": 0.4226,
"step": 378
},
{
"epoch": 1.8762376237623761,
"grad_norm": 0.39466868097270724,
"learning_rate": 6.302791236425169e-05,
"loss": 0.3764,
"step": 379
},
{
"epoch": 1.881188118811881,
"grad_norm": 0.41414658631461326,
"learning_rate": 6.291473820007227e-05,
"loss": 0.3937,
"step": 380
},
{
"epoch": 1.886138613861386,
"grad_norm": 0.4008669055354229,
"learning_rate": 6.280129032855132e-05,
"loss": 0.365,
"step": 381
},
{
"epoch": 1.891089108910891,
"grad_norm": 0.4299626259785867,
"learning_rate": 6.268757010477806e-05,
"loss": 0.3978,
"step": 382
},
{
"epoch": 1.896039603960396,
"grad_norm": 0.4070041784450541,
"learning_rate": 6.257357888709492e-05,
"loss": 0.4278,
"step": 383
},
{
"epoch": 1.900990099009901,
"grad_norm": 0.42982053279620297,
"learning_rate": 6.245931803708116e-05,
"loss": 0.4199,
"step": 384
},
{
"epoch": 1.9059405940594059,
"grad_norm": 0.4548088267032186,
"learning_rate": 6.234478891953674e-05,
"loss": 0.4105,
"step": 385
},
{
"epoch": 1.9108910891089108,
"grad_norm": 0.34938003297126835,
"learning_rate": 6.222999290246595e-05,
"loss": 0.3826,
"step": 386
},
{
"epoch": 1.9158415841584158,
"grad_norm": 0.4449921528368388,
"learning_rate": 6.211493135706109e-05,
"loss": 0.3915,
"step": 387
},
{
"epoch": 1.9207920792079207,
"grad_norm": 0.3954858861397491,
"learning_rate": 6.199960565768611e-05,
"loss": 0.4206,
"step": 388
},
{
"epoch": 1.9257425742574257,
"grad_norm": 0.3680314701826789,
"learning_rate": 6.188401718186013e-05,
"loss": 0.4084,
"step": 389
},
{
"epoch": 1.9306930693069306,
"grad_norm": 0.3616975564074992,
"learning_rate": 6.17681673102411e-05,
"loss": 0.3903,
"step": 390
},
{
"epoch": 1.9356435643564356,
"grad_norm": 0.37988021748884604,
"learning_rate": 6.165205742660915e-05,
"loss": 0.4257,
"step": 391
},
{
"epoch": 1.9405940594059405,
"grad_norm": 0.35051686029822304,
"learning_rate": 6.15356889178502e-05,
"loss": 0.4141,
"step": 392
},
{
"epoch": 1.9455445544554455,
"grad_norm": 0.36629291399470554,
"learning_rate": 6.141906317393934e-05,
"loss": 0.3573,
"step": 393
},
{
"epoch": 1.9504950495049505,
"grad_norm": 0.35465338755441933,
"learning_rate": 6.130218158792421e-05,
"loss": 0.3634,
"step": 394
},
{
"epoch": 1.9554455445544554,
"grad_norm": 0.3741800019861695,
"learning_rate": 6.118504555590843e-05,
"loss": 0.4103,
"step": 395
},
{
"epoch": 1.9603960396039604,
"grad_norm": 0.3845094498974814,
"learning_rate": 6.10676564770348e-05,
"loss": 0.4255,
"step": 396
},
{
"epoch": 1.9653465346534653,
"grad_norm": 0.3427607558490164,
"learning_rate": 6.0950015753468745e-05,
"loss": 0.3549,
"step": 397
},
{
"epoch": 1.9702970297029703,
"grad_norm": 0.38103519315598317,
"learning_rate": 6.083212479038143e-05,
"loss": 0.37,
"step": 398
},
{
"epoch": 1.9752475247524752,
"grad_norm": 0.349874345182412,
"learning_rate": 6.0713984995933016e-05,
"loss": 0.4148,
"step": 399
},
{
"epoch": 1.9801980198019802,
"grad_norm": 0.3526050935966009,
"learning_rate": 6.059559778125593e-05,
"loss": 0.4193,
"step": 400
},
{
"epoch": 1.9851485148514851,
"grad_norm": 0.41226502171796636,
"learning_rate": 6.0476964560437864e-05,
"loss": 0.4042,
"step": 401
},
{
"epoch": 1.99009900990099,
"grad_norm": 0.43735266521817034,
"learning_rate": 6.035808675050497e-05,
"loss": 0.4042,
"step": 402
},
{
"epoch": 1.995049504950495,
"grad_norm": 0.3808948433009949,
"learning_rate": 6.023896577140496e-05,
"loss": 0.4242,
"step": 403
},
{
"epoch": 2.0,
"grad_norm": 0.3753919394648301,
"learning_rate": 6.011960304599003e-05,
"loss": 0.3721,
"step": 404
},
{
"epoch": 2.004950495049505,
"grad_norm": 0.5337736086813397,
"learning_rate": 6.000000000000001e-05,
"loss": 0.256,
"step": 405
},
{
"epoch": 2.00990099009901,
"grad_norm": 0.43713310875092315,
"learning_rate": 5.988015806204521e-05,
"loss": 0.2488,
"step": 406
},
{
"epoch": 2.014851485148515,
"grad_norm": 0.4739558661506129,
"learning_rate": 5.9760078663589454e-05,
"loss": 0.2214,
"step": 407
},
{
"epoch": 2.01980198019802,
"grad_norm": 0.5637416820705298,
"learning_rate": 5.9639763238932893e-05,
"loss": 0.2541,
"step": 408
},
{
"epoch": 2.0247524752475248,
"grad_norm": 0.41314659967319184,
"learning_rate": 5.9519213225194944e-05,
"loss": 0.2242,
"step": 409
},
{
"epoch": 2.0297029702970297,
"grad_norm": 0.4476975471705969,
"learning_rate": 5.9398430062297104e-05,
"loss": 0.2436,
"step": 410
},
{
"epoch": 2.0346534653465347,
"grad_norm": 0.554086376519811,
"learning_rate": 5.9277415192945707e-05,
"loss": 0.2636,
"step": 411
},
{
"epoch": 2.0396039603960396,
"grad_norm": 0.3896325397871647,
"learning_rate": 5.915617006261475e-05,
"loss": 0.2163,
"step": 412
},
{
"epoch": 2.0445544554455446,
"grad_norm": 0.4211534758131753,
"learning_rate": 5.903469611952861e-05,
"loss": 0.2069,
"step": 413
},
{
"epoch": 2.0495049504950495,
"grad_norm": 0.4673961047376062,
"learning_rate": 5.891299481464473e-05,
"loss": 0.2357,
"step": 414
},
{
"epoch": 2.0544554455445545,
"grad_norm": 0.4699121939823616,
"learning_rate": 5.8791067601636305e-05,
"loss": 0.2132,
"step": 415
},
{
"epoch": 2.0594059405940595,
"grad_norm": 0.40250579961688465,
"learning_rate": 5.866891593687492e-05,
"loss": 0.2371,
"step": 416
},
{
"epoch": 2.0643564356435644,
"grad_norm": 0.4430137787396212,
"learning_rate": 5.8546541279413094e-05,
"loss": 0.2206,
"step": 417
},
{
"epoch": 2.0693069306930694,
"grad_norm": 0.6560412040889878,
"learning_rate": 5.842394509096699e-05,
"loss": 0.2548,
"step": 418
},
{
"epoch": 2.0742574257425743,
"grad_norm": 0.41799449349599727,
"learning_rate": 5.8301128835898814e-05,
"loss": 0.2098,
"step": 419
},
{
"epoch": 2.0792079207920793,
"grad_norm": 0.5457732239448745,
"learning_rate": 5.817809398119937e-05,
"loss": 0.2207,
"step": 420
},
{
"epoch": 2.0841584158415842,
"grad_norm": 0.4420461787241115,
"learning_rate": 5.805484199647059e-05,
"loss": 0.2348,
"step": 421
},
{
"epoch": 2.089108910891089,
"grad_norm": 0.34045786091490615,
"learning_rate": 5.7931374353907904e-05,
"loss": 0.237,
"step": 422
},
{
"epoch": 2.094059405940594,
"grad_norm": 0.5284323872556919,
"learning_rate": 5.780769252828268e-05,
"loss": 0.2465,
"step": 423
},
{
"epoch": 2.099009900990099,
"grad_norm": 0.38111779412176855,
"learning_rate": 5.768379799692469e-05,
"loss": 0.2203,
"step": 424
},
{
"epoch": 2.103960396039604,
"grad_norm": 0.48871360246565815,
"learning_rate": 5.7559692239704255e-05,
"loss": 0.2303,
"step": 425
},
{
"epoch": 2.108910891089109,
"grad_norm": 0.44331771149086596,
"learning_rate": 5.743537673901485e-05,
"loss": 0.2252,
"step": 426
},
{
"epoch": 2.113861386138614,
"grad_norm": 0.4268642412980195,
"learning_rate": 5.731085297975516e-05,
"loss": 0.2142,
"step": 427
},
{
"epoch": 2.118811881188119,
"grad_norm": 0.6275427494810922,
"learning_rate": 5.718612244931146e-05,
"loss": 0.2671,
"step": 428
},
{
"epoch": 2.123762376237624,
"grad_norm": 0.327732145674678,
"learning_rate": 5.706118663753982e-05,
"loss": 0.2165,
"step": 429
},
{
"epoch": 2.128712871287129,
"grad_norm": 0.4656441860841262,
"learning_rate": 5.6936047036748335e-05,
"loss": 0.2609,
"step": 430
},
{
"epoch": 2.133663366336634,
"grad_norm": 0.30479417092528327,
"learning_rate": 5.6810705141679246e-05,
"loss": 0.2076,
"step": 431
},
{
"epoch": 2.1386138613861387,
"grad_norm": 0.402408077735042,
"learning_rate": 5.6685162449491125e-05,
"loss": 0.2352,
"step": 432
},
{
"epoch": 2.1435643564356437,
"grad_norm": 0.3341797567078978,
"learning_rate": 5.655942045974101e-05,
"loss": 0.2223,
"step": 433
},
{
"epoch": 2.1485148514851486,
"grad_norm": 0.31991687492148513,
"learning_rate": 5.643348067436644e-05,
"loss": 0.2234,
"step": 434
},
{
"epoch": 2.1534653465346536,
"grad_norm": 0.31279738632606163,
"learning_rate": 5.6307344597667555e-05,
"loss": 0.2297,
"step": 435
},
{
"epoch": 2.1584158415841586,
"grad_norm": 0.33002843854360975,
"learning_rate": 5.6181013736289114e-05,
"loss": 0.2174,
"step": 436
},
{
"epoch": 2.1633663366336635,
"grad_norm": 0.35214742510811414,
"learning_rate": 5.605448959920251e-05,
"loss": 0.2256,
"step": 437
},
{
"epoch": 2.1683168316831685,
"grad_norm": 0.40557291840727117,
"learning_rate": 5.5927773697687726e-05,
"loss": 0.2428,
"step": 438
},
{
"epoch": 2.1732673267326734,
"grad_norm": 0.37799249434729715,
"learning_rate": 5.580086754531527e-05,
"loss": 0.204,
"step": 439
},
{
"epoch": 2.1782178217821784,
"grad_norm": 0.3420962200582764,
"learning_rate": 5.567377265792819e-05,
"loss": 0.2366,
"step": 440
},
{
"epoch": 2.1831683168316833,
"grad_norm": 0.3751006820518302,
"learning_rate": 5.554649055362381e-05,
"loss": 0.2337,
"step": 441
},
{
"epoch": 2.1881188118811883,
"grad_norm": 0.3571671609515524,
"learning_rate": 5.5419022752735764e-05,
"loss": 0.2266,
"step": 442
},
{
"epoch": 2.1930693069306932,
"grad_norm": 0.37320519253976664,
"learning_rate": 5.5291370777815693e-05,
"loss": 0.232,
"step": 443
},
{
"epoch": 2.198019801980198,
"grad_norm": 0.3161181093567384,
"learning_rate": 5.5163536153615185e-05,
"loss": 0.2235,
"step": 444
},
{
"epoch": 2.202970297029703,
"grad_norm": 0.46028647031599745,
"learning_rate": 5.503552040706744e-05,
"loss": 0.2503,
"step": 445
},
{
"epoch": 2.207920792079208,
"grad_norm": 0.3193257025863596,
"learning_rate": 5.490732506726911e-05,
"loss": 0.2317,
"step": 446
},
{
"epoch": 2.212871287128713,
"grad_norm": 0.3351952567580859,
"learning_rate": 5.477895166546207e-05,
"loss": 0.2301,
"step": 447
},
{
"epoch": 2.217821782178218,
"grad_norm": 0.46300097922283723,
"learning_rate": 5.4650401735014985e-05,
"loss": 0.2236,
"step": 448
},
{
"epoch": 2.222772277227723,
"grad_norm": 0.36846314014382436,
"learning_rate": 5.452167681140515e-05,
"loss": 0.2591,
"step": 449
},
{
"epoch": 2.227722772277228,
"grad_norm": 0.3411528537586109,
"learning_rate": 5.4392778432200044e-05,
"loss": 0.2232,
"step": 450
},
{
"epoch": 2.232673267326733,
"grad_norm": 0.40101497418695975,
"learning_rate": 5.426370813703903e-05,
"loss": 0.2468,
"step": 451
},
{
"epoch": 2.237623762376238,
"grad_norm": 0.36325004554083573,
"learning_rate": 5.4134467467614945e-05,
"loss": 0.2333,
"step": 452
},
{
"epoch": 2.2425742574257423,
"grad_norm": 0.4039573273985563,
"learning_rate": 5.4005057967655634e-05,
"loss": 0.2389,
"step": 453
},
{
"epoch": 2.2475247524752477,
"grad_norm": 0.2859750720015901,
"learning_rate": 5.3875481182905595e-05,
"loss": 0.2119,
"step": 454
},
{
"epoch": 2.2524752475247523,
"grad_norm": 0.4823602187196335,
"learning_rate": 5.374573866110746e-05,
"loss": 0.2547,
"step": 455
},
{
"epoch": 2.2574257425742577,
"grad_norm": 0.36717753790523777,
"learning_rate": 5.3615831951983535e-05,
"loss": 0.2157,
"step": 456
},
{
"epoch": 2.262376237623762,
"grad_norm": 0.356291153234771,
"learning_rate": 5.348576260721725e-05,
"loss": 0.2484,
"step": 457
},
{
"epoch": 2.2673267326732676,
"grad_norm": 0.34698712469187953,
"learning_rate": 5.3355532180434696e-05,
"loss": 0.2309,
"step": 458
},
{
"epoch": 2.272277227722772,
"grad_norm": 0.3596381562555409,
"learning_rate": 5.3225142227185974e-05,
"loss": 0.229,
"step": 459
},
{
"epoch": 2.2772277227722775,
"grad_norm": 0.32545619571413364,
"learning_rate": 5.309459430492672e-05,
"loss": 0.2264,
"step": 460
},
{
"epoch": 2.282178217821782,
"grad_norm": 0.35180510825756656,
"learning_rate": 5.2963889972999384e-05,
"loss": 0.2475,
"step": 461
},
{
"epoch": 2.287128712871287,
"grad_norm": 0.3606148958622392,
"learning_rate": 5.283303079261471e-05,
"loss": 0.2442,
"step": 462
},
{
"epoch": 2.292079207920792,
"grad_norm": 0.3563222394672158,
"learning_rate": 5.2702018326833044e-05,
"loss": 0.2347,
"step": 463
},
{
"epoch": 2.297029702970297,
"grad_norm": 0.3831444963521827,
"learning_rate": 5.257085414054565e-05,
"loss": 0.2747,
"step": 464
},
{
"epoch": 2.301980198019802,
"grad_norm": 0.33730785554599724,
"learning_rate": 5.243953980045603e-05,
"loss": 0.2388,
"step": 465
},
{
"epoch": 2.3069306930693068,
"grad_norm": 0.32495238050734326,
"learning_rate": 5.230807687506122e-05,
"loss": 0.2364,
"step": 466
},
{
"epoch": 2.3118811881188117,
"grad_norm": 0.37140573353028156,
"learning_rate": 5.2176466934633045e-05,
"loss": 0.2436,
"step": 467
},
{
"epoch": 2.3168316831683167,
"grad_norm": 0.36493258422574965,
"learning_rate": 5.204471155119938e-05,
"loss": 0.2337,
"step": 468
},
{
"epoch": 2.3217821782178216,
"grad_norm": 0.3152805218553869,
"learning_rate": 5.191281229852534e-05,
"loss": 0.2505,
"step": 469
},
{
"epoch": 2.3267326732673266,
"grad_norm": 0.3346938783234162,
"learning_rate": 5.17807707520945e-05,
"loss": 0.2129,
"step": 470
},
{
"epoch": 2.3316831683168315,
"grad_norm": 0.5222639196813958,
"learning_rate": 5.164858848909009e-05,
"loss": 0.2666,
"step": 471
},
{
"epoch": 2.3366336633663365,
"grad_norm": 0.31652796650422926,
"learning_rate": 5.151626708837612e-05,
"loss": 0.217,
"step": 472
},
{
"epoch": 2.3415841584158414,
"grad_norm": 0.3426513953545765,
"learning_rate": 5.1383808130478605e-05,
"loss": 0.2374,
"step": 473
},
{
"epoch": 2.3465346534653464,
"grad_norm": 0.36290882542143066,
"learning_rate": 5.1251213197566515e-05,
"loss": 0.235,
"step": 474
},
{
"epoch": 2.3514851485148514,
"grad_norm": 0.42009903492564477,
"learning_rate": 5.11184838734331e-05,
"loss": 0.2441,
"step": 475
},
{
"epoch": 2.3564356435643563,
"grad_norm": 0.3054426542267136,
"learning_rate": 5.098562174347679e-05,
"loss": 0.2157,
"step": 476
},
{
"epoch": 2.3613861386138613,
"grad_norm": 0.3823497680281174,
"learning_rate": 5.085262839468236e-05,
"loss": 0.2248,
"step": 477
},
{
"epoch": 2.366336633663366,
"grad_norm": 0.5103288301977376,
"learning_rate": 5.071950541560193e-05,
"loss": 0.2518,
"step": 478
},
{
"epoch": 2.371287128712871,
"grad_norm": 0.30455267923588514,
"learning_rate": 5.058625439633599e-05,
"loss": 0.2181,
"step": 479
},
{
"epoch": 2.376237623762376,
"grad_norm": 0.3818419897600763,
"learning_rate": 5.0452876928514434e-05,
"loss": 0.2285,
"step": 480
},
{
"epoch": 2.381188118811881,
"grad_norm": 0.3699320852480438,
"learning_rate": 5.031937460527753e-05,
"loss": 0.2398,
"step": 481
},
{
"epoch": 2.386138613861386,
"grad_norm": 0.3644442422492638,
"learning_rate": 5.018574902125689e-05,
"loss": 0.2351,
"step": 482
},
{
"epoch": 2.391089108910891,
"grad_norm": 0.35930941575494146,
"learning_rate": 5.005200177255645e-05,
"loss": 0.2342,
"step": 483
},
{
"epoch": 2.396039603960396,
"grad_norm": 0.37752588100484835,
"learning_rate": 4.991813445673334e-05,
"loss": 0.2253,
"step": 484
},
{
"epoch": 2.400990099009901,
"grad_norm": 0.32893758478857993,
"learning_rate": 4.9784148672778864e-05,
"loss": 0.2282,
"step": 485
},
{
"epoch": 2.405940594059406,
"grad_norm": 0.30149336304495533,
"learning_rate": 4.965004602109938e-05,
"loss": 0.2061,
"step": 486
},
{
"epoch": 2.410891089108911,
"grad_norm": 0.3219254238354535,
"learning_rate": 4.95158281034972e-05,
"loss": 0.2163,
"step": 487
},
{
"epoch": 2.4158415841584158,
"grad_norm": 0.31525960771805756,
"learning_rate": 4.938149652315142e-05,
"loss": 0.2321,
"step": 488
},
{
"epoch": 2.4207920792079207,
"grad_norm": 0.39088144618312287,
"learning_rate": 4.92470528845988e-05,
"loss": 0.2567,
"step": 489
},
{
"epoch": 2.4257425742574257,
"grad_norm": 0.3449499761490203,
"learning_rate": 4.911249879371457e-05,
"loss": 0.2372,
"step": 490
},
{
"epoch": 2.4306930693069306,
"grad_norm": 0.45815109391750647,
"learning_rate": 4.897783585769331e-05,
"loss": 0.2357,
"step": 491
},
{
"epoch": 2.4356435643564356,
"grad_norm": 0.3118857748477582,
"learning_rate": 4.884306568502968e-05,
"loss": 0.2327,
"step": 492
},
{
"epoch": 2.4405940594059405,
"grad_norm": 0.3386329573636345,
"learning_rate": 4.870818988549923e-05,
"loss": 0.2576,
"step": 493
},
{
"epoch": 2.4455445544554455,
"grad_norm": 0.33032199797382256,
"learning_rate": 4.857321007013924e-05,
"loss": 0.236,
"step": 494
},
{
"epoch": 2.4504950495049505,
"grad_norm": 0.30078783509268575,
"learning_rate": 4.843812785122933e-05,
"loss": 0.2181,
"step": 495
},
{
"epoch": 2.4554455445544554,
"grad_norm": 0.34354471178005036,
"learning_rate": 4.830294484227236e-05,
"loss": 0.2349,
"step": 496
},
{
"epoch": 2.4603960396039604,
"grad_norm": 0.32969555135194983,
"learning_rate": 4.816766265797505e-05,
"loss": 0.2446,
"step": 497
},
{
"epoch": 2.4653465346534653,
"grad_norm": 0.3379586969710406,
"learning_rate": 4.8032282914228743e-05,
"loss": 0.2402,
"step": 498
},
{
"epoch": 2.4702970297029703,
"grad_norm": 0.3148344857955309,
"learning_rate": 4.78968072280901e-05,
"loss": 0.2097,
"step": 499
},
{
"epoch": 2.4752475247524752,
"grad_norm": 0.31145265258949967,
"learning_rate": 4.7761237217761736e-05,
"loss": 0.2172,
"step": 500
},
{
"epoch": 2.48019801980198,
"grad_norm": 0.3017333863543826,
"learning_rate": 4.7625574502572975e-05,
"loss": 0.2222,
"step": 501
},
{
"epoch": 2.485148514851485,
"grad_norm": 0.35871571827001464,
"learning_rate": 4.7489820702960444e-05,
"loss": 0.2245,
"step": 502
},
{
"epoch": 2.49009900990099,
"grad_norm": 0.31489941844300473,
"learning_rate": 4.735397744044874e-05,
"loss": 0.2187,
"step": 503
},
{
"epoch": 2.495049504950495,
"grad_norm": 0.3639077058752089,
"learning_rate": 4.721804633763105e-05,
"loss": 0.2393,
"step": 504
},
{
"epoch": 2.5,
"grad_norm": 0.3193495325768187,
"learning_rate": 4.7082029018149816e-05,
"loss": 0.2404,
"step": 505
},
{
"epoch": 2.504950495049505,
"grad_norm": 0.35786177063036884,
"learning_rate": 4.694592710667723e-05,
"loss": 0.2563,
"step": 506
},
{
"epoch": 2.50990099009901,
"grad_norm": 0.36096478997948417,
"learning_rate": 4.680974222889595e-05,
"loss": 0.2461,
"step": 507
},
{
"epoch": 2.514851485148515,
"grad_norm": 0.3205057209076732,
"learning_rate": 4.667347601147965e-05,
"loss": 0.2146,
"step": 508
},
{
"epoch": 2.51980198019802,
"grad_norm": 0.3052920805898413,
"learning_rate": 4.653713008207353e-05,
"loss": 0.2008,
"step": 509
},
{
"epoch": 2.5247524752475248,
"grad_norm": 0.31137170709518075,
"learning_rate": 4.640070606927497e-05,
"loss": 0.2239,
"step": 510
},
{
"epoch": 2.5297029702970297,
"grad_norm": 0.3339899865251113,
"learning_rate": 4.6264205602613944e-05,
"loss": 0.2238,
"step": 511
},
{
"epoch": 2.5346534653465347,
"grad_norm": 0.2869330890283644,
"learning_rate": 4.612763031253372e-05,
"loss": 0.1928,
"step": 512
},
{
"epoch": 2.5396039603960396,
"grad_norm": 0.35107221358186375,
"learning_rate": 4.599098183037127e-05,
"loss": 0.246,
"step": 513
},
{
"epoch": 2.5445544554455446,
"grad_norm": 0.3049637104038534,
"learning_rate": 4.5854261788337785e-05,
"loss": 0.2132,
"step": 514
},
{
"epoch": 2.5495049504950495,
"grad_norm": 0.45177004334926596,
"learning_rate": 4.571747181949928e-05,
"loss": 0.2414,
"step": 515
},
{
"epoch": 2.5544554455445545,
"grad_norm": 0.30792835653348943,
"learning_rate": 4.558061355775693e-05,
"loss": 0.2018,
"step": 516
},
{
"epoch": 2.5594059405940595,
"grad_norm": 0.3621769791555683,
"learning_rate": 4.5443688637827716e-05,
"loss": 0.2306,
"step": 517
},
{
"epoch": 2.5643564356435644,
"grad_norm": 0.4396288298273991,
"learning_rate": 4.530669869522478e-05,
"loss": 0.2691,
"step": 518
},
{
"epoch": 2.5693069306930694,
"grad_norm": 0.3068970318327044,
"learning_rate": 4.516964536623796e-05,
"loss": 0.2474,
"step": 519
},
{
"epoch": 2.5742574257425743,
"grad_norm": 0.33600645421416364,
"learning_rate": 4.503253028791422e-05,
"loss": 0.2306,
"step": 520
},
{
"epoch": 2.5792079207920793,
"grad_norm": 0.2915774520149471,
"learning_rate": 4.489535509803806e-05,
"loss": 0.2332,
"step": 521
},
{
"epoch": 2.5841584158415842,
"grad_norm": 0.3036235999189328,
"learning_rate": 4.475812143511202e-05,
"loss": 0.2288,
"step": 522
},
{
"epoch": 2.589108910891089,
"grad_norm": 0.31154126635878154,
"learning_rate": 4.4620830938337055e-05,
"loss": 0.2219,
"step": 523
},
{
"epoch": 2.594059405940594,
"grad_norm": 0.32093686482922884,
"learning_rate": 4.448348524759302e-05,
"loss": 0.2212,
"step": 524
},
{
"epoch": 2.599009900990099,
"grad_norm": 0.325443387638589,
"learning_rate": 4.4346086003418985e-05,
"loss": 0.2317,
"step": 525
},
{
"epoch": 2.603960396039604,
"grad_norm": 0.30353414863792527,
"learning_rate": 4.420863484699374e-05,
"loss": 0.2262,
"step": 526
},
{
"epoch": 2.608910891089109,
"grad_norm": 0.3454386372059614,
"learning_rate": 4.4071133420116106e-05,
"loss": 0.2278,
"step": 527
},
{
"epoch": 2.613861386138614,
"grad_norm": 0.30102098435727376,
"learning_rate": 4.3933583365185396e-05,
"loss": 0.221,
"step": 528
},
{
"epoch": 2.618811881188119,
"grad_norm": 0.28474854963641383,
"learning_rate": 4.379598632518175e-05,
"loss": 0.2051,
"step": 529
},
{
"epoch": 2.623762376237624,
"grad_norm": 0.32500866792253486,
"learning_rate": 4.365834394364653e-05,
"loss": 0.2342,
"step": 530
},
{
"epoch": 2.628712871287129,
"grad_norm": 0.3188951539017567,
"learning_rate": 4.35206578646627e-05,
"loss": 0.224,
"step": 531
},
{
"epoch": 2.633663366336634,
"grad_norm": 0.28927265561371424,
"learning_rate": 4.338292973283512e-05,
"loss": 0.1787,
"step": 532
},
{
"epoch": 2.6386138613861387,
"grad_norm": 0.3344379754147368,
"learning_rate": 4.324516119327102e-05,
"loss": 0.2232,
"step": 533
},
{
"epoch": 2.6435643564356437,
"grad_norm": 0.35614810979036243,
"learning_rate": 4.310735389156026e-05,
"loss": 0.2458,
"step": 534
},
{
"epoch": 2.6485148514851486,
"grad_norm": 0.33358397345836344,
"learning_rate": 4.296950947375566e-05,
"loss": 0.2248,
"step": 535
},
{
"epoch": 2.6534653465346536,
"grad_norm": 0.29281872495236416,
"learning_rate": 4.2831629586353446e-05,
"loss": 0.23,
"step": 536
},
{
"epoch": 2.6584158415841586,
"grad_norm": 0.3567082705106451,
"learning_rate": 4.269371587627346e-05,
"loss": 0.245,
"step": 537
},
{
"epoch": 2.6633663366336635,
"grad_norm": 0.5510776836916321,
"learning_rate": 4.255576999083956e-05,
"loss": 0.2654,
"step": 538
},
{
"epoch": 2.6683168316831685,
"grad_norm": 0.3484675969209168,
"learning_rate": 4.241779357775993e-05,
"loss": 0.2267,
"step": 539
},
{
"epoch": 2.6732673267326734,
"grad_norm": 0.32743248333395614,
"learning_rate": 4.227978828510739e-05,
"loss": 0.197,
"step": 540
},
{
"epoch": 2.6782178217821784,
"grad_norm": 0.3698834754112533,
"learning_rate": 4.214175576129972e-05,
"loss": 0.2347,
"step": 541
},
{
"epoch": 2.6831683168316833,
"grad_norm": 0.3108200784170499,
"learning_rate": 4.200369765507995e-05,
"loss": 0.2022,
"step": 542
},
{
"epoch": 2.6881188118811883,
"grad_norm": 0.39122986107770225,
"learning_rate": 4.18656156154967e-05,
"loss": 0.2678,
"step": 543
},
{
"epoch": 2.693069306930693,
"grad_norm": 0.3520361936650789,
"learning_rate": 4.172751129188447e-05,
"loss": 0.2345,
"step": 544
},
{
"epoch": 2.698019801980198,
"grad_norm": 0.3360896398211906,
"learning_rate": 4.158938633384389e-05,
"loss": 0.2204,
"step": 545
},
{
"epoch": 2.7029702970297027,
"grad_norm": 0.3189999605477287,
"learning_rate": 4.1451242391222105e-05,
"loss": 0.1835,
"step": 546
},
{
"epoch": 2.707920792079208,
"grad_norm": 0.2878525852539076,
"learning_rate": 4.1313081114093025e-05,
"loss": 0.194,
"step": 547
},
{
"epoch": 2.7128712871287126,
"grad_norm": 0.37399660980915755,
"learning_rate": 4.117490415273757e-05,
"loss": 0.2429,
"step": 548
},
{
"epoch": 2.717821782178218,
"grad_norm": 0.4364451292360142,
"learning_rate": 4.1036713157624045e-05,
"loss": 0.272,
"step": 549
},
{
"epoch": 2.7227722772277225,
"grad_norm": 0.2885760814922969,
"learning_rate": 4.089850977938836e-05,
"loss": 0.2344,
"step": 550
},
{
"epoch": 2.727722772277228,
"grad_norm": 0.3179182415930085,
"learning_rate": 4.076029566881436e-05,
"loss": 0.2067,
"step": 551
},
{
"epoch": 2.7326732673267324,
"grad_norm": 0.3324523312729288,
"learning_rate": 4.0622072476814045e-05,
"loss": 0.2173,
"step": 552
},
{
"epoch": 2.737623762376238,
"grad_norm": 0.3463898334558332,
"learning_rate": 4.0483841854407906e-05,
"loss": 0.23,
"step": 553
},
{
"epoch": 2.7425742574257423,
"grad_norm": 1.349745744481094,
"learning_rate": 4.0345605452705225e-05,
"loss": 0.3042,
"step": 554
},
{
"epoch": 2.7475247524752477,
"grad_norm": 0.3557405479241251,
"learning_rate": 4.020736492288426e-05,
"loss": 0.244,
"step": 555
},
{
"epoch": 2.7524752475247523,
"grad_norm": 0.3325914967332085,
"learning_rate": 4.006912191617259e-05,
"loss": 0.2448,
"step": 556
},
{
"epoch": 2.7574257425742577,
"grad_norm": 0.3123897923148057,
"learning_rate": 3.993087808382742e-05,
"loss": 0.2268,
"step": 557
},
{
"epoch": 2.762376237623762,
"grad_norm": 0.3136369159963456,
"learning_rate": 3.9792635077115755e-05,
"loss": 0.2121,
"step": 558
},
{
"epoch": 2.7673267326732676,
"grad_norm": 0.3236652597206635,
"learning_rate": 3.9654394547294775e-05,
"loss": 0.2265,
"step": 559
},
{
"epoch": 2.772277227722772,
"grad_norm": 0.3053161722456095,
"learning_rate": 3.9516158145592093e-05,
"loss": 0.2187,
"step": 560
},
{
"epoch": 2.7772277227722775,
"grad_norm": 0.3305154761547749,
"learning_rate": 3.937792752318597e-05,
"loss": 0.2394,
"step": 561
},
{
"epoch": 2.782178217821782,
"grad_norm": 0.3637555988086301,
"learning_rate": 3.923970433118566e-05,
"loss": 0.2367,
"step": 562
},
{
"epoch": 2.7871287128712874,
"grad_norm": 0.3171839114698826,
"learning_rate": 3.9101490220611646e-05,
"loss": 0.2285,
"step": 563
},
{
"epoch": 2.792079207920792,
"grad_norm": 0.33661618797255727,
"learning_rate": 3.8963286842375955e-05,
"loss": 0.227,
"step": 564
},
{
"epoch": 2.7970297029702973,
"grad_norm": 0.3226709988581632,
"learning_rate": 3.882509584726244e-05,
"loss": 0.2186,
"step": 565
},
{
"epoch": 2.801980198019802,
"grad_norm": 0.34629105834430013,
"learning_rate": 3.868691888590699e-05,
"loss": 0.2533,
"step": 566
},
{
"epoch": 2.806930693069307,
"grad_norm": 0.6968328315911714,
"learning_rate": 3.854875760877791e-05,
"loss": 0.2399,
"step": 567
},
{
"epoch": 2.8118811881188117,
"grad_norm": 0.3350430446776809,
"learning_rate": 3.8410613666156126e-05,
"loss": 0.222,
"step": 568
},
{
"epoch": 2.8168316831683167,
"grad_norm": 0.35262490412149583,
"learning_rate": 3.8272488708115536e-05,
"loss": 0.2512,
"step": 569
},
{
"epoch": 2.8217821782178216,
"grad_norm": 0.30175139855193056,
"learning_rate": 3.81343843845033e-05,
"loss": 0.2118,
"step": 570
},
{
"epoch": 2.8267326732673266,
"grad_norm": 0.3224948254073621,
"learning_rate": 3.7996302344920056e-05,
"loss": 0.2368,
"step": 571
},
{
"epoch": 2.8316831683168315,
"grad_norm": 0.3142472270583956,
"learning_rate": 3.785824423870029e-05,
"loss": 0.247,
"step": 572
},
{
"epoch": 2.8366336633663365,
"grad_norm": 0.2930728895351729,
"learning_rate": 3.772021171489261e-05,
"loss": 0.2028,
"step": 573
},
{
"epoch": 2.8415841584158414,
"grad_norm": 0.2979919926832772,
"learning_rate": 3.7582206422240073e-05,
"loss": 0.213,
"step": 574
},
{
"epoch": 2.8465346534653464,
"grad_norm": 0.31269091733709203,
"learning_rate": 3.744423000916045e-05,
"loss": 0.2343,
"step": 575
},
{
"epoch": 2.8514851485148514,
"grad_norm": 0.31493725391362487,
"learning_rate": 3.7306284123726545e-05,
"loss": 0.2337,
"step": 576
},
{
"epoch": 2.8564356435643563,
"grad_norm": 0.29253133729107844,
"learning_rate": 3.716837041364657e-05,
"loss": 0.2406,
"step": 577
},
{
"epoch": 2.8613861386138613,
"grad_norm": 0.29891111766859063,
"learning_rate": 3.703049052624434e-05,
"loss": 0.2385,
"step": 578
},
{
"epoch": 2.866336633663366,
"grad_norm": 0.33892105214317203,
"learning_rate": 3.689264610843975e-05,
"loss": 0.23,
"step": 579
},
{
"epoch": 2.871287128712871,
"grad_norm": 0.3232151525300889,
"learning_rate": 3.6754838806728985e-05,
"loss": 0.2372,
"step": 580
},
{
"epoch": 2.876237623762376,
"grad_norm": 0.3185266368118756,
"learning_rate": 3.6617070267164895e-05,
"loss": 0.2343,
"step": 581
},
{
"epoch": 2.881188118811881,
"grad_norm": 0.29709076121109895,
"learning_rate": 3.647934213533733e-05,
"loss": 0.1971,
"step": 582
},
{
"epoch": 2.886138613861386,
"grad_norm": 0.3235673389076069,
"learning_rate": 3.634165605635347e-05,
"loss": 0.2334,
"step": 583
},
{
"epoch": 2.891089108910891,
"grad_norm": 0.28878380789158536,
"learning_rate": 3.6204013674818264e-05,
"loss": 0.203,
"step": 584
},
{
"epoch": 2.896039603960396,
"grad_norm": 0.2981782547905849,
"learning_rate": 3.606641663481462e-05,
"loss": 0.211,
"step": 585
},
{
"epoch": 2.900990099009901,
"grad_norm": 0.337934548527355,
"learning_rate": 3.5928866579883914e-05,
"loss": 0.2228,
"step": 586
},
{
"epoch": 2.905940594059406,
"grad_norm": 0.28548908884726665,
"learning_rate": 3.579136515300627e-05,
"loss": 0.1974,
"step": 587
},
{
"epoch": 2.910891089108911,
"grad_norm": 0.3170801695426472,
"learning_rate": 3.565391399658102e-05,
"loss": 0.2714,
"step": 588
},
{
"epoch": 2.9158415841584158,
"grad_norm": 0.31210516683665845,
"learning_rate": 3.5516514752406996e-05,
"loss": 0.1991,
"step": 589
},
{
"epoch": 2.9207920792079207,
"grad_norm": 0.3252111899517891,
"learning_rate": 3.537916906166295e-05,
"loss": 0.2538,
"step": 590
},
{
"epoch": 2.9257425742574257,
"grad_norm": 0.34206702703586866,
"learning_rate": 3.5241878564888006e-05,
"loss": 0.237,
"step": 591
},
{
"epoch": 2.9306930693069306,
"grad_norm": 0.30352676209933477,
"learning_rate": 3.510464490196195e-05,
"loss": 0.2045,
"step": 592
},
{
"epoch": 2.9356435643564356,
"grad_norm": 0.27190930148752757,
"learning_rate": 3.496746971208579e-05,
"loss": 0.2173,
"step": 593
},
{
"epoch": 2.9405940594059405,
"grad_norm": 0.3324654854830766,
"learning_rate": 3.4830354633762044e-05,
"loss": 0.2502,
"step": 594
},
{
"epoch": 2.9455445544554455,
"grad_norm": 0.3497620111470606,
"learning_rate": 3.4693301304775226e-05,
"loss": 0.2359,
"step": 595
},
{
"epoch": 2.9504950495049505,
"grad_norm": 0.27327963847153197,
"learning_rate": 3.455631136217231e-05,
"loss": 0.2078,
"step": 596
},
{
"epoch": 2.9554455445544554,
"grad_norm": 0.30380647947017314,
"learning_rate": 3.4419386442243084e-05,
"loss": 0.2269,
"step": 597
},
{
"epoch": 2.9603960396039604,
"grad_norm": 0.28430600655210836,
"learning_rate": 3.428252818050074e-05,
"loss": 0.2107,
"step": 598
},
{
"epoch": 2.9653465346534653,
"grad_norm": 0.26557159723378293,
"learning_rate": 3.414573821166222e-05,
"loss": 0.2057,
"step": 599
},
{
"epoch": 2.9702970297029703,
"grad_norm": 0.2862759138114645,
"learning_rate": 3.4009018169628744e-05,
"loss": 0.2087,
"step": 600
},
{
"epoch": 2.9752475247524752,
"grad_norm": 0.28158416627334854,
"learning_rate": 3.38723696874663e-05,
"loss": 0.2194,
"step": 601
},
{
"epoch": 2.98019801980198,
"grad_norm": 0.35712881847682365,
"learning_rate": 3.373579439738606e-05,
"loss": 0.2387,
"step": 602
},
{
"epoch": 2.985148514851485,
"grad_norm": 0.2884121129670916,
"learning_rate": 3.359929393072505e-05,
"loss": 0.2216,
"step": 603
},
{
"epoch": 2.99009900990099,
"grad_norm": 0.30058325322611185,
"learning_rate": 3.346286991792648e-05,
"loss": 0.2334,
"step": 604
},
{
"epoch": 2.995049504950495,
"grad_norm": 0.2843610698185909,
"learning_rate": 3.3326523988520365e-05,
"loss": 0.2067,
"step": 605
},
{
"epoch": 3.0,
"grad_norm": 0.34310662923578245,
"learning_rate": 3.3190257771104055e-05,
"loss": 0.2116,
"step": 606
},
{
"epoch": 3.004950495049505,
"grad_norm": 0.3425747462145039,
"learning_rate": 3.305407289332279e-05,
"loss": 0.0851,
"step": 607
},
{
"epoch": 3.00990099009901,
"grad_norm": 0.3033653125782944,
"learning_rate": 3.2917970981850205e-05,
"loss": 0.0897,
"step": 608
},
{
"epoch": 3.014851485148515,
"grad_norm": 0.28638471767451534,
"learning_rate": 3.2781953662368954e-05,
"loss": 0.0909,
"step": 609
},
{
"epoch": 3.01980198019802,
"grad_norm": 0.2844387140751838,
"learning_rate": 3.264602255955127e-05,
"loss": 0.0855,
"step": 610
},
{
"epoch": 3.0247524752475248,
"grad_norm": 0.25783024583363257,
"learning_rate": 3.251017929703956e-05,
"loss": 0.0739,
"step": 611
},
{
"epoch": 3.0297029702970297,
"grad_norm": 0.4455150785898187,
"learning_rate": 3.237442549742704e-05,
"loss": 0.08,
"step": 612
},
{
"epoch": 3.0346534653465347,
"grad_norm": 0.2649934418173023,
"learning_rate": 3.223876278223828e-05,
"loss": 0.0659,
"step": 613
},
{
"epoch": 3.0396039603960396,
"grad_norm": 0.28349483967189754,
"learning_rate": 3.2103192771909927e-05,
"loss": 0.0677,
"step": 614
},
{
"epoch": 3.0445544554455446,
"grad_norm": 0.2520579031781572,
"learning_rate": 3.196771708577127e-05,
"loss": 0.0675,
"step": 615
},
{
"epoch": 3.0495049504950495,
"grad_norm": 0.26346383406033047,
"learning_rate": 3.1832337342024956e-05,
"loss": 0.0774,
"step": 616
},
{
"epoch": 3.0544554455445545,
"grad_norm": 0.24009068817781143,
"learning_rate": 3.1697055157727654e-05,
"loss": 0.0797,
"step": 617
},
{
"epoch": 3.0594059405940595,
"grad_norm": 0.2480503867107148,
"learning_rate": 3.156187214877068e-05,
"loss": 0.0863,
"step": 618
},
{
"epoch": 3.0643564356435644,
"grad_norm": 0.29158321889505673,
"learning_rate": 3.142678992986078e-05,
"loss": 0.0744,
"step": 619
},
{
"epoch": 3.0693069306930694,
"grad_norm": 0.2367292120532588,
"learning_rate": 3.129181011450077e-05,
"loss": 0.0799,
"step": 620
},
{
"epoch": 3.0742574257425743,
"grad_norm": 0.23939469471682348,
"learning_rate": 3.115693431497033e-05,
"loss": 0.0769,
"step": 621
},
{
"epoch": 3.0792079207920793,
"grad_norm": 0.21921181988376112,
"learning_rate": 3.102216414230671e-05,
"loss": 0.0694,
"step": 622
},
{
"epoch": 3.0841584158415842,
"grad_norm": 0.2520551999313607,
"learning_rate": 3.0887501206285436e-05,
"loss": 0.0787,
"step": 623
},
{
"epoch": 3.089108910891089,
"grad_norm": 0.27900349033438354,
"learning_rate": 3.075294711540123e-05,
"loss": 0.0882,
"step": 624
},
{
"epoch": 3.094059405940594,
"grad_norm": 0.24848095506852333,
"learning_rate": 3.061850347684859e-05,
"loss": 0.0692,
"step": 625
},
{
"epoch": 3.099009900990099,
"grad_norm": 0.2685764262687839,
"learning_rate": 3.0484171896502805e-05,
"loss": 0.0828,
"step": 626
},
{
"epoch": 3.103960396039604,
"grad_norm": 0.2295364672313301,
"learning_rate": 3.034995397890063e-05,
"loss": 0.0685,
"step": 627
},
{
"epoch": 3.108910891089109,
"grad_norm": 0.2522857633506632,
"learning_rate": 3.0215851327221163e-05,
"loss": 0.0755,
"step": 628
},
{
"epoch": 3.113861386138614,
"grad_norm": 0.2340600632944488,
"learning_rate": 3.0081865543266687e-05,
"loss": 0.0716,
"step": 629
},
{
"epoch": 3.118811881188119,
"grad_norm": 0.2514923962812719,
"learning_rate": 2.994799822744356e-05,
"loss": 0.0801,
"step": 630
},
{
"epoch": 3.123762376237624,
"grad_norm": 0.22934807794127365,
"learning_rate": 2.9814250978743115e-05,
"loss": 0.0707,
"step": 631
},
{
"epoch": 3.128712871287129,
"grad_norm": 0.2322249593545606,
"learning_rate": 2.9680625394722483e-05,
"loss": 0.0709,
"step": 632
},
{
"epoch": 3.133663366336634,
"grad_norm": 0.2552323877342474,
"learning_rate": 2.9547123071485586e-05,
"loss": 0.0826,
"step": 633
},
{
"epoch": 3.1386138613861387,
"grad_norm": 0.229864941042402,
"learning_rate": 2.9413745603664023e-05,
"loss": 0.0635,
"step": 634
},
{
"epoch": 3.1435643564356437,
"grad_norm": 0.24313717952069136,
"learning_rate": 2.928049458439808e-05,
"loss": 0.0838,
"step": 635
},
{
"epoch": 3.1485148514851486,
"grad_norm": 0.25035045583769344,
"learning_rate": 2.914737160531765e-05,
"loss": 0.0722,
"step": 636
},
{
"epoch": 3.1534653465346536,
"grad_norm": 0.256842768073215,
"learning_rate": 2.9014378256523218e-05,
"loss": 0.086,
"step": 637
},
{
"epoch": 3.1584158415841586,
"grad_norm": 0.27803315855835337,
"learning_rate": 2.888151612656692e-05,
"loss": 0.0777,
"step": 638
},
{
"epoch": 3.1633663366336635,
"grad_norm": 0.2642554173662771,
"learning_rate": 2.874878680243349e-05,
"loss": 0.0748,
"step": 639
},
{
"epoch": 3.1683168316831685,
"grad_norm": 0.26609660151383924,
"learning_rate": 2.8616191869521412e-05,
"loss": 0.0941,
"step": 640
},
{
"epoch": 3.1732673267326734,
"grad_norm": 0.2507627298480842,
"learning_rate": 2.8483732911623882e-05,
"loss": 0.0705,
"step": 641
},
{
"epoch": 3.1782178217821784,
"grad_norm": 0.25655475416599705,
"learning_rate": 2.8351411510909926e-05,
"loss": 0.0811,
"step": 642
},
{
"epoch": 3.1831683168316833,
"grad_norm": 0.26109475686466127,
"learning_rate": 2.821922924790552e-05,
"loss": 0.0877,
"step": 643
},
{
"epoch": 3.1881188118811883,
"grad_norm": 0.23733552033709807,
"learning_rate": 2.8087187701474667e-05,
"loss": 0.0819,
"step": 644
},
{
"epoch": 3.1930693069306932,
"grad_norm": 0.23699501435988432,
"learning_rate": 2.7955288448800628e-05,
"loss": 0.0731,
"step": 645
},
{
"epoch": 3.198019801980198,
"grad_norm": 0.24748871746240322,
"learning_rate": 2.7823533065366965e-05,
"loss": 0.0763,
"step": 646
},
{
"epoch": 3.202970297029703,
"grad_norm": 0.25641166163891144,
"learning_rate": 2.7691923124938794e-05,
"loss": 0.0862,
"step": 647
},
{
"epoch": 3.207920792079208,
"grad_norm": 0.25223945193358493,
"learning_rate": 2.756046019954398e-05,
"loss": 0.0783,
"step": 648
},
{
"epoch": 3.212871287128713,
"grad_norm": 0.22946599813308205,
"learning_rate": 2.742914585945436e-05,
"loss": 0.0684,
"step": 649
},
{
"epoch": 3.217821782178218,
"grad_norm": 0.23227418913106662,
"learning_rate": 2.7297981673166963e-05,
"loss": 0.0726,
"step": 650
},
{
"epoch": 3.222772277227723,
"grad_norm": 0.2708641105053437,
"learning_rate": 2.71669692073853e-05,
"loss": 0.0809,
"step": 651
},
{
"epoch": 3.227722772277228,
"grad_norm": 0.23941294277593003,
"learning_rate": 2.7036110027000636e-05,
"loss": 0.0744,
"step": 652
},
{
"epoch": 3.232673267326733,
"grad_norm": 0.22427334271376922,
"learning_rate": 2.690540569507329e-05,
"loss": 0.0562,
"step": 653
},
{
"epoch": 3.237623762376238,
"grad_norm": 0.2397564104418557,
"learning_rate": 2.677485777281403e-05,
"loss": 0.0709,
"step": 654
},
{
"epoch": 3.2425742574257423,
"grad_norm": 0.24228483254952923,
"learning_rate": 2.6644467819565317e-05,
"loss": 0.0675,
"step": 655
},
{
"epoch": 3.2475247524752477,
"grad_norm": 0.2778549352377007,
"learning_rate": 2.651423739278276e-05,
"loss": 0.0832,
"step": 656
},
{
"epoch": 3.2524752475247523,
"grad_norm": 0.2510031448101173,
"learning_rate": 2.638416804801648e-05,
"loss": 0.0789,
"step": 657
},
{
"epoch": 3.2574257425742577,
"grad_norm": 0.23230623243606166,
"learning_rate": 2.6254261338892536e-05,
"loss": 0.0685,
"step": 658
},
{
"epoch": 3.262376237623762,
"grad_norm": 0.23265962042829946,
"learning_rate": 2.6124518817094418e-05,
"loss": 0.0707,
"step": 659
},
{
"epoch": 3.2673267326732676,
"grad_norm": 0.2698161518899995,
"learning_rate": 2.5994942032344376e-05,
"loss": 0.0712,
"step": 660
},
{
"epoch": 3.272277227722772,
"grad_norm": 0.23919947212956175,
"learning_rate": 2.5865532532385072e-05,
"loss": 0.0735,
"step": 661
},
{
"epoch": 3.2772277227722775,
"grad_norm": 0.2338979262555169,
"learning_rate": 2.573629186296097e-05,
"loss": 0.079,
"step": 662
},
{
"epoch": 3.282178217821782,
"grad_norm": 0.24198283638959564,
"learning_rate": 2.560722156779996e-05,
"loss": 0.0652,
"step": 663
},
{
"epoch": 3.287128712871287,
"grad_norm": 0.2678959132988273,
"learning_rate": 2.547832318859487e-05,
"loss": 0.0806,
"step": 664
},
{
"epoch": 3.292079207920792,
"grad_norm": 0.24369334210701515,
"learning_rate": 2.5349598264985028e-05,
"loss": 0.0801,
"step": 665
},
{
"epoch": 3.297029702970297,
"grad_norm": 0.2516534708453375,
"learning_rate": 2.5221048334537952e-05,
"loss": 0.0729,
"step": 666
},
{
"epoch": 3.301980198019802,
"grad_norm": 0.26191397567125363,
"learning_rate": 2.5092674932730886e-05,
"loss": 0.0817,
"step": 667
},
{
"epoch": 3.3069306930693068,
"grad_norm": 0.2311417870965065,
"learning_rate": 2.4964479592932574e-05,
"loss": 0.0668,
"step": 668
},
{
"epoch": 3.3118811881188117,
"grad_norm": 0.22453138314695728,
"learning_rate": 2.4836463846384832e-05,
"loss": 0.0679,
"step": 669
},
{
"epoch": 3.3168316831683167,
"grad_norm": 0.25117156700585896,
"learning_rate": 2.470862922218431e-05,
"loss": 0.0726,
"step": 670
},
{
"epoch": 3.3217821782178216,
"grad_norm": 0.26705255539786693,
"learning_rate": 2.4580977247264253e-05,
"loss": 0.0757,
"step": 671
},
{
"epoch": 3.3267326732673266,
"grad_norm": 0.26514359221062156,
"learning_rate": 2.4453509446376192e-05,
"loss": 0.0798,
"step": 672
},
{
"epoch": 3.3316831683168315,
"grad_norm": 0.2671109496025883,
"learning_rate": 2.432622734207182e-05,
"loss": 0.078,
"step": 673
},
{
"epoch": 3.3366336633663365,
"grad_norm": 0.24536817866890306,
"learning_rate": 2.4199132454684736e-05,
"loss": 0.0729,
"step": 674
},
{
"epoch": 3.3415841584158414,
"grad_norm": 0.25557331335860106,
"learning_rate": 2.40722263023123e-05,
"loss": 0.0784,
"step": 675
},
{
"epoch": 3.3465346534653464,
"grad_norm": 0.2546554341285905,
"learning_rate": 2.3945510400797485e-05,
"loss": 0.0722,
"step": 676
},
{
"epoch": 3.3514851485148514,
"grad_norm": 0.22489537362915024,
"learning_rate": 2.3818986263710886e-05,
"loss": 0.0701,
"step": 677
},
{
"epoch": 3.3564356435643563,
"grad_norm": 0.23800748533645866,
"learning_rate": 2.3692655402332455e-05,
"loss": 0.0694,
"step": 678
},
{
"epoch": 3.3613861386138613,
"grad_norm": 0.21671549307844398,
"learning_rate": 2.3566519325633567e-05,
"loss": 0.0555,
"step": 679
},
{
"epoch": 3.366336633663366,
"grad_norm": 0.2583657442869358,
"learning_rate": 2.3440579540259006e-05,
"loss": 0.071,
"step": 680
},
{
"epoch": 3.371287128712871,
"grad_norm": 0.2573251426343712,
"learning_rate": 2.3314837550508875e-05,
"loss": 0.0698,
"step": 681
},
{
"epoch": 3.376237623762376,
"grad_norm": 0.2404906641580492,
"learning_rate": 2.3189294858320768e-05,
"loss": 0.0721,
"step": 682
},
{
"epoch": 3.381188118811881,
"grad_norm": 0.24817378991212172,
"learning_rate": 2.3063952963251682e-05,
"loss": 0.0665,
"step": 683
},
{
"epoch": 3.386138613861386,
"grad_norm": 0.278544909227608,
"learning_rate": 2.2938813362460198e-05,
"loss": 0.0796,
"step": 684
},
{
"epoch": 3.391089108910891,
"grad_norm": 0.23587919321962164,
"learning_rate": 2.2813877550688553e-05,
"loss": 0.0638,
"step": 685
},
{
"epoch": 3.396039603960396,
"grad_norm": 0.26054805369939354,
"learning_rate": 2.2689147020244848e-05,
"loss": 0.0829,
"step": 686
},
{
"epoch": 3.400990099009901,
"grad_norm": 0.2600430379659956,
"learning_rate": 2.256462326098516e-05,
"loss": 0.0751,
"step": 687
},
{
"epoch": 3.405940594059406,
"grad_norm": 0.251402994649795,
"learning_rate": 2.2440307760295755e-05,
"loss": 0.0768,
"step": 688
},
{
"epoch": 3.410891089108911,
"grad_norm": 0.2513808234935709,
"learning_rate": 2.2316202003075347e-05,
"loss": 0.0794,
"step": 689
},
{
"epoch": 3.4158415841584158,
"grad_norm": 0.2394850075046582,
"learning_rate": 2.2192307471717324e-05,
"loss": 0.0678,
"step": 690
},
{
"epoch": 3.4207920792079207,
"grad_norm": 0.2270966635982115,
"learning_rate": 2.2068625646092103e-05,
"loss": 0.0603,
"step": 691
},
{
"epoch": 3.4257425742574257,
"grad_norm": 0.230349969817333,
"learning_rate": 2.194515800352942e-05,
"loss": 0.0575,
"step": 692
},
{
"epoch": 3.4306930693069306,
"grad_norm": 0.2151090758971661,
"learning_rate": 2.1821906018800643e-05,
"loss": 0.0617,
"step": 693
},
{
"epoch": 3.4356435643564356,
"grad_norm": 0.26279226131219013,
"learning_rate": 2.169887116410121e-05,
"loss": 0.0684,
"step": 694
},
{
"epoch": 3.4405940594059405,
"grad_norm": 0.23698343268856573,
"learning_rate": 2.1576054909033014e-05,
"loss": 0.0673,
"step": 695
},
{
"epoch": 3.4455445544554455,
"grad_norm": 0.21782279464252616,
"learning_rate": 2.1453458720586902e-05,
"loss": 0.0596,
"step": 696
},
{
"epoch": 3.4504950495049505,
"grad_norm": 0.2413629900614204,
"learning_rate": 2.13310840631251e-05,
"loss": 0.066,
"step": 697
},
{
"epoch": 3.4554455445544554,
"grad_norm": 0.2590724577362002,
"learning_rate": 2.1208932398363712e-05,
"loss": 0.085,
"step": 698
},
{
"epoch": 3.4603960396039604,
"grad_norm": 0.23854018641540192,
"learning_rate": 2.1087005185355292e-05,
"loss": 0.0578,
"step": 699
},
{
"epoch": 3.4653465346534653,
"grad_norm": 0.2617124689921388,
"learning_rate": 2.0965303880471405e-05,
"loss": 0.0683,
"step": 700
},
{
"epoch": 3.4702970297029703,
"grad_norm": 0.25853795971879695,
"learning_rate": 2.0843829937385255e-05,
"loss": 0.0685,
"step": 701
},
{
"epoch": 3.4752475247524752,
"grad_norm": 0.21533646187557015,
"learning_rate": 2.072258480705431e-05,
"loss": 0.0612,
"step": 702
},
{
"epoch": 3.48019801980198,
"grad_norm": 0.2398974299806927,
"learning_rate": 2.0601569937702913e-05,
"loss": 0.0767,
"step": 703
},
{
"epoch": 3.485148514851485,
"grad_norm": 0.2825099006591305,
"learning_rate": 2.048078677480507e-05,
"loss": 0.0713,
"step": 704
},
{
"epoch": 3.49009900990099,
"grad_norm": 0.2374986457390262,
"learning_rate": 2.0360236761067117e-05,
"loss": 0.0614,
"step": 705
},
{
"epoch": 3.495049504950495,
"grad_norm": 0.24928209932734513,
"learning_rate": 2.023992133641055e-05,
"loss": 0.0715,
"step": 706
},
{
"epoch": 3.5,
"grad_norm": 0.24602624092887262,
"learning_rate": 2.0119841937954794e-05,
"loss": 0.0776,
"step": 707
},
{
"epoch": 3.504950495049505,
"grad_norm": 0.26151726202365333,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.0834,
"step": 708
},
{
"epoch": 3.50990099009901,
"grad_norm": 0.22791863144384952,
"learning_rate": 1.9880396954009976e-05,
"loss": 0.0645,
"step": 709
},
{
"epoch": 3.514851485148515,
"grad_norm": 0.27852434371052076,
"learning_rate": 1.976103422859506e-05,
"loss": 0.0615,
"step": 710
},
{
"epoch": 3.51980198019802,
"grad_norm": 0.4658704014321942,
"learning_rate": 1.9641913249495026e-05,
"loss": 0.1114,
"step": 711
},
{
"epoch": 3.5247524752475248,
"grad_norm": 0.2388977822102273,
"learning_rate": 1.9523035439562146e-05,
"loss": 0.0636,
"step": 712
},
{
"epoch": 3.5297029702970297,
"grad_norm": 0.2431114346820817,
"learning_rate": 1.9404402218744086e-05,
"loss": 0.072,
"step": 713
},
{
"epoch": 3.5346534653465347,
"grad_norm": 0.2263409426285513,
"learning_rate": 1.9286015004066984e-05,
"loss": 0.0618,
"step": 714
},
{
"epoch": 3.5396039603960396,
"grad_norm": 0.25538479665632796,
"learning_rate": 1.9167875209618592e-05,
"loss": 0.0713,
"step": 715
},
{
"epoch": 3.5445544554455446,
"grad_norm": 0.26107712164158786,
"learning_rate": 1.9049984246531255e-05,
"loss": 0.0803,
"step": 716
},
{
"epoch": 3.5495049504950495,
"grad_norm": 0.25722416216963295,
"learning_rate": 1.8932343522965205e-05,
"loss": 0.0793,
"step": 717
},
{
"epoch": 3.5544554455445545,
"grad_norm": 0.2129150003741417,
"learning_rate": 1.8814954444091595e-05,
"loss": 0.0602,
"step": 718
},
{
"epoch": 3.5594059405940595,
"grad_norm": 0.2965098118005929,
"learning_rate": 1.8697818412075794e-05,
"loss": 0.0756,
"step": 719
},
{
"epoch": 3.5643564356435644,
"grad_norm": 0.24815189554867753,
"learning_rate": 1.8580936826060685e-05,
"loss": 0.072,
"step": 720
},
{
"epoch": 3.5693069306930694,
"grad_norm": 0.23537016156276444,
"learning_rate": 1.846431108214981e-05,
"loss": 0.0632,
"step": 721
},
{
"epoch": 3.5742574257425743,
"grad_norm": 0.25177412841655517,
"learning_rate": 1.8347942573390865e-05,
"loss": 0.0703,
"step": 722
},
{
"epoch": 3.5792079207920793,
"grad_norm": 0.2225737099451813,
"learning_rate": 1.8231832689758903e-05,
"loss": 0.0623,
"step": 723
},
{
"epoch": 3.5841584158415842,
"grad_norm": 0.27105752420141427,
"learning_rate": 1.8115982818139862e-05,
"loss": 0.0794,
"step": 724
},
{
"epoch": 3.589108910891089,
"grad_norm": 0.2817083321886372,
"learning_rate": 1.80003943423139e-05,
"loss": 0.0689,
"step": 725
},
{
"epoch": 3.594059405940594,
"grad_norm": 0.2285885257732442,
"learning_rate": 1.7885068642938924e-05,
"loss": 0.0567,
"step": 726
},
{
"epoch": 3.599009900990099,
"grad_norm": 0.24563843948778483,
"learning_rate": 1.7770007097534062e-05,
"loss": 0.0753,
"step": 727
},
{
"epoch": 3.603960396039604,
"grad_norm": 0.2302620192686851,
"learning_rate": 1.7655211080463265e-05,
"loss": 0.0615,
"step": 728
},
{
"epoch": 3.608910891089109,
"grad_norm": 0.2080881341242689,
"learning_rate": 1.754068196291885e-05,
"loss": 0.0505,
"step": 729
},
{
"epoch": 3.613861386138614,
"grad_norm": 0.2218095852880323,
"learning_rate": 1.7426421112905095e-05,
"loss": 0.0608,
"step": 730
},
{
"epoch": 3.618811881188119,
"grad_norm": 0.2310947814386969,
"learning_rate": 1.731242989522195e-05,
"loss": 0.0663,
"step": 731
},
{
"epoch": 3.623762376237624,
"grad_norm": 0.2545723904232327,
"learning_rate": 1.7198709671448696e-05,
"loss": 0.0759,
"step": 732
},
{
"epoch": 3.628712871287129,
"grad_norm": 0.2211042721278219,
"learning_rate": 1.7085261799927738e-05,
"loss": 0.0583,
"step": 733
},
{
"epoch": 3.633663366336634,
"grad_norm": 0.2509514455369346,
"learning_rate": 1.697208763574833e-05,
"loss": 0.0782,
"step": 734
},
{
"epoch": 3.6386138613861387,
"grad_norm": 0.22667085651403643,
"learning_rate": 1.6859188530730387e-05,
"loss": 0.0664,
"step": 735
},
{
"epoch": 3.6435643564356437,
"grad_norm": 0.22712863713521322,
"learning_rate": 1.6746565833408352e-05,
"loss": 0.0623,
"step": 736
},
{
"epoch": 3.6485148514851486,
"grad_norm": 0.22499251373801385,
"learning_rate": 1.6634220889015087e-05,
"loss": 0.0644,
"step": 737
},
{
"epoch": 3.6534653465346536,
"grad_norm": 0.20110354191475177,
"learning_rate": 1.652215503946583e-05,
"loss": 0.0565,
"step": 738
},
{
"epoch": 3.6584158415841586,
"grad_norm": 0.22315963225154622,
"learning_rate": 1.6410369623342144e-05,
"loss": 0.0661,
"step": 739
},
{
"epoch": 3.6633663366336635,
"grad_norm": 0.22793408178521282,
"learning_rate": 1.6298865975875903e-05,
"loss": 0.0625,
"step": 740
},
{
"epoch": 3.6683168316831685,
"grad_norm": 0.23014768345985898,
"learning_rate": 1.6187645428933372e-05,
"loss": 0.0674,
"step": 741
},
{
"epoch": 3.6732673267326734,
"grad_norm": 0.2253800846374502,
"learning_rate": 1.607670931099929e-05,
"loss": 0.0639,
"step": 742
},
{
"epoch": 3.6782178217821784,
"grad_norm": 0.24360567586454138,
"learning_rate": 1.5966058947161035e-05,
"loss": 0.07,
"step": 743
},
{
"epoch": 3.6831683168316833,
"grad_norm": 0.21918551794623176,
"learning_rate": 1.5855695659092746e-05,
"loss": 0.0618,
"step": 744
},
{
"epoch": 3.6881188118811883,
"grad_norm": 0.23923527333760677,
"learning_rate": 1.5745620765039564e-05,
"loss": 0.0716,
"step": 745
},
{
"epoch": 3.693069306930693,
"grad_norm": 0.2475994058430268,
"learning_rate": 1.563583557980186e-05,
"loss": 0.0737,
"step": 746
},
{
"epoch": 3.698019801980198,
"grad_norm": 0.23662288045909508,
"learning_rate": 1.5526341414719565e-05,
"loss": 0.069,
"step": 747
},
{
"epoch": 3.7029702970297027,
"grad_norm": 0.23536533447903255,
"learning_rate": 1.541713957765649e-05,
"loss": 0.0634,
"step": 748
},
{
"epoch": 3.707920792079208,
"grad_norm": 0.23553261160197025,
"learning_rate": 1.5308231372984723e-05,
"loss": 0.074,
"step": 749
},
{
"epoch": 3.7128712871287126,
"grad_norm": 0.22659254762767775,
"learning_rate": 1.5199618101569003e-05,
"loss": 0.059,
"step": 750
},
{
"epoch": 3.717821782178218,
"grad_norm": 0.22831907929393108,
"learning_rate": 1.5091301060751207e-05,
"loss": 0.0642,
"step": 751
},
{
"epoch": 3.7227722772277225,
"grad_norm": 0.22033603995618226,
"learning_rate": 1.4983281544334896e-05,
"loss": 0.0644,
"step": 752
},
{
"epoch": 3.727722772277228,
"grad_norm": 0.22136073260599423,
"learning_rate": 1.4875560842569767e-05,
"loss": 0.0659,
"step": 753
},
{
"epoch": 3.7326732673267324,
"grad_norm": 0.2152678182143572,
"learning_rate": 1.4768140242136353e-05,
"loss": 0.0634,
"step": 754
},
{
"epoch": 3.737623762376238,
"grad_norm": 0.21396497300394157,
"learning_rate": 1.4661021026130553e-05,
"loss": 0.0528,
"step": 755
},
{
"epoch": 3.7425742574257423,
"grad_norm": 0.2114883903251707,
"learning_rate": 1.4554204474048357e-05,
"loss": 0.0561,
"step": 756
},
{
"epoch": 3.7475247524752477,
"grad_norm": 0.2117441225032521,
"learning_rate": 1.4447691861770591e-05,
"loss": 0.0628,
"step": 757
},
{
"epoch": 3.7524752475247523,
"grad_norm": 0.2323094346863045,
"learning_rate": 1.4341484461547585e-05,
"loss": 0.0655,
"step": 758
},
{
"epoch": 3.7574257425742577,
"grad_norm": 0.22902723203887707,
"learning_rate": 1.4235583541984092e-05,
"loss": 0.0713,
"step": 759
},
{
"epoch": 3.762376237623762,
"grad_norm": 0.2968751987123677,
"learning_rate": 1.412999036802404e-05,
"loss": 0.0703,
"step": 760
},
{
"epoch": 3.7673267326732676,
"grad_norm": 0.22977435481121689,
"learning_rate": 1.4024706200935452e-05,
"loss": 0.072,
"step": 761
},
{
"epoch": 3.772277227722772,
"grad_norm": 0.23461646531105834,
"learning_rate": 1.3919732298295431e-05,
"loss": 0.0607,
"step": 762
},
{
"epoch": 3.7772277227722775,
"grad_norm": 0.21663372881824905,
"learning_rate": 1.3815069913975045e-05,
"loss": 0.0614,
"step": 763
},
{
"epoch": 3.782178217821782,
"grad_norm": 0.23095070576662677,
"learning_rate": 1.3710720298124454e-05,
"loss": 0.0676,
"step": 764
},
{
"epoch": 3.7871287128712874,
"grad_norm": 0.2305740011310324,
"learning_rate": 1.3606684697157876e-05,
"loss": 0.0679,
"step": 765
},
{
"epoch": 3.792079207920792,
"grad_norm": 0.217941058133159,
"learning_rate": 1.350296435373876e-05,
"loss": 0.0617,
"step": 766
},
{
"epoch": 3.7970297029702973,
"grad_norm": 0.23172469767040707,
"learning_rate": 1.3399560506764959e-05,
"loss": 0.0711,
"step": 767
},
{
"epoch": 3.801980198019802,
"grad_norm": 0.2569635240774621,
"learning_rate": 1.3296474391353854e-05,
"loss": 0.0813,
"step": 768
},
{
"epoch": 3.806930693069307,
"grad_norm": 0.3217073053960941,
"learning_rate": 1.3193707238827714e-05,
"loss": 0.0751,
"step": 769
},
{
"epoch": 3.8118811881188117,
"grad_norm": 0.2229682377437594,
"learning_rate": 1.3091260276698847e-05,
"loss": 0.06,
"step": 770
},
{
"epoch": 3.8168316831683167,
"grad_norm": 0.5823009036335607,
"learning_rate": 1.2989134728655097e-05,
"loss": 0.1029,
"step": 771
},
{
"epoch": 3.8217821782178216,
"grad_norm": 0.2196594147367011,
"learning_rate": 1.288733181454508e-05,
"loss": 0.0603,
"step": 772
},
{
"epoch": 3.8267326732673266,
"grad_norm": 0.23552052956281094,
"learning_rate": 1.2785852750363716e-05,
"loss": 0.0656,
"step": 773
},
{
"epoch": 3.8316831683168315,
"grad_norm": 0.2241024828767437,
"learning_rate": 1.2684698748237633e-05,
"loss": 0.0632,
"step": 774
},
{
"epoch": 3.8366336633663365,
"grad_norm": 0.24561519046281696,
"learning_rate": 1.2583871016410764e-05,
"loss": 0.0741,
"step": 775
},
{
"epoch": 3.8415841584158414,
"grad_norm": 0.22027356130191578,
"learning_rate": 1.2483370759229874e-05,
"loss": 0.0602,
"step": 776
},
{
"epoch": 3.8465346534653464,
"grad_norm": 0.21085250976290754,
"learning_rate": 1.2383199177130135e-05,
"loss": 0.0547,
"step": 777
},
{
"epoch": 3.8514851485148514,
"grad_norm": 0.20808078642947644,
"learning_rate": 1.228335746662086e-05,
"loss": 0.0563,
"step": 778
},
{
"epoch": 3.8564356435643563,
"grad_norm": 0.2198172661766001,
"learning_rate": 1.2183846820271147e-05,
"loss": 0.0632,
"step": 779
},
{
"epoch": 3.8613861386138613,
"grad_norm": 0.22050042718742202,
"learning_rate": 1.2084668426695712e-05,
"loss": 0.0608,
"step": 780
},
{
"epoch": 3.866336633663366,
"grad_norm": 0.22382400683228384,
"learning_rate": 1.198582347054062e-05,
"loss": 0.0622,
"step": 781
},
{
"epoch": 3.871287128712871,
"grad_norm": 0.221034670098815,
"learning_rate": 1.1887313132469154e-05,
"loss": 0.0686,
"step": 782
},
{
"epoch": 3.876237623762376,
"grad_norm": 0.2178480835513662,
"learning_rate": 1.178913858914772e-05,
"loss": 0.0656,
"step": 783
},
{
"epoch": 3.881188118811881,
"grad_norm": 0.23113866160558683,
"learning_rate": 1.1691301013231788e-05,
"loss": 0.0716,
"step": 784
},
{
"epoch": 3.886138613861386,
"grad_norm": 0.24146387892581606,
"learning_rate": 1.1593801573351908e-05,
"loss": 0.076,
"step": 785
},
{
"epoch": 3.891089108910891,
"grad_norm": 0.279931142015236,
"learning_rate": 1.1496641434099725e-05,
"loss": 0.0795,
"step": 786
},
{
"epoch": 3.896039603960396,
"grad_norm": 0.22282099068800784,
"learning_rate": 1.1399821756014058e-05,
"loss": 0.0647,
"step": 787
},
{
"epoch": 3.900990099009901,
"grad_norm": 0.22759349001620766,
"learning_rate": 1.1303343695567066e-05,
"loss": 0.0709,
"step": 788
},
{
"epoch": 3.905940594059406,
"grad_norm": 0.21740738469087242,
"learning_rate": 1.1207208405150397e-05,
"loss": 0.0582,
"step": 789
},
{
"epoch": 3.910891089108911,
"grad_norm": 0.23489806348416206,
"learning_rate": 1.1111417033061498e-05,
"loss": 0.0749,
"step": 790
},
{
"epoch": 3.9158415841584158,
"grad_norm": 0.21888448469819882,
"learning_rate": 1.1015970723489828e-05,
"loss": 0.0631,
"step": 791
},
{
"epoch": 3.9207920792079207,
"grad_norm": 0.20856864084290314,
"learning_rate": 1.0920870616503194e-05,
"loss": 0.0534,
"step": 792
},
{
"epoch": 3.9257425742574257,
"grad_norm": 0.2949758911019223,
"learning_rate": 1.082611784803417e-05,
"loss": 0.0637,
"step": 793
},
{
"epoch": 3.9306930693069306,
"grad_norm": 0.22197761683092598,
"learning_rate": 1.0731713549866494e-05,
"loss": 0.0679,
"step": 794
},
{
"epoch": 3.9356435643564356,
"grad_norm": 0.46875314512876026,
"learning_rate": 1.0637658849621593e-05,
"loss": 0.0832,
"step": 795
},
{
"epoch": 3.9405940594059405,
"grad_norm": 0.2293499010106679,
"learning_rate": 1.0543954870745088e-05,
"loss": 0.0668,
"step": 796
},
{
"epoch": 3.9455445544554455,
"grad_norm": 0.2032480907519156,
"learning_rate": 1.0450602732493337e-05,
"loss": 0.0567,
"step": 797
},
{
"epoch": 3.9504950495049505,
"grad_norm": 0.21842971126240499,
"learning_rate": 1.0357603549920129e-05,
"loss": 0.0572,
"step": 798
},
{
"epoch": 3.9554455445544554,
"grad_norm": 0.22221759651106548,
"learning_rate": 1.0264958433863353e-05,
"loss": 0.0577,
"step": 799
},
{
"epoch": 3.9603960396039604,
"grad_norm": 0.23181702880658006,
"learning_rate": 1.0172668490931673e-05,
"loss": 0.0698,
"step": 800
},
{
"epoch": 3.9653465346534653,
"grad_norm": 0.20252785677004106,
"learning_rate": 1.0080734823491402e-05,
"loss": 0.0573,
"step": 801
},
{
"epoch": 3.9702970297029703,
"grad_norm": 0.22486180252186383,
"learning_rate": 9.989158529653257e-06,
"loss": 0.0687,
"step": 802
},
{
"epoch": 3.9752475247524752,
"grad_norm": 0.2276919286466179,
"learning_rate": 9.897940703259264e-06,
"loss": 0.0595,
"step": 803
},
{
"epoch": 3.98019801980198,
"grad_norm": 0.24721806064919266,
"learning_rate": 9.807082433869727e-06,
"loss": 0.0684,
"step": 804
},
{
"epoch": 3.985148514851485,
"grad_norm": 0.20707237969343867,
"learning_rate": 9.716584806750151e-06,
"loss": 0.0495,
"step": 805
},
{
"epoch": 3.99009900990099,
"grad_norm": 0.2636796510612892,
"learning_rate": 9.626448902858359e-06,
"loss": 0.0895,
"step": 806
},
{
"epoch": 3.995049504950495,
"grad_norm": 0.20717337091681312,
"learning_rate": 9.536675798831499e-06,
"loss": 0.0635,
"step": 807
},
{
"epoch": 4.0,
"grad_norm": 0.2307254007299234,
"learning_rate": 9.447266566973211e-06,
"loss": 0.0602,
"step": 808
},
{
"epoch": 4.0049504950495045,
"grad_norm": 0.12624509852601035,
"learning_rate": 9.358222275240884e-06,
"loss": 0.0145,
"step": 809
},
{
"epoch": 4.00990099009901,
"grad_norm": 0.16507407293813872,
"learning_rate": 9.26954398723278e-06,
"loss": 0.0177,
"step": 810
},
{
"epoch": 4.014851485148514,
"grad_norm": 0.1292728036609501,
"learning_rate": 9.181232762175435e-06,
"loss": 0.0165,
"step": 811
},
{
"epoch": 4.01980198019802,
"grad_norm": 0.1080103691046681,
"learning_rate": 9.093289654910946e-06,
"loss": 0.0107,
"step": 812
},
{
"epoch": 4.024752475247524,
"grad_norm": 0.12607608036523452,
"learning_rate": 9.005715715884409e-06,
"loss": 0.0151,
"step": 813
},
{
"epoch": 4.02970297029703,
"grad_norm": 0.1304597478576506,
"learning_rate": 8.918511991131335e-06,
"loss": 0.0156,
"step": 814
},
{
"epoch": 4.034653465346534,
"grad_norm": 0.11336223348572144,
"learning_rate": 8.831679522265167e-06,
"loss": 0.0137,
"step": 815
},
{
"epoch": 4.03960396039604,
"grad_norm": 0.10738022397375838,
"learning_rate": 8.745219346464884e-06,
"loss": 0.0112,
"step": 816
},
{
"epoch": 4.044554455445544,
"grad_norm": 0.10565152711917576,
"learning_rate": 8.659132496462521e-06,
"loss": 0.0122,
"step": 817
},
{
"epoch": 4.0495049504950495,
"grad_norm": 0.10963944208831582,
"learning_rate": 8.57342000053095e-06,
"loss": 0.0132,
"step": 818
},
{
"epoch": 4.054455445544554,
"grad_norm": 0.11269131050613809,
"learning_rate": 8.488082882471476e-06,
"loss": 0.0119,
"step": 819
},
{
"epoch": 4.0594059405940595,
"grad_norm": 0.12401055933444247,
"learning_rate": 8.403122161601699e-06,
"loss": 0.0109,
"step": 820
},
{
"epoch": 4.064356435643564,
"grad_norm": 0.13083619123039256,
"learning_rate": 8.318538852743275e-06,
"loss": 0.0136,
"step": 821
},
{
"epoch": 4.069306930693069,
"grad_norm": 0.11500742537186857,
"learning_rate": 8.23433396620986e-06,
"loss": 0.0117,
"step": 822
},
{
"epoch": 4.074257425742574,
"grad_norm": 0.12961481225832067,
"learning_rate": 8.150508507795005e-06,
"loss": 0.0137,
"step": 823
},
{
"epoch": 4.079207920792079,
"grad_norm": 0.12055499601120245,
"learning_rate": 8.067063478760127e-06,
"loss": 0.0107,
"step": 824
},
{
"epoch": 4.084158415841584,
"grad_norm": 0.15932784631531496,
"learning_rate": 7.983999875822563e-06,
"loss": 0.0156,
"step": 825
},
{
"epoch": 4.089108910891089,
"grad_norm": 0.11910916796855137,
"learning_rate": 7.901318691143678e-06,
"loss": 0.0131,
"step": 826
},
{
"epoch": 4.094059405940594,
"grad_norm": 0.13705672850707565,
"learning_rate": 7.819020912317011e-06,
"loss": 0.0132,
"step": 827
},
{
"epoch": 4.099009900990099,
"grad_norm": 0.13359626054483767,
"learning_rate": 7.73710752235647e-06,
"loss": 0.0135,
"step": 828
},
{
"epoch": 4.103960396039604,
"grad_norm": 0.14419301245049312,
"learning_rate": 7.65557949968459e-06,
"loss": 0.0133,
"step": 829
},
{
"epoch": 4.108910891089109,
"grad_norm": 0.12203061347827363,
"learning_rate": 7.574437818120839e-06,
"loss": 0.01,
"step": 830
},
{
"epoch": 4.1138613861386135,
"grad_norm": 0.1185999849541631,
"learning_rate": 7.4936834468699945e-06,
"loss": 0.01,
"step": 831
},
{
"epoch": 4.118811881188119,
"grad_norm": 0.1420252022264759,
"learning_rate": 7.413317350510589e-06,
"loss": 0.0111,
"step": 832
},
{
"epoch": 4.123762376237623,
"grad_norm": 0.12871738126470406,
"learning_rate": 7.333340488983363e-06,
"loss": 0.0119,
"step": 833
},
{
"epoch": 4.128712871287129,
"grad_norm": 0.09804785229838152,
"learning_rate": 7.253753817579792e-06,
"loss": 0.0062,
"step": 834
},
{
"epoch": 4.133663366336633,
"grad_norm": 0.1500135953230511,
"learning_rate": 7.174558286930682e-06,
"loss": 0.0123,
"step": 835
},
{
"epoch": 4.138613861386139,
"grad_norm": 0.13688770404452,
"learning_rate": 7.095754842994824e-06,
"loss": 0.0145,
"step": 836
},
{
"epoch": 4.143564356435643,
"grad_norm": 0.13128710417128772,
"learning_rate": 7.0173444270477075e-06,
"loss": 0.0134,
"step": 837
},
{
"epoch": 4.148514851485149,
"grad_norm": 0.12492637806478936,
"learning_rate": 6.939327975670256e-06,
"loss": 0.0087,
"step": 838
},
{
"epoch": 4.153465346534653,
"grad_norm": 0.13106643311603597,
"learning_rate": 6.861706420737628e-06,
"loss": 0.0097,
"step": 839
},
{
"epoch": 4.158415841584159,
"grad_norm": 0.1103294994245557,
"learning_rate": 6.784480689408099e-06,
"loss": 0.0075,
"step": 840
},
{
"epoch": 4.163366336633663,
"grad_norm": 0.14473315697134212,
"learning_rate": 6.707651704112028e-06,
"loss": 0.0139,
"step": 841
},
{
"epoch": 4.1683168316831685,
"grad_norm": 0.1166093187701074,
"learning_rate": 6.631220382540755e-06,
"loss": 0.0102,
"step": 842
},
{
"epoch": 4.173267326732673,
"grad_norm": 0.10511608560955692,
"learning_rate": 6.555187637635727e-06,
"loss": 0.0079,
"step": 843
},
{
"epoch": 4.178217821782178,
"grad_norm": 0.11453792840078524,
"learning_rate": 6.479554377577528e-06,
"loss": 0.0098,
"step": 844
},
{
"epoch": 4.183168316831683,
"grad_norm": 0.12632446631314592,
"learning_rate": 6.404321505775053e-06,
"loss": 0.0114,
"step": 845
},
{
"epoch": 4.188118811881188,
"grad_norm": 0.13650562617278209,
"learning_rate": 6.329489920854745e-06,
"loss": 0.0127,
"step": 846
},
{
"epoch": 4.193069306930693,
"grad_norm": 0.12800825431726343,
"learning_rate": 6.255060516649809e-06,
"loss": 0.0121,
"step": 847
},
{
"epoch": 4.198019801980198,
"grad_norm": 0.12058629163517937,
"learning_rate": 6.181034182189592e-06,
"loss": 0.0104,
"step": 848
},
{
"epoch": 4.202970297029703,
"grad_norm": 0.1351110698360575,
"learning_rate": 6.107411801688905e-06,
"loss": 0.0138,
"step": 849
},
{
"epoch": 4.207920792079208,
"grad_norm": 0.12674244407405744,
"learning_rate": 6.034194254537502e-06,
"loss": 0.0087,
"step": 850
},
{
"epoch": 4.212871287128713,
"grad_norm": 0.15735576903425058,
"learning_rate": 5.9613824152895765e-06,
"loss": 0.0112,
"step": 851
},
{
"epoch": 4.217821782178218,
"grad_norm": 0.11717446858945191,
"learning_rate": 5.8889771536532855e-06,
"loss": 0.0104,
"step": 852
},
{
"epoch": 4.2227722772277225,
"grad_norm": 0.13212471056877662,
"learning_rate": 5.8169793344804085e-06,
"loss": 0.0153,
"step": 853
},
{
"epoch": 4.227722772277228,
"grad_norm": 0.13863455505878444,
"learning_rate": 5.7453898177559505e-06,
"loss": 0.0144,
"step": 854
},
{
"epoch": 4.232673267326732,
"grad_norm": 0.11440426947284903,
"learning_rate": 5.674209458587929e-06,
"loss": 0.0107,
"step": 855
},
{
"epoch": 4.237623762376238,
"grad_norm": 0.13893105515641654,
"learning_rate": 5.603439107197149e-06,
"loss": 0.0113,
"step": 856
},
{
"epoch": 4.242574257425742,
"grad_norm": 0.09755126038986718,
"learning_rate": 5.5330796089070064e-06,
"loss": 0.0086,
"step": 857
},
{
"epoch": 4.247524752475248,
"grad_norm": 0.1012621696476356,
"learning_rate": 5.463131804133461e-06,
"loss": 0.0064,
"step": 858
},
{
"epoch": 4.252475247524752,
"grad_norm": 0.11007678351244354,
"learning_rate": 5.393596528374923e-06,
"loss": 0.0092,
"step": 859
},
{
"epoch": 4.257425742574258,
"grad_norm": 0.09888652684365734,
"learning_rate": 5.324474612202335e-06,
"loss": 0.0069,
"step": 860
},
{
"epoch": 4.262376237623762,
"grad_norm": 0.12286862253049159,
"learning_rate": 5.255766881249212e-06,
"loss": 0.01,
"step": 861
},
{
"epoch": 4.267326732673268,
"grad_norm": 0.10142434831826229,
"learning_rate": 5.187474156201786e-06,
"loss": 0.0088,
"step": 862
},
{
"epoch": 4.272277227722772,
"grad_norm": 0.10702805040029474,
"learning_rate": 5.119597252789237e-06,
"loss": 0.0086,
"step": 863
},
{
"epoch": 4.2772277227722775,
"grad_norm": 0.11943904582684445,
"learning_rate": 5.052136981773892e-06,
"loss": 0.0094,
"step": 864
},
{
"epoch": 4.282178217821782,
"grad_norm": 0.1028717674245101,
"learning_rate": 4.9850941489415985e-06,
"loss": 0.0093,
"step": 865
},
{
"epoch": 4.287128712871287,
"grad_norm": 0.11112579540598307,
"learning_rate": 4.918469555092049e-06,
"loss": 0.011,
"step": 866
},
{
"epoch": 4.292079207920792,
"grad_norm": 0.10114297312691053,
"learning_rate": 4.852263996029259e-06,
"loss": 0.0092,
"step": 867
},
{
"epoch": 4.297029702970297,
"grad_norm": 0.097220137032453,
"learning_rate": 4.786478262552012e-06,
"loss": 0.0099,
"step": 868
},
{
"epoch": 4.301980198019802,
"grad_norm": 0.10180171384782043,
"learning_rate": 4.7211131404444825e-06,
"loss": 0.0083,
"step": 869
},
{
"epoch": 4.306930693069307,
"grad_norm": 0.09810672490833108,
"learning_rate": 4.656169410466795e-06,
"loss": 0.0092,
"step": 870
},
{
"epoch": 4.311881188118812,
"grad_norm": 0.10340416632521776,
"learning_rate": 4.591647848345711e-06,
"loss": 0.0083,
"step": 871
},
{
"epoch": 4.316831683168317,
"grad_norm": 0.09960308482221131,
"learning_rate": 4.527549224765362e-06,
"loss": 0.0104,
"step": 872
},
{
"epoch": 4.321782178217822,
"grad_norm": 0.11986640194637614,
"learning_rate": 4.463874305358045e-06,
"loss": 0.0098,
"step": 873
},
{
"epoch": 4.326732673267327,
"grad_norm": 0.1310677721590091,
"learning_rate": 4.400623850695103e-06,
"loss": 0.0112,
"step": 874
},
{
"epoch": 4.3316831683168315,
"grad_norm": 0.11364550376478703,
"learning_rate": 4.337798616277806e-06,
"loss": 0.0098,
"step": 875
},
{
"epoch": 4.336633663366337,
"grad_norm": 0.10732139674973952,
"learning_rate": 4.275399352528342e-06,
"loss": 0.0096,
"step": 876
},
{
"epoch": 4.341584158415841,
"grad_norm": 0.11726008537633273,
"learning_rate": 4.213426804780838e-06,
"loss": 0.0099,
"step": 877
},
{
"epoch": 4.346534653465347,
"grad_norm": 0.10648615788140584,
"learning_rate": 4.151881713272472e-06,
"loss": 0.0081,
"step": 878
},
{
"epoch": 4.351485148514851,
"grad_norm": 0.14887235960616102,
"learning_rate": 4.090764813134644e-06,
"loss": 0.0141,
"step": 879
},
{
"epoch": 4.356435643564357,
"grad_norm": 0.11352930689713439,
"learning_rate": 4.0300768343841805e-06,
"loss": 0.0109,
"step": 880
},
{
"epoch": 4.361386138613861,
"grad_norm": 0.1588437384116596,
"learning_rate": 3.969818501914597e-06,
"loss": 0.0101,
"step": 881
},
{
"epoch": 4.366336633663367,
"grad_norm": 0.12435479835383088,
"learning_rate": 3.909990535487472e-06,
"loss": 0.0102,
"step": 882
},
{
"epoch": 4.371287128712871,
"grad_norm": 0.09983693257790452,
"learning_rate": 3.850593649723804e-06,
"loss": 0.0089,
"step": 883
},
{
"epoch": 4.376237623762377,
"grad_norm": 0.1000375497881339,
"learning_rate": 3.7916285540955566e-06,
"loss": 0.0093,
"step": 884
},
{
"epoch": 4.381188118811881,
"grad_norm": 0.10545145468420694,
"learning_rate": 3.733095952917101e-06,
"loss": 0.0093,
"step": 885
},
{
"epoch": 4.3861386138613865,
"grad_norm": 0.12375538991124814,
"learning_rate": 3.6749965453368375e-06,
"loss": 0.0129,
"step": 886
},
{
"epoch": 4.391089108910891,
"grad_norm": 0.11154078992980383,
"learning_rate": 3.617331025328845e-06,
"loss": 0.0109,
"step": 887
},
{
"epoch": 4.396039603960396,
"grad_norm": 0.12282743124663045,
"learning_rate": 3.5601000816846053e-06,
"loss": 0.0117,
"step": 888
},
{
"epoch": 4.400990099009901,
"grad_norm": 0.09350300949252467,
"learning_rate": 3.50330439800473e-06,
"loss": 0.0061,
"step": 889
},
{
"epoch": 4.405940594059406,
"grad_norm": 0.12085328848569887,
"learning_rate": 3.4469446526908555e-06,
"loss": 0.0117,
"step": 890
},
{
"epoch": 4.410891089108911,
"grad_norm": 0.10985567703832641,
"learning_rate": 3.3910215189374916e-06,
"loss": 0.0095,
"step": 891
},
{
"epoch": 4.415841584158416,
"grad_norm": 0.1315796533550786,
"learning_rate": 3.3355356647239987e-06,
"loss": 0.0113,
"step": 892
},
{
"epoch": 4.420792079207921,
"grad_norm": 0.10069554863342839,
"learning_rate": 3.2804877528066225e-06,
"loss": 0.0092,
"step": 893
},
{
"epoch": 4.425742574257426,
"grad_norm": 0.10433578338492921,
"learning_rate": 3.225878440710544e-06,
"loss": 0.0082,
"step": 894
},
{
"epoch": 4.430693069306931,
"grad_norm": 0.12234230112930741,
"learning_rate": 3.171708380722072e-06,
"loss": 0.0106,
"step": 895
},
{
"epoch": 4.435643564356436,
"grad_norm": 0.11967807029664655,
"learning_rate": 3.1179782198807973e-06,
"loss": 0.0109,
"step": 896
},
{
"epoch": 4.4405940594059405,
"grad_norm": 0.12548981747614735,
"learning_rate": 3.064688599971901e-06,
"loss": 0.0104,
"step": 897
},
{
"epoch": 4.445544554455446,
"grad_norm": 0.1410817044638762,
"learning_rate": 3.011840157518493e-06,
"loss": 0.0133,
"step": 898
},
{
"epoch": 4.4504950495049505,
"grad_norm": 0.10380510689456897,
"learning_rate": 2.9594335237739778e-06,
"loss": 0.0082,
"step": 899
},
{
"epoch": 4.455445544554456,
"grad_norm": 0.11959316610225865,
"learning_rate": 2.9074693247145513e-06,
"loss": 0.0088,
"step": 900
},
{
"epoch": 4.46039603960396,
"grad_norm": 0.1603922624135695,
"learning_rate": 2.85594818103168e-06,
"loss": 0.0153,
"step": 901
},
{
"epoch": 4.465346534653466,
"grad_norm": 0.10661183180213293,
"learning_rate": 2.804870708124745e-06,
"loss": 0.0098,
"step": 902
},
{
"epoch": 4.47029702970297,
"grad_norm": 0.1293618057772319,
"learning_rate": 2.754237516093623e-06,
"loss": 0.0108,
"step": 903
},
{
"epoch": 4.475247524752476,
"grad_norm": 0.17181592274571184,
"learning_rate": 2.7040492097314498e-06,
"loss": 0.012,
"step": 904
},
{
"epoch": 4.48019801980198,
"grad_norm": 0.1289378035263025,
"learning_rate": 2.6543063885173936e-06,
"loss": 0.0128,
"step": 905
},
{
"epoch": 4.485148514851485,
"grad_norm": 0.09592619922978449,
"learning_rate": 2.605009646609453e-06,
"loss": 0.0079,
"step": 906
},
{
"epoch": 4.49009900990099,
"grad_norm": 0.10097183688451279,
"learning_rate": 2.556159572837422e-06,
"loss": 0.0091,
"step": 907
},
{
"epoch": 4.4950495049504955,
"grad_norm": 0.11735122065163751,
"learning_rate": 2.5077567506957977e-06,
"loss": 0.0086,
"step": 908
},
{
"epoch": 4.5,
"grad_norm": 0.10580716713405154,
"learning_rate": 2.459801758336835e-06,
"loss": 0.0099,
"step": 909
},
{
"epoch": 4.5049504950495045,
"grad_norm": 0.097587996056804,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.0072,
"step": 910
},
{
"epoch": 4.50990099009901,
"grad_norm": 0.10194905262661175,
"learning_rate": 2.3652375488234114e-06,
"loss": 0.0088,
"step": 911
},
{
"epoch": 4.514851485148515,
"grad_norm": 0.11494423366378297,
"learning_rate": 2.3186294612004365e-06,
"loss": 0.0089,
"step": 912
},
{
"epoch": 4.51980198019802,
"grad_norm": 0.10539132079737316,
"learning_rate": 2.272471462409622e-06,
"loss": 0.008,
"step": 913
},
{
"epoch": 4.524752475247524,
"grad_norm": 0.13942440167736256,
"learning_rate": 2.226764103789716e-06,
"loss": 0.0107,
"step": 914
},
{
"epoch": 4.52970297029703,
"grad_norm": 0.133810524249386,
"learning_rate": 2.181507931296749e-06,
"loss": 0.0102,
"step": 915
},
{
"epoch": 4.534653465346535,
"grad_norm": 0.10558494444651857,
"learning_rate": 2.136703485497531e-06,
"loss": 0.0085,
"step": 916
},
{
"epoch": 4.53960396039604,
"grad_norm": 0.10838629888708354,
"learning_rate": 2.0923513015631646e-06,
"loss": 0.0092,
"step": 917
},
{
"epoch": 4.544554455445544,
"grad_norm": 0.11934575207962728,
"learning_rate": 2.0484519092626652e-06,
"loss": 0.0102,
"step": 918
},
{
"epoch": 4.5495049504950495,
"grad_norm": 0.11420310289601006,
"learning_rate": 2.0050058329566367e-06,
"loss": 0.0109,
"step": 919
},
{
"epoch": 4.554455445544555,
"grad_norm": 0.1288165937047373,
"learning_rate": 1.9620135915909968e-06,
"loss": 0.012,
"step": 920
},
{
"epoch": 4.5594059405940595,
"grad_norm": 0.12492928381215326,
"learning_rate": 1.9194756986908025e-06,
"loss": 0.012,
"step": 921
},
{
"epoch": 4.564356435643564,
"grad_norm": 0.1562283004243408,
"learning_rate": 1.8773926623541028e-06,
"loss": 0.013,
"step": 922
},
{
"epoch": 4.569306930693069,
"grad_norm": 0.11262911274840161,
"learning_rate": 1.835764985245856e-06,
"loss": 0.0106,
"step": 923
},
{
"epoch": 4.574257425742574,
"grad_norm": 0.10018479525655571,
"learning_rate": 1.7945931645919358e-06,
"loss": 0.0085,
"step": 924
},
{
"epoch": 4.579207920792079,
"grad_norm": 0.12831476700417802,
"learning_rate": 1.7538776921731937e-06,
"loss": 0.0127,
"step": 925
},
{
"epoch": 4.584158415841584,
"grad_norm": 0.10899789389203231,
"learning_rate": 1.713619054319593e-06,
"loss": 0.0102,
"step": 926
},
{
"epoch": 4.589108910891089,
"grad_norm": 0.12180796544381997,
"learning_rate": 1.6738177319044036e-06,
"loss": 0.0103,
"step": 927
},
{
"epoch": 4.594059405940594,
"grad_norm": 0.1304139455450118,
"learning_rate": 1.6344742003384161e-06,
"loss": 0.0117,
"step": 928
},
{
"epoch": 4.599009900990099,
"grad_norm": 0.09331975478034672,
"learning_rate": 1.5955889295643111e-06,
"loss": 0.0075,
"step": 929
},
{
"epoch": 4.603960396039604,
"grad_norm": 0.12135625400911418,
"learning_rate": 1.5571623840510185e-06,
"loss": 0.0091,
"step": 930
},
{
"epoch": 4.608910891089109,
"grad_norm": 0.13446080469374425,
"learning_rate": 1.519195022788198e-06,
"loss": 0.0127,
"step": 931
},
{
"epoch": 4.6138613861386135,
"grad_norm": 0.11265755081466036,
"learning_rate": 1.481687299280723e-06,
"loss": 0.011,
"step": 932
},
{
"epoch": 4.618811881188119,
"grad_norm": 0.1012739526325802,
"learning_rate": 1.4446396615432855e-06,
"loss": 0.0069,
"step": 933
},
{
"epoch": 4.623762376237623,
"grad_norm": 0.09877519513950658,
"learning_rate": 1.4080525520950184e-06,
"loss": 0.0085,
"step": 934
},
{
"epoch": 4.628712871287129,
"grad_norm": 0.10904407230005678,
"learning_rate": 1.3719264079542628e-06,
"loss": 0.0105,
"step": 935
},
{
"epoch": 4.633663366336633,
"grad_norm": 0.09084607268010596,
"learning_rate": 1.33626166063328e-06,
"loss": 0.0064,
"step": 936
},
{
"epoch": 4.638613861386139,
"grad_norm": 0.10777246325097957,
"learning_rate": 1.3010587361331673e-06,
"loss": 0.0094,
"step": 937
},
{
"epoch": 4.643564356435643,
"grad_norm": 0.1210887592148449,
"learning_rate": 1.2663180549387e-06,
"loss": 0.0125,
"step": 938
},
{
"epoch": 4.648514851485149,
"grad_norm": 0.11166984220404819,
"learning_rate": 1.2320400320133551e-06,
"loss": 0.0079,
"step": 939
},
{
"epoch": 4.653465346534653,
"grad_norm": 0.12155380749860617,
"learning_rate": 1.1982250767943593e-06,
"loss": 0.0121,
"step": 940
},
{
"epoch": 4.658415841584159,
"grad_norm": 0.09874444900173716,
"learning_rate": 1.1648735931877543e-06,
"loss": 0.0075,
"step": 941
},
{
"epoch": 4.663366336633663,
"grad_norm": 0.12797079584414423,
"learning_rate": 1.131985979563619e-06,
"loss": 0.0084,
"step": 942
},
{
"epoch": 4.6683168316831685,
"grad_norm": 0.12083572862418303,
"learning_rate": 1.0995626287512828e-06,
"loss": 0.0101,
"step": 943
},
{
"epoch": 4.673267326732673,
"grad_norm": 0.11920811138204533,
"learning_rate": 1.0676039280346439e-06,
"loss": 0.0088,
"step": 944
},
{
"epoch": 4.678217821782178,
"grad_norm": 0.120811785785541,
"learning_rate": 1.036110259147547e-06,
"loss": 0.0103,
"step": 945
},
{
"epoch": 4.683168316831683,
"grad_norm": 0.09314579688215502,
"learning_rate": 1.0050819982692083e-06,
"loss": 0.0087,
"step": 946
},
{
"epoch": 4.688118811881188,
"grad_norm": 0.11433215178887399,
"learning_rate": 9.745195160197452e-07,
"loss": 0.008,
"step": 947
},
{
"epoch": 4.693069306930693,
"grad_norm": 0.11421090935566788,
"learning_rate": 9.444231774557199e-07,
"loss": 0.0098,
"step": 948
},
{
"epoch": 4.698019801980198,
"grad_norm": 0.13351330986822083,
"learning_rate": 9.147933420658117e-07,
"loss": 0.0126,
"step": 949
},
{
"epoch": 4.702970297029703,
"grad_norm": 0.12319551924430684,
"learning_rate": 8.856303637664987e-07,
"loss": 0.0112,
"step": 950
},
{
"epoch": 4.707920792079208,
"grad_norm": 0.11647737457481774,
"learning_rate": 8.569345908978355e-07,
"loss": 0.0089,
"step": 951
},
{
"epoch": 4.712871287128713,
"grad_norm": 0.08526467868390036,
"learning_rate": 8.287063662193095e-07,
"loss": 0.0065,
"step": 952
},
{
"epoch": 4.717821782178218,
"grad_norm": 0.10819662235450227,
"learning_rate": 8.009460269057156e-07,
"loss": 0.0093,
"step": 953
},
{
"epoch": 4.7227722772277225,
"grad_norm": 0.10795810812921569,
"learning_rate": 7.736539045431634e-07,
"loss": 0.0083,
"step": 954
},
{
"epoch": 4.727722772277228,
"grad_norm": 0.13790878328791822,
"learning_rate": 7.468303251250764e-07,
"loss": 0.0117,
"step": 955
},
{
"epoch": 4.732673267326732,
"grad_norm": 0.12340124149908364,
"learning_rate": 7.204756090483411e-07,
"loss": 0.0131,
"step": 956
},
{
"epoch": 4.737623762376238,
"grad_norm": 0.11659198110649899,
"learning_rate": 6.945900711094534e-07,
"loss": 0.0091,
"step": 957
},
{
"epoch": 4.742574257425742,
"grad_norm": 0.1303246749548437,
"learning_rate": 6.691740205007602e-07,
"loss": 0.0105,
"step": 958
},
{
"epoch": 4.747524752475248,
"grad_norm": 0.12015398011009239,
"learning_rate": 6.442277608067838e-07,
"loss": 0.0091,
"step": 959
},
{
"epoch": 4.752475247524752,
"grad_norm": 0.11438450355082716,
"learning_rate": 6.197515900005613e-07,
"loss": 0.0086,
"step": 960
},
{
"epoch": 4.757425742574258,
"grad_norm": 0.10035792670493265,
"learning_rate": 5.957458004401328e-07,
"loss": 0.009,
"step": 961
},
{
"epoch": 4.762376237623762,
"grad_norm": 0.12945304146742145,
"learning_rate": 5.722106788649928e-07,
"loss": 0.0086,
"step": 962
},
{
"epoch": 4.767326732673268,
"grad_norm": 0.1313966501532762,
"learning_rate": 5.491465063927282e-07,
"loss": 0.016,
"step": 963
},
{
"epoch": 4.772277227722772,
"grad_norm": 0.09991929616458903,
"learning_rate": 5.265535585156079e-07,
"loss": 0.0077,
"step": 964
},
{
"epoch": 4.7772277227722775,
"grad_norm": 0.1272495031762234,
"learning_rate": 5.044321050973189e-07,
"loss": 0.0105,
"step": 965
},
{
"epoch": 4.782178217821782,
"grad_norm": 0.11503642147592588,
"learning_rate": 4.827824103697332e-07,
"loss": 0.0092,
"step": 966
},
{
"epoch": 4.787128712871287,
"grad_norm": 0.12710578688405658,
"learning_rate": 4.616047329297546e-07,
"loss": 0.0125,
"step": 967
},
{
"epoch": 4.792079207920792,
"grad_norm": 0.11374035594717975,
"learning_rate": 4.408993257362282e-07,
"loss": 0.0104,
"step": 968
},
{
"epoch": 4.797029702970297,
"grad_norm": 0.08858002009294087,
"learning_rate": 4.206664361069379e-07,
"loss": 0.0076,
"step": 969
},
{
"epoch": 4.801980198019802,
"grad_norm": 0.11134716407300935,
"learning_rate": 4.0090630571560927e-07,
"loss": 0.0115,
"step": 970
},
{
"epoch": 4.806930693069307,
"grad_norm": 0.12525044878492964,
"learning_rate": 3.8161917058906706e-07,
"loss": 0.012,
"step": 971
},
{
"epoch": 4.811881188118812,
"grad_norm": 0.10392944064091289,
"learning_rate": 3.628052611043842e-07,
"loss": 0.01,
"step": 972
},
{
"epoch": 4.816831683168317,
"grad_norm": 0.10164873644042355,
"learning_rate": 3.444648019861552e-07,
"loss": 0.0076,
"step": 973
},
{
"epoch": 4.821782178217822,
"grad_norm": 0.10039228066726238,
"learning_rate": 3.265980123038004e-07,
"loss": 0.0078,
"step": 974
},
{
"epoch": 4.826732673267327,
"grad_norm": 0.14843542740351492,
"learning_rate": 3.0920510546894156e-07,
"loss": 0.0135,
"step": 975
},
{
"epoch": 4.8316831683168315,
"grad_norm": 0.10601558319793576,
"learning_rate": 2.9228628923285705e-07,
"loss": 0.0095,
"step": 976
},
{
"epoch": 4.836633663366337,
"grad_norm": 0.1244248427623547,
"learning_rate": 2.7584176568401734e-07,
"loss": 0.0115,
"step": 977
},
{
"epoch": 4.841584158415841,
"grad_norm": 0.13885166923612954,
"learning_rate": 2.5987173124564224e-07,
"loss": 0.0124,
"step": 978
},
{
"epoch": 4.846534653465347,
"grad_norm": 0.11860523889858633,
"learning_rate": 2.4437637667338754e-07,
"loss": 0.0106,
"step": 979
},
{
"epoch": 4.851485148514851,
"grad_norm": 0.11637752351297623,
"learning_rate": 2.2935588705302658e-07,
"loss": 0.0087,
"step": 980
},
{
"epoch": 4.856435643564357,
"grad_norm": 0.0970118081249502,
"learning_rate": 2.148104417982788e-07,
"loss": 0.0069,
"step": 981
},
{
"epoch": 4.861386138613861,
"grad_norm": 0.10050712126829177,
"learning_rate": 2.0074021464864702e-07,
"loss": 0.0084,
"step": 982
},
{
"epoch": 4.866336633663367,
"grad_norm": 0.09418273967101161,
"learning_rate": 1.871453736673301e-07,
"loss": 0.0069,
"step": 983
},
{
"epoch": 4.871287128712871,
"grad_norm": 0.09839098778033135,
"learning_rate": 1.740260812392558e-07,
"loss": 0.0075,
"step": 984
},
{
"epoch": 4.876237623762377,
"grad_norm": 0.11625610481951301,
"learning_rate": 1.6138249406909558e-07,
"loss": 0.0097,
"step": 985
},
{
"epoch": 4.881188118811881,
"grad_norm": 0.10209939301826869,
"learning_rate": 1.4921476317941719e-07,
"loss": 0.0068,
"step": 986
},
{
"epoch": 4.8861386138613865,
"grad_norm": 0.10859072250356432,
"learning_rate": 1.3752303390887733e-07,
"loss": 0.0094,
"step": 987
},
{
"epoch": 4.891089108910891,
"grad_norm": 0.10996490648405818,
"learning_rate": 1.2630744591048516e-07,
"loss": 0.0083,
"step": 988
},
{
"epoch": 4.896039603960396,
"grad_norm": 0.11600550545963038,
"learning_rate": 1.1556813314993698e-07,
"loss": 0.0101,
"step": 989
},
{
"epoch": 4.900990099009901,
"grad_norm": 0.12735173508972217,
"learning_rate": 1.0530522390400422e-07,
"loss": 0.0099,
"step": 990
},
{
"epoch": 4.905940594059406,
"grad_norm": 0.12475506745382753,
"learning_rate": 9.551884075901463e-08,
"loss": 0.0111,
"step": 991
},
{
"epoch": 4.910891089108911,
"grad_norm": 0.09638370643324946,
"learning_rate": 8.620910060938681e-08,
"loss": 0.008,
"step": 992
},
{
"epoch": 4.915841584158416,
"grad_norm": 0.11118069759156585,
"learning_rate": 7.737611465622686e-08,
"loss": 0.0084,
"step": 993
},
{
"epoch": 4.920792079207921,
"grad_norm": 0.09421039333272324,
"learning_rate": 6.901998840600055e-08,
"loss": 0.008,
"step": 994
},
{
"epoch": 4.925742574257426,
"grad_norm": 0.08785979683980723,
"learning_rate": 6.11408216692766e-08,
"loss": 0.0077,
"step": 995
},
{
"epoch": 4.930693069306931,
"grad_norm": 0.11723033297350285,
"learning_rate": 5.373870855954089e-08,
"loss": 0.0111,
"step": 996
},
{
"epoch": 4.935643564356436,
"grad_norm": 0.09947063813063768,
"learning_rate": 4.681373749205964e-08,
"loss": 0.0095,
"step": 997
},
{
"epoch": 4.9405940594059405,
"grad_norm": 0.134222756516452,
"learning_rate": 4.036599118282691e-08,
"loss": 0.016,
"step": 998
},
{
"epoch": 4.945544554455445,
"grad_norm": 0.12883240275435368,
"learning_rate": 3.439554664758316e-08,
"loss": 0.0104,
"step": 999
},
{
"epoch": 4.9504950495049505,
"grad_norm": 0.12150651151338142,
"learning_rate": 2.890247520089151e-08,
"loss": 0.0093,
"step": 1000
},
{
"epoch": 4.955445544554456,
"grad_norm": 0.10677161131156719,
"learning_rate": 2.3886842455285166e-08,
"loss": 0.0101,
"step": 1001
},
{
"epoch": 4.96039603960396,
"grad_norm": 0.1394432398003884,
"learning_rate": 1.934870832047686e-08,
"loss": 0.0108,
"step": 1002
},
{
"epoch": 4.965346534653465,
"grad_norm": 0.10205391962906493,
"learning_rate": 1.528812700266169e-08,
"loss": 0.0071,
"step": 1003
},
{
"epoch": 4.97029702970297,
"grad_norm": 0.14245472257343503,
"learning_rate": 1.1705147003842065e-08,
"loss": 0.0125,
"step": 1004
},
{
"epoch": 4.975247524752476,
"grad_norm": 0.12217355514292874,
"learning_rate": 8.59981112128594e-09,
"loss": 0.011,
"step": 1005
},
{
"epoch": 4.98019801980198,
"grad_norm": 0.11487251605220974,
"learning_rate": 5.972156446980571e-09,
"loss": 0.0104,
"step": 1006
},
{
"epoch": 4.985148514851485,
"grad_norm": 0.11454656357310349,
"learning_rate": 3.822214367197319e-09,
"loss": 0.0099,
"step": 1007
},
{
"epoch": 4.99009900990099,
"grad_norm": 0.11034869889608963,
"learning_rate": 2.150010562140814e-09,
"loss": 0.0077,
"step": 1008
},
{
"epoch": 4.9950495049504955,
"grad_norm": 0.11755320903989616,
"learning_rate": 9.555650056070065e-10,
"loss": 0.0111,
"step": 1009
},
{
"epoch": 5.0,
"grad_norm": 0.12274589377449001,
"learning_rate": 2.3889196477000497e-10,
"loss": 0.0082,
"step": 1010
},
{
"epoch": 5.0,
"step": 1010,
"total_flos": 889001858826240.0,
"train_loss": 0.25745383046733417,
"train_runtime": 97110.6799,
"train_samples_per_second": 0.083,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1,
"max_steps": 1010,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 889001858826240.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}