|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 1010, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0049504950495049506, |
|
"grad_norm": 3.599481981308616, |
|
"learning_rate": 0.0, |
|
"loss": 0.8893, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.009900990099009901, |
|
"grad_norm": 4.571227545210381, |
|
"learning_rate": 7.920792079207921e-07, |
|
"loss": 0.9924, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01485148514851485, |
|
"grad_norm": 4.290318174261695, |
|
"learning_rate": 1.5841584158415842e-06, |
|
"loss": 0.9284, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.019801980198019802, |
|
"grad_norm": 4.063887886041896, |
|
"learning_rate": 2.3762376237623762e-06, |
|
"loss": 0.9605, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.024752475247524754, |
|
"grad_norm": 3.0457401787892744, |
|
"learning_rate": 3.1683168316831685e-06, |
|
"loss": 0.8544, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0297029702970297, |
|
"grad_norm": 2.5802449546715653, |
|
"learning_rate": 3.960396039603961e-06, |
|
"loss": 0.896, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.034653465346534656, |
|
"grad_norm": 2.27569894817667, |
|
"learning_rate": 4.7524752475247525e-06, |
|
"loss": 0.8519, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.039603960396039604, |
|
"grad_norm": 2.128661648062913, |
|
"learning_rate": 5.544554455445545e-06, |
|
"loss": 0.8402, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.04455445544554455, |
|
"grad_norm": 1.762558880637763, |
|
"learning_rate": 6.336633663366337e-06, |
|
"loss": 0.8403, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.04950495049504951, |
|
"grad_norm": 1.4450553165390079, |
|
"learning_rate": 7.128712871287129e-06, |
|
"loss": 0.774, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.054455445544554455, |
|
"grad_norm": 1.7583923085545512, |
|
"learning_rate": 7.920792079207921e-06, |
|
"loss": 0.7121, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0594059405940594, |
|
"grad_norm": 1.5091044617389875, |
|
"learning_rate": 8.712871287128714e-06, |
|
"loss": 0.711, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.06435643564356436, |
|
"grad_norm": 1.4672138042959566, |
|
"learning_rate": 9.504950495049505e-06, |
|
"loss": 0.6526, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.06930693069306931, |
|
"grad_norm": 1.1729096644060144, |
|
"learning_rate": 1.0297029702970298e-05, |
|
"loss": 0.6353, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.07425742574257425, |
|
"grad_norm": 1.2698139994056772, |
|
"learning_rate": 1.108910891089109e-05, |
|
"loss": 0.6655, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.07920792079207921, |
|
"grad_norm": 1.4992771430082272, |
|
"learning_rate": 1.1881188118811881e-05, |
|
"loss": 0.6069, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.08415841584158416, |
|
"grad_norm": 1.2827914835014287, |
|
"learning_rate": 1.2673267326732674e-05, |
|
"loss": 0.6052, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0891089108910891, |
|
"grad_norm": 1.1528589297349217, |
|
"learning_rate": 1.3465346534653467e-05, |
|
"loss": 0.6348, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.09405940594059406, |
|
"grad_norm": 0.9829400287103841, |
|
"learning_rate": 1.4257425742574257e-05, |
|
"loss": 0.6186, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.09900990099009901, |
|
"grad_norm": 0.8874280395940564, |
|
"learning_rate": 1.504950495049505e-05, |
|
"loss": 0.6159, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.10396039603960396, |
|
"grad_norm": 0.9732224407691762, |
|
"learning_rate": 1.5841584158415843e-05, |
|
"loss": 0.5948, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.10891089108910891, |
|
"grad_norm": 0.9422404230915372, |
|
"learning_rate": 1.6633663366336635e-05, |
|
"loss": 0.6051, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.11386138613861387, |
|
"grad_norm": 0.909292277442357, |
|
"learning_rate": 1.7425742574257428e-05, |
|
"loss": 0.6207, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1188118811881188, |
|
"grad_norm": 0.9645884004296726, |
|
"learning_rate": 1.821782178217822e-05, |
|
"loss": 0.6, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.12376237623762376, |
|
"grad_norm": 0.8781009838578941, |
|
"learning_rate": 1.900990099009901e-05, |
|
"loss": 0.5568, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.12871287128712872, |
|
"grad_norm": 0.7687131742426503, |
|
"learning_rate": 1.9801980198019803e-05, |
|
"loss": 0.5689, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.13366336633663367, |
|
"grad_norm": 45.68743449097319, |
|
"learning_rate": 2.0594059405940595e-05, |
|
"loss": 0.6056, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.13861386138613863, |
|
"grad_norm": 1.42976250276645, |
|
"learning_rate": 2.1386138613861388e-05, |
|
"loss": 0.5793, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.14356435643564355, |
|
"grad_norm": 1.3124345694625106, |
|
"learning_rate": 2.217821782178218e-05, |
|
"loss": 0.5673, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1485148514851485, |
|
"grad_norm": 0.8946826085599571, |
|
"learning_rate": 2.297029702970297e-05, |
|
"loss": 0.6001, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.15346534653465346, |
|
"grad_norm": 1.0396117670628082, |
|
"learning_rate": 2.3762376237623762e-05, |
|
"loss": 0.5742, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.15841584158415842, |
|
"grad_norm": 1.1197910610686859, |
|
"learning_rate": 2.4554455445544555e-05, |
|
"loss": 0.5714, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.16336633663366337, |
|
"grad_norm": 0.7961887362378622, |
|
"learning_rate": 2.5346534653465348e-05, |
|
"loss": 0.5404, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.16831683168316833, |
|
"grad_norm": 0.8216770935724286, |
|
"learning_rate": 2.613861386138614e-05, |
|
"loss": 0.5441, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.17326732673267325, |
|
"grad_norm": 0.8374910782342574, |
|
"learning_rate": 2.6930693069306933e-05, |
|
"loss": 0.5727, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1782178217821782, |
|
"grad_norm": 0.8723043518197049, |
|
"learning_rate": 2.7722772277227722e-05, |
|
"loss": 0.53, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.18316831683168316, |
|
"grad_norm": 0.7368093656466949, |
|
"learning_rate": 2.8514851485148515e-05, |
|
"loss": 0.5627, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.18811881188118812, |
|
"grad_norm": 0.839263691011532, |
|
"learning_rate": 2.9306930693069308e-05, |
|
"loss": 0.5832, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.19306930693069307, |
|
"grad_norm": 0.7426574483260017, |
|
"learning_rate": 3.00990099009901e-05, |
|
"loss": 0.5631, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.19801980198019803, |
|
"grad_norm": 0.7651839720480437, |
|
"learning_rate": 3.0891089108910896e-05, |
|
"loss": 0.551, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.20297029702970298, |
|
"grad_norm": 0.769095073737041, |
|
"learning_rate": 3.1683168316831686e-05, |
|
"loss": 0.5618, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2079207920792079, |
|
"grad_norm": 0.8259803329688946, |
|
"learning_rate": 3.247524752475248e-05, |
|
"loss": 0.562, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.21287128712871287, |
|
"grad_norm": 0.6691753174649648, |
|
"learning_rate": 3.326732673267327e-05, |
|
"loss": 0.5324, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.21782178217821782, |
|
"grad_norm": 0.7851088248969034, |
|
"learning_rate": 3.405940594059406e-05, |
|
"loss": 0.5459, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.22277227722772278, |
|
"grad_norm": 0.7186552394181012, |
|
"learning_rate": 3.4851485148514856e-05, |
|
"loss": 0.5497, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.22772277227722773, |
|
"grad_norm": 0.8608611817434151, |
|
"learning_rate": 3.5643564356435645e-05, |
|
"loss": 0.51, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.23267326732673269, |
|
"grad_norm": 0.750298380499875, |
|
"learning_rate": 3.643564356435644e-05, |
|
"loss": 0.5151, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.2376237623762376, |
|
"grad_norm": 0.8882569861170021, |
|
"learning_rate": 3.722772277227723e-05, |
|
"loss": 0.5313, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.24257425742574257, |
|
"grad_norm": 0.7184108588660996, |
|
"learning_rate": 3.801980198019802e-05, |
|
"loss": 0.5365, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.24752475247524752, |
|
"grad_norm": 0.6639444774645417, |
|
"learning_rate": 3.8811881188118816e-05, |
|
"loss": 0.525, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2524752475247525, |
|
"grad_norm": 0.5990381795095598, |
|
"learning_rate": 3.9603960396039605e-05, |
|
"loss": 0.5231, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.25742574257425743, |
|
"grad_norm": 0.7385397760086568, |
|
"learning_rate": 4.03960396039604e-05, |
|
"loss": 0.5334, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.2623762376237624, |
|
"grad_norm": 0.6825332922011768, |
|
"learning_rate": 4.118811881188119e-05, |
|
"loss": 0.5506, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.26732673267326734, |
|
"grad_norm": 0.66432008262571, |
|
"learning_rate": 4.1980198019801987e-05, |
|
"loss": 0.5592, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.2722772277227723, |
|
"grad_norm": 0.7417922067610243, |
|
"learning_rate": 4.2772277227722776e-05, |
|
"loss": 0.5465, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.27722772277227725, |
|
"grad_norm": 0.7400946921357555, |
|
"learning_rate": 4.356435643564357e-05, |
|
"loss": 0.5414, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.28217821782178215, |
|
"grad_norm": 0.868329268810697, |
|
"learning_rate": 4.435643564356436e-05, |
|
"loss": 0.5417, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.2871287128712871, |
|
"grad_norm": 0.6825435572314523, |
|
"learning_rate": 4.514851485148515e-05, |
|
"loss": 0.5321, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.29207920792079206, |
|
"grad_norm": 0.9429162079814348, |
|
"learning_rate": 4.594059405940594e-05, |
|
"loss": 0.547, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.297029702970297, |
|
"grad_norm": 0.6941868595008334, |
|
"learning_rate": 4.6732673267326736e-05, |
|
"loss": 0.5133, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.30198019801980197, |
|
"grad_norm": 0.7090210319685202, |
|
"learning_rate": 4.7524752475247525e-05, |
|
"loss": 0.5441, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.3069306930693069, |
|
"grad_norm": 0.639944106128008, |
|
"learning_rate": 4.831683168316832e-05, |
|
"loss": 0.5201, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.3118811881188119, |
|
"grad_norm": 0.6675816599082344, |
|
"learning_rate": 4.910891089108911e-05, |
|
"loss": 0.544, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.31683168316831684, |
|
"grad_norm": 0.7225958900836226, |
|
"learning_rate": 4.9900990099009906e-05, |
|
"loss": 0.5325, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.3217821782178218, |
|
"grad_norm": 0.7971869756828545, |
|
"learning_rate": 5.0693069306930696e-05, |
|
"loss": 0.5432, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.32673267326732675, |
|
"grad_norm": 0.807211676486099, |
|
"learning_rate": 5.148514851485149e-05, |
|
"loss": 0.5794, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.3316831683168317, |
|
"grad_norm": 0.6526247620861514, |
|
"learning_rate": 5.227722772277228e-05, |
|
"loss": 0.5329, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.33663366336633666, |
|
"grad_norm": 0.8932900521352153, |
|
"learning_rate": 5.306930693069308e-05, |
|
"loss": 0.5554, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.3415841584158416, |
|
"grad_norm": 0.7332294643374914, |
|
"learning_rate": 5.3861386138613866e-05, |
|
"loss": 0.5269, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.3465346534653465, |
|
"grad_norm": 0.6513725822741345, |
|
"learning_rate": 5.465346534653466e-05, |
|
"loss": 0.4927, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.35148514851485146, |
|
"grad_norm": 0.7096869239144933, |
|
"learning_rate": 5.5445544554455445e-05, |
|
"loss": 0.5569, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.3564356435643564, |
|
"grad_norm": 0.7323268829393851, |
|
"learning_rate": 5.623762376237624e-05, |
|
"loss": 0.5339, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.3613861386138614, |
|
"grad_norm": 0.7885619745669775, |
|
"learning_rate": 5.702970297029703e-05, |
|
"loss": 0.5826, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.36633663366336633, |
|
"grad_norm": 0.6749931221787999, |
|
"learning_rate": 5.7821782178217826e-05, |
|
"loss": 0.5406, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.3712871287128713, |
|
"grad_norm": 0.648576213983254, |
|
"learning_rate": 5.8613861386138615e-05, |
|
"loss": 0.5524, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.37623762376237624, |
|
"grad_norm": 0.6598593089642592, |
|
"learning_rate": 5.940594059405941e-05, |
|
"loss": 0.5379, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.3811881188118812, |
|
"grad_norm": 0.6650261826397315, |
|
"learning_rate": 6.01980198019802e-05, |
|
"loss": 0.5585, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.38613861386138615, |
|
"grad_norm": 0.7596923180855565, |
|
"learning_rate": 6.0990099009900997e-05, |
|
"loss": 0.5641, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.3910891089108911, |
|
"grad_norm": 0.8173143487769239, |
|
"learning_rate": 6.178217821782179e-05, |
|
"loss": 0.5204, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.39603960396039606, |
|
"grad_norm": 0.6116291549339957, |
|
"learning_rate": 6.257425742574258e-05, |
|
"loss": 0.5199, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.400990099009901, |
|
"grad_norm": 1.0510502278437106, |
|
"learning_rate": 6.336633663366337e-05, |
|
"loss": 0.5561, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.40594059405940597, |
|
"grad_norm": 0.6263814677240027, |
|
"learning_rate": 6.415841584158417e-05, |
|
"loss": 0.5466, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.41089108910891087, |
|
"grad_norm": 0.781111899709593, |
|
"learning_rate": 6.495049504950496e-05, |
|
"loss": 0.5258, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.4158415841584158, |
|
"grad_norm": 0.7123210147687922, |
|
"learning_rate": 6.574257425742575e-05, |
|
"loss": 0.5298, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.4207920792079208, |
|
"grad_norm": 0.6399105090314179, |
|
"learning_rate": 6.653465346534654e-05, |
|
"loss": 0.5167, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.42574257425742573, |
|
"grad_norm": 0.5881868516455735, |
|
"learning_rate": 6.732673267326732e-05, |
|
"loss": 0.5229, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.4306930693069307, |
|
"grad_norm": 0.7851328238490848, |
|
"learning_rate": 6.811881188118812e-05, |
|
"loss": 0.5647, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.43564356435643564, |
|
"grad_norm": 0.6557279603019258, |
|
"learning_rate": 6.891089108910892e-05, |
|
"loss": 0.5172, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.4405940594059406, |
|
"grad_norm": 0.9157001843216616, |
|
"learning_rate": 6.970297029702971e-05, |
|
"loss": 0.5464, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.44554455445544555, |
|
"grad_norm": 0.623110108758698, |
|
"learning_rate": 7.04950495049505e-05, |
|
"loss": 0.5309, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4504950495049505, |
|
"grad_norm": 0.841500456640877, |
|
"learning_rate": 7.128712871287129e-05, |
|
"loss": 0.5638, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.45544554455445546, |
|
"grad_norm": 0.6925204556841734, |
|
"learning_rate": 7.207920792079209e-05, |
|
"loss": 0.5315, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.4603960396039604, |
|
"grad_norm": 0.711526620654385, |
|
"learning_rate": 7.287128712871288e-05, |
|
"loss": 0.5524, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.46534653465346537, |
|
"grad_norm": 0.5348898586960058, |
|
"learning_rate": 7.366336633663368e-05, |
|
"loss": 0.5404, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.47029702970297027, |
|
"grad_norm": 0.5821672966348954, |
|
"learning_rate": 7.445544554455446e-05, |
|
"loss": 0.5238, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.4752475247524752, |
|
"grad_norm": 0.5908396910544158, |
|
"learning_rate": 7.524752475247524e-05, |
|
"loss": 0.5346, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.4801980198019802, |
|
"grad_norm": 0.6146273285955356, |
|
"learning_rate": 7.603960396039604e-05, |
|
"loss": 0.5413, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.48514851485148514, |
|
"grad_norm": 0.6908121451411222, |
|
"learning_rate": 7.683168316831684e-05, |
|
"loss": 0.5497, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.4900990099009901, |
|
"grad_norm": 0.6453437707858504, |
|
"learning_rate": 7.762376237623763e-05, |
|
"loss": 0.5552, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.49504950495049505, |
|
"grad_norm": 0.573659554028698, |
|
"learning_rate": 7.841584158415841e-05, |
|
"loss": 0.5534, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5585111590390315, |
|
"learning_rate": 7.920792079207921e-05, |
|
"loss": 0.5153, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.504950495049505, |
|
"grad_norm": 0.581544572278104, |
|
"learning_rate": 8e-05, |
|
"loss": 0.5515, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.5099009900990099, |
|
"grad_norm": 0.5785865019861642, |
|
"learning_rate": 7.999976110803523e-05, |
|
"loss": 0.5468, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.5148514851485149, |
|
"grad_norm": 0.6439240386695959, |
|
"learning_rate": 7.99990444349944e-05, |
|
"loss": 0.548, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.5198019801980198, |
|
"grad_norm": 0.5927533216543163, |
|
"learning_rate": 7.999784998943787e-05, |
|
"loss": 0.5408, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.5247524752475248, |
|
"grad_norm": 0.732845460948312, |
|
"learning_rate": 7.999617778563281e-05, |
|
"loss": 0.542, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.5297029702970297, |
|
"grad_norm": 0.6609414974116621, |
|
"learning_rate": 7.999402784355303e-05, |
|
"loss": 0.5355, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.5346534653465347, |
|
"grad_norm": 0.6867640106151846, |
|
"learning_rate": 7.999140018887873e-05, |
|
"loss": 0.5316, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.5396039603960396, |
|
"grad_norm": 0.6243092054092657, |
|
"learning_rate": 7.998829485299617e-05, |
|
"loss": 0.5477, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.5445544554455446, |
|
"grad_norm": 0.6368046072823754, |
|
"learning_rate": 7.998471187299734e-05, |
|
"loss": 0.5462, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5495049504950495, |
|
"grad_norm": 0.5805646759685726, |
|
"learning_rate": 7.998065129167953e-05, |
|
"loss": 0.5604, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.5544554455445545, |
|
"grad_norm": 0.7048046209267858, |
|
"learning_rate": 7.997611315754472e-05, |
|
"loss": 0.5173, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.5594059405940595, |
|
"grad_norm": 0.5756024122810762, |
|
"learning_rate": 7.997109752479912e-05, |
|
"loss": 0.5383, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.5643564356435643, |
|
"grad_norm": 0.7065257004750126, |
|
"learning_rate": 7.996560445335241e-05, |
|
"loss": 0.5529, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.5693069306930693, |
|
"grad_norm": 0.6026162793901009, |
|
"learning_rate": 7.995963400881718e-05, |
|
"loss": 0.5811, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5742574257425742, |
|
"grad_norm": 0.7729487641935177, |
|
"learning_rate": 7.995318626250795e-05, |
|
"loss": 0.5472, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.5792079207920792, |
|
"grad_norm": 0.6627604460681927, |
|
"learning_rate": 7.994626129144047e-05, |
|
"loss": 0.5503, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.5841584158415841, |
|
"grad_norm": 0.6085367060729467, |
|
"learning_rate": 7.993885917833073e-05, |
|
"loss": 0.5374, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.5891089108910891, |
|
"grad_norm": 0.7573576072226863, |
|
"learning_rate": 7.9930980011594e-05, |
|
"loss": 0.5671, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.594059405940594, |
|
"grad_norm": 0.665798782251616, |
|
"learning_rate": 7.992262388534378e-05, |
|
"loss": 0.537, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.599009900990099, |
|
"grad_norm": 0.6447582523979625, |
|
"learning_rate": 7.991379089939062e-05, |
|
"loss": 0.5561, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.6039603960396039, |
|
"grad_norm": 0.5575875836074239, |
|
"learning_rate": 7.990448115924099e-05, |
|
"loss": 0.5592, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.6089108910891089, |
|
"grad_norm": 0.6509795039700941, |
|
"learning_rate": 7.989469477609601e-05, |
|
"loss": 0.5773, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.6138613861386139, |
|
"grad_norm": 0.559853876791952, |
|
"learning_rate": 7.988443186685007e-05, |
|
"loss": 0.53, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.6188118811881188, |
|
"grad_norm": 0.6644580599424751, |
|
"learning_rate": 7.987369255408953e-05, |
|
"loss": 0.5213, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.6237623762376238, |
|
"grad_norm": 0.4897883042416322, |
|
"learning_rate": 7.986247696609112e-05, |
|
"loss": 0.518, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.6287128712871287, |
|
"grad_norm": 0.6117080399483223, |
|
"learning_rate": 7.985078523682058e-05, |
|
"loss": 0.53, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.6336633663366337, |
|
"grad_norm": 0.7164160487652417, |
|
"learning_rate": 7.983861750593091e-05, |
|
"loss": 0.5658, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.6386138613861386, |
|
"grad_norm": 0.5380131972085676, |
|
"learning_rate": 7.982597391876076e-05, |
|
"loss": 0.5347, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.6435643564356436, |
|
"grad_norm": 0.6064956049519001, |
|
"learning_rate": 7.981285462633268e-05, |
|
"loss": 0.5705, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.6485148514851485, |
|
"grad_norm": 0.5824503230131746, |
|
"learning_rate": 7.979925978535137e-05, |
|
"loss": 0.5531, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.6534653465346535, |
|
"grad_norm": 0.6801101571631817, |
|
"learning_rate": 7.978518955820173e-05, |
|
"loss": 0.5209, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.6584158415841584, |
|
"grad_norm": 0.5343478838243892, |
|
"learning_rate": 7.977064411294698e-05, |
|
"loss": 0.5401, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.6633663366336634, |
|
"grad_norm": 0.5071209298396296, |
|
"learning_rate": 7.975562362332663e-05, |
|
"loss": 0.5593, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.6683168316831684, |
|
"grad_norm": 0.516283465511997, |
|
"learning_rate": 7.974012826875436e-05, |
|
"loss": 0.4954, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.6732673267326733, |
|
"grad_norm": 0.5655188759595408, |
|
"learning_rate": 7.972415823431599e-05, |
|
"loss": 0.5538, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.6782178217821783, |
|
"grad_norm": 0.6549665904978103, |
|
"learning_rate": 7.970771371076715e-05, |
|
"loss": 0.5451, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.6831683168316832, |
|
"grad_norm": 0.46554827804728877, |
|
"learning_rate": 7.969079489453107e-05, |
|
"loss": 0.5309, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.6881188118811881, |
|
"grad_norm": 0.5540469253338054, |
|
"learning_rate": 7.96734019876962e-05, |
|
"loss": 0.537, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.693069306930693, |
|
"grad_norm": 0.5218858858543225, |
|
"learning_rate": 7.965553519801385e-05, |
|
"loss": 0.5064, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.698019801980198, |
|
"grad_norm": 0.5966172911852246, |
|
"learning_rate": 7.963719473889562e-05, |
|
"loss": 0.5241, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.7029702970297029, |
|
"grad_norm": 0.5341923603639215, |
|
"learning_rate": 7.961838082941094e-05, |
|
"loss": 0.5499, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.7079207920792079, |
|
"grad_norm": 0.5767031106953009, |
|
"learning_rate": 7.959909369428441e-05, |
|
"loss": 0.5624, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.7128712871287128, |
|
"grad_norm": 0.5458407996686117, |
|
"learning_rate": 7.957933356389306e-05, |
|
"loss": 0.5397, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.7178217821782178, |
|
"grad_norm": 0.5028390484866985, |
|
"learning_rate": 7.955910067426377e-05, |
|
"loss": 0.5, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.7227722772277227, |
|
"grad_norm": 0.5488835039202323, |
|
"learning_rate": 7.953839526707025e-05, |
|
"loss": 0.5259, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.7277227722772277, |
|
"grad_norm": 0.5494910122993174, |
|
"learning_rate": 7.951721758963028e-05, |
|
"loss": 0.5549, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.7326732673267327, |
|
"grad_norm": 0.8378055939091611, |
|
"learning_rate": 7.949556789490269e-05, |
|
"loss": 0.5371, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.7376237623762376, |
|
"grad_norm": 0.4873797556981422, |
|
"learning_rate": 7.94734464414844e-05, |
|
"loss": 0.5204, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.7425742574257426, |
|
"grad_norm": 0.4388553826856479, |
|
"learning_rate": 7.945085349360728e-05, |
|
"loss": 0.5087, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.7475247524752475, |
|
"grad_norm": 0.5626439990620604, |
|
"learning_rate": 7.942778932113501e-05, |
|
"loss": 0.5139, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.7524752475247525, |
|
"grad_norm": 0.4858172017271616, |
|
"learning_rate": 7.940425419955988e-05, |
|
"loss": 0.5931, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.7574257425742574, |
|
"grad_norm": 0.49689698880553657, |
|
"learning_rate": 7.938024840999944e-05, |
|
"loss": 0.4846, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.7623762376237624, |
|
"grad_norm": 0.4735043212865626, |
|
"learning_rate": 7.935577223919322e-05, |
|
"loss": 0.532, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.7673267326732673, |
|
"grad_norm": 0.4823087172850099, |
|
"learning_rate": 7.933082597949925e-05, |
|
"loss": 0.5474, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.7722772277227723, |
|
"grad_norm": 0.48575109641325953, |
|
"learning_rate": 7.930540992889056e-05, |
|
"loss": 0.5352, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.7772277227722773, |
|
"grad_norm": 0.45696382635128335, |
|
"learning_rate": 7.927952439095167e-05, |
|
"loss": 0.5574, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.7821782178217822, |
|
"grad_norm": 0.552745262863526, |
|
"learning_rate": 7.925316967487493e-05, |
|
"loss": 0.5778, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.7871287128712872, |
|
"grad_norm": 0.5833245097810845, |
|
"learning_rate": 7.922634609545685e-05, |
|
"loss": 0.5551, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.7920792079207921, |
|
"grad_norm": 0.5844763601599386, |
|
"learning_rate": 7.919905397309429e-05, |
|
"loss": 0.5079, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7970297029702971, |
|
"grad_norm": 0.6049939616394847, |
|
"learning_rate": 7.917129363378069e-05, |
|
"loss": 0.5453, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.801980198019802, |
|
"grad_norm": 0.5257417214567123, |
|
"learning_rate": 7.914306540910216e-05, |
|
"loss": 0.5367, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.806930693069307, |
|
"grad_norm": 0.559359106257865, |
|
"learning_rate": 7.91143696362335e-05, |
|
"loss": 0.5275, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.8118811881188119, |
|
"grad_norm": 0.5429558850215297, |
|
"learning_rate": 7.908520665793419e-05, |
|
"loss": 0.5386, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.8168316831683168, |
|
"grad_norm": 0.44832679740257586, |
|
"learning_rate": 7.905557682254429e-05, |
|
"loss": 0.4974, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.8217821782178217, |
|
"grad_norm": 0.5580345366264446, |
|
"learning_rate": 7.902548048398028e-05, |
|
"loss": 0.5619, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.8267326732673267, |
|
"grad_norm": 0.5546502635861509, |
|
"learning_rate": 7.89949180017308e-05, |
|
"loss": 0.5476, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.8316831683168316, |
|
"grad_norm": 0.6024862192365625, |
|
"learning_rate": 7.896388974085246e-05, |
|
"loss": 0.5169, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.8366336633663366, |
|
"grad_norm": 0.5722278989041788, |
|
"learning_rate": 7.893239607196537e-05, |
|
"loss": 0.5199, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.8415841584158416, |
|
"grad_norm": 0.6539747541763438, |
|
"learning_rate": 7.890043737124872e-05, |
|
"loss": 0.5129, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.8465346534653465, |
|
"grad_norm": 0.46081078789593266, |
|
"learning_rate": 7.886801402043639e-05, |
|
"loss": 0.524, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.8514851485148515, |
|
"grad_norm": 0.686485354309828, |
|
"learning_rate": 7.883512640681226e-05, |
|
"loss": 0.5066, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.8564356435643564, |
|
"grad_norm": 0.5444571843193269, |
|
"learning_rate": 7.880177492320565e-05, |
|
"loss": 0.4786, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.8613861386138614, |
|
"grad_norm": 0.5012758064453766, |
|
"learning_rate": 7.876795996798665e-05, |
|
"loss": 0.5324, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.8663366336633663, |
|
"grad_norm": 0.5757321243521072, |
|
"learning_rate": 7.873368194506131e-05, |
|
"loss": 0.5004, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.8712871287128713, |
|
"grad_norm": 0.45816342076746885, |
|
"learning_rate": 7.869894126386684e-05, |
|
"loss": 0.53, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.8762376237623762, |
|
"grad_norm": 0.6096807187009159, |
|
"learning_rate": 7.866373833936673e-05, |
|
"loss": 0.5656, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.8811881188118812, |
|
"grad_norm": 0.5343072779664616, |
|
"learning_rate": 7.862807359204574e-05, |
|
"loss": 0.5194, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.8861386138613861, |
|
"grad_norm": 0.545896110746946, |
|
"learning_rate": 7.859194744790498e-05, |
|
"loss": 0.5209, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.8910891089108911, |
|
"grad_norm": 0.636163192425731, |
|
"learning_rate": 7.855536033845673e-05, |
|
"loss": 0.5522, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8960396039603961, |
|
"grad_norm": 0.5038436060871251, |
|
"learning_rate": 7.851831270071929e-05, |
|
"loss": 0.5448, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.900990099009901, |
|
"grad_norm": 0.6024576745455106, |
|
"learning_rate": 7.848080497721181e-05, |
|
"loss": 0.4903, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.905940594059406, |
|
"grad_norm": 0.6076356159222136, |
|
"learning_rate": 7.844283761594899e-05, |
|
"loss": 0.5739, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.9108910891089109, |
|
"grad_norm": 0.6349903742004732, |
|
"learning_rate": 7.84044110704357e-05, |
|
"loss": 0.5619, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.9158415841584159, |
|
"grad_norm": 0.5558020572859232, |
|
"learning_rate": 7.83655257996616e-05, |
|
"loss": 0.5208, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.9207920792079208, |
|
"grad_norm": 0.43756054193606436, |
|
"learning_rate": 7.83261822680956e-05, |
|
"loss": 0.5377, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.9257425742574258, |
|
"grad_norm": 0.4905324957197573, |
|
"learning_rate": 7.828638094568041e-05, |
|
"loss": 0.5466, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.9306930693069307, |
|
"grad_norm": 0.4780144225156857, |
|
"learning_rate": 7.824612230782681e-05, |
|
"loss": 0.546, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.9356435643564357, |
|
"grad_norm": 0.538508122143981, |
|
"learning_rate": 7.820540683540808e-05, |
|
"loss": 0.5027, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.9405940594059405, |
|
"grad_norm": 0.49556896212553525, |
|
"learning_rate": 7.816423501475415e-05, |
|
"loss": 0.5166, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.9455445544554455, |
|
"grad_norm": 0.4279890375269027, |
|
"learning_rate": 7.812260733764591e-05, |
|
"loss": 0.5356, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.9504950495049505, |
|
"grad_norm": 0.4943909611747402, |
|
"learning_rate": 7.80805243013092e-05, |
|
"loss": 0.5294, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.9554455445544554, |
|
"grad_norm": 0.4663007752860832, |
|
"learning_rate": 7.803798640840901e-05, |
|
"loss": 0.5154, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.9603960396039604, |
|
"grad_norm": 0.3908137583747954, |
|
"learning_rate": 7.799499416704338e-05, |
|
"loss": 0.4997, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.9653465346534653, |
|
"grad_norm": 0.5228603752159334, |
|
"learning_rate": 7.795154809073735e-05, |
|
"loss": 0.5262, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.9702970297029703, |
|
"grad_norm": 0.42838683826087115, |
|
"learning_rate": 7.790764869843684e-05, |
|
"loss": 0.4861, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.9752475247524752, |
|
"grad_norm": 0.4055948196662326, |
|
"learning_rate": 7.786329651450248e-05, |
|
"loss": 0.4859, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.9801980198019802, |
|
"grad_norm": 0.5966678290575969, |
|
"learning_rate": 7.781849206870325e-05, |
|
"loss": 0.5226, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.9851485148514851, |
|
"grad_norm": 1.880711225126043, |
|
"learning_rate": 7.77732358962103e-05, |
|
"loss": 0.5851, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.9900990099009901, |
|
"grad_norm": 0.6614925497720111, |
|
"learning_rate": 7.772752853759039e-05, |
|
"loss": 0.5411, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.995049504950495, |
|
"grad_norm": 0.46056442318357677, |
|
"learning_rate": 7.768137053879957e-05, |
|
"loss": 0.5579, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5087524233313512, |
|
"learning_rate": 7.763476245117659e-05, |
|
"loss": 0.5128, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.004950495049505, |
|
"grad_norm": 0.6518551254154236, |
|
"learning_rate": 7.758770483143634e-05, |
|
"loss": 0.4381, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.00990099009901, |
|
"grad_norm": 0.4785460623442812, |
|
"learning_rate": 7.754019824166318e-05, |
|
"loss": 0.4398, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.0148514851485149, |
|
"grad_norm": 0.6034904750730884, |
|
"learning_rate": 7.749224324930421e-05, |
|
"loss": 0.4319, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.0198019801980198, |
|
"grad_norm": 0.6008863564030689, |
|
"learning_rate": 7.744384042716258e-05, |
|
"loss": 0.437, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.0247524752475248, |
|
"grad_norm": 1.030655464186979, |
|
"learning_rate": 7.739499035339055e-05, |
|
"loss": 0.4395, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.0297029702970297, |
|
"grad_norm": 0.7109993890828257, |
|
"learning_rate": 7.734569361148262e-05, |
|
"loss": 0.3941, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.0346534653465347, |
|
"grad_norm": 0.6473203544118956, |
|
"learning_rate": 7.729595079026856e-05, |
|
"loss": 0.3682, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.0396039603960396, |
|
"grad_norm": 0.5274771133610835, |
|
"learning_rate": 7.724576248390639e-05, |
|
"loss": 0.4189, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.0445544554455446, |
|
"grad_norm": 0.575369977811731, |
|
"learning_rate": 7.719512929187527e-05, |
|
"loss": 0.3941, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.0495049504950495, |
|
"grad_norm": 0.5032659995015568, |
|
"learning_rate": 7.714405181896831e-05, |
|
"loss": 0.4178, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.0544554455445545, |
|
"grad_norm": 0.542411447489868, |
|
"learning_rate": 7.709253067528545e-05, |
|
"loss": 0.4335, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.0594059405940595, |
|
"grad_norm": 0.4825982906183251, |
|
"learning_rate": 7.704056647622603e-05, |
|
"loss": 0.4404, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.0643564356435644, |
|
"grad_norm": 0.8410004042826386, |
|
"learning_rate": 7.698815984248152e-05, |
|
"loss": 0.4403, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.0693069306930694, |
|
"grad_norm": 0.4351215223126758, |
|
"learning_rate": 7.693531140002811e-05, |
|
"loss": 0.3886, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.0742574257425743, |
|
"grad_norm": 0.4904375867707524, |
|
"learning_rate": 7.688202178011921e-05, |
|
"loss": 0.447, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.0792079207920793, |
|
"grad_norm": 0.5298422562122228, |
|
"learning_rate": 7.682829161927794e-05, |
|
"loss": 0.4621, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.0841584158415842, |
|
"grad_norm": 0.48790093011894936, |
|
"learning_rate": 7.677412155928946e-05, |
|
"loss": 0.4223, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.0891089108910892, |
|
"grad_norm": 0.5015700826758351, |
|
"learning_rate": 7.671951224719339e-05, |
|
"loss": 0.4163, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.0940594059405941, |
|
"grad_norm": 0.9272870286113828, |
|
"learning_rate": 7.666446433527601e-05, |
|
"loss": 0.4401, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.099009900990099, |
|
"grad_norm": 0.4499978436086613, |
|
"learning_rate": 7.660897848106251e-05, |
|
"loss": 0.3956, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.103960396039604, |
|
"grad_norm": 0.6270774879370736, |
|
"learning_rate": 7.655305534730916e-05, |
|
"loss": 0.4191, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.108910891089109, |
|
"grad_norm": 0.47140106769279483, |
|
"learning_rate": 7.649669560199528e-05, |
|
"loss": 0.4275, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.113861386138614, |
|
"grad_norm": 0.4632505115641616, |
|
"learning_rate": 7.643989991831541e-05, |
|
"loss": 0.4198, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.118811881188119, |
|
"grad_norm": 0.5386755164319672, |
|
"learning_rate": 7.638266897467117e-05, |
|
"loss": 0.4001, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.1237623762376239, |
|
"grad_norm": 0.7288819939926411, |
|
"learning_rate": 7.632500345466318e-05, |
|
"loss": 0.4044, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.1287128712871288, |
|
"grad_norm": 0.6778386664812662, |
|
"learning_rate": 7.62669040470829e-05, |
|
"loss": 0.3975, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.1336633663366338, |
|
"grad_norm": 0.5286983261610211, |
|
"learning_rate": 7.620837144590444e-05, |
|
"loss": 0.4159, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.1386138613861387, |
|
"grad_norm": 2.652372059772227, |
|
"learning_rate": 7.61494063502762e-05, |
|
"loss": 0.4325, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.1435643564356435, |
|
"grad_norm": 4.809909490200282, |
|
"learning_rate": 7.609000946451255e-05, |
|
"loss": 0.4619, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.1485148514851484, |
|
"grad_norm": 0.8647324013296295, |
|
"learning_rate": 7.603018149808542e-05, |
|
"loss": 0.407, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.1534653465346534, |
|
"grad_norm": 0.5195039166414599, |
|
"learning_rate": 7.596992316561583e-05, |
|
"loss": 0.4496, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.1584158415841583, |
|
"grad_norm": 0.8504141877412958, |
|
"learning_rate": 7.590923518686537e-05, |
|
"loss": 0.4621, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.1633663366336633, |
|
"grad_norm": 0.7696908561259985, |
|
"learning_rate": 7.584811828672755e-05, |
|
"loss": 0.4744, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.1683168316831682, |
|
"grad_norm": 0.9598562979473423, |
|
"learning_rate": 7.578657319521918e-05, |
|
"loss": 0.4069, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.1732673267326732, |
|
"grad_norm": 0.5710619856339312, |
|
"learning_rate": 7.572460064747167e-05, |
|
"loss": 0.403, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.1782178217821782, |
|
"grad_norm": 0.4005092217182952, |
|
"learning_rate": 7.56622013837222e-05, |
|
"loss": 0.3622, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.183168316831683, |
|
"grad_norm": 0.48090434654331593, |
|
"learning_rate": 7.55993761493049e-05, |
|
"loss": 0.4021, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.188118811881188, |
|
"grad_norm": 0.5703687588411889, |
|
"learning_rate": 7.553612569464197e-05, |
|
"loss": 0.4375, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.193069306930693, |
|
"grad_norm": 0.681277315988626, |
|
"learning_rate": 7.547245077523466e-05, |
|
"loss": 0.4241, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.198019801980198, |
|
"grad_norm": 1.0094469420963084, |
|
"learning_rate": 7.540835215165431e-05, |
|
"loss": 0.4057, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.202970297029703, |
|
"grad_norm": 0.46952644922814796, |
|
"learning_rate": 7.534383058953321e-05, |
|
"loss": 0.4154, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.2079207920792079, |
|
"grad_norm": 0.459477309180552, |
|
"learning_rate": 7.527888685955551e-05, |
|
"loss": 0.3915, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.2128712871287128, |
|
"grad_norm": 0.5196788210280033, |
|
"learning_rate": 7.5213521737448e-05, |
|
"loss": 0.4354, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.2178217821782178, |
|
"grad_norm": 0.44785808199518773, |
|
"learning_rate": 7.514773600397076e-05, |
|
"loss": 0.3803, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.2227722772277227, |
|
"grad_norm": 0.4475790218783314, |
|
"learning_rate": 7.508153044490796e-05, |
|
"loss": 0.3788, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.2277227722772277, |
|
"grad_norm": 0.42812642794099415, |
|
"learning_rate": 7.50149058510584e-05, |
|
"loss": 0.4033, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.2326732673267327, |
|
"grad_norm": 0.46772116937732844, |
|
"learning_rate": 7.494786301822611e-05, |
|
"loss": 0.4613, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.2376237623762376, |
|
"grad_norm": 0.42095730344467575, |
|
"learning_rate": 7.488040274721077e-05, |
|
"loss": 0.4129, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.2425742574257426, |
|
"grad_norm": 0.5026705098166492, |
|
"learning_rate": 7.481252584379822e-05, |
|
"loss": 0.4333, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.2475247524752475, |
|
"grad_norm": 0.5379197167928285, |
|
"learning_rate": 7.47442331187508e-05, |
|
"loss": 0.4521, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.2524752475247525, |
|
"grad_norm": 0.4761452900609588, |
|
"learning_rate": 7.467552538779768e-05, |
|
"loss": 0.3855, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.2574257425742574, |
|
"grad_norm": 0.42059836224797353, |
|
"learning_rate": 7.460640347162508e-05, |
|
"loss": 0.4074, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.2623762376237624, |
|
"grad_norm": 0.4919545295388046, |
|
"learning_rate": 7.453686819586655e-05, |
|
"loss": 0.4517, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.2673267326732673, |
|
"grad_norm": 0.5104604218051465, |
|
"learning_rate": 7.4466920391093e-05, |
|
"loss": 0.4044, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.2722772277227723, |
|
"grad_norm": 0.3973603962169888, |
|
"learning_rate": 7.439656089280286e-05, |
|
"loss": 0.3884, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.2772277227722773, |
|
"grad_norm": 0.616310935251882, |
|
"learning_rate": 7.432579054141208e-05, |
|
"loss": 0.4877, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.2821782178217822, |
|
"grad_norm": 0.4153672619602748, |
|
"learning_rate": 7.425461018224406e-05, |
|
"loss": 0.4104, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.2871287128712872, |
|
"grad_norm": 0.39941726347863327, |
|
"learning_rate": 7.418302066551959e-05, |
|
"loss": 0.4158, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.2920792079207921, |
|
"grad_norm": 0.48616898482849413, |
|
"learning_rate": 7.411102284634672e-05, |
|
"loss": 0.4134, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.297029702970297, |
|
"grad_norm": 0.5691458690856808, |
|
"learning_rate": 7.403861758471043e-05, |
|
"loss": 0.437, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.301980198019802, |
|
"grad_norm": 0.4525002043690617, |
|
"learning_rate": 7.396580574546251e-05, |
|
"loss": 0.4251, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.306930693069307, |
|
"grad_norm": 0.41152996338869813, |
|
"learning_rate": 7.38925881983111e-05, |
|
"loss": 0.4301, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.311881188118812, |
|
"grad_norm": 0.5199728322754111, |
|
"learning_rate": 7.381896581781042e-05, |
|
"loss": 0.4614, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.316831683168317, |
|
"grad_norm": 0.5732853732578159, |
|
"learning_rate": 7.37449394833502e-05, |
|
"loss": 0.4662, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.3217821782178218, |
|
"grad_norm": 0.386991910628394, |
|
"learning_rate": 7.367051007914527e-05, |
|
"loss": 0.4306, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.3267326732673268, |
|
"grad_norm": 0.3939310664362404, |
|
"learning_rate": 7.359567849422496e-05, |
|
"loss": 0.4192, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.3316831683168318, |
|
"grad_norm": 0.4972811754620147, |
|
"learning_rate": 7.352044562242248e-05, |
|
"loss": 0.4362, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.3366336633663367, |
|
"grad_norm": 0.3929729126966092, |
|
"learning_rate": 7.344481236236428e-05, |
|
"loss": 0.3945, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.3415841584158417, |
|
"grad_norm": 0.44752443075284315, |
|
"learning_rate": 7.336877961745926e-05, |
|
"loss": 0.3867, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.3465346534653464, |
|
"grad_norm": 0.5877124955257983, |
|
"learning_rate": 7.329234829588798e-05, |
|
"loss": 0.44, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.3514851485148514, |
|
"grad_norm": 0.40997637387949215, |
|
"learning_rate": 7.321551931059191e-05, |
|
"loss": 0.3722, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.3564356435643563, |
|
"grad_norm": 0.5657538530990273, |
|
"learning_rate": 7.313829357926238e-05, |
|
"loss": 0.4267, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.3613861386138613, |
|
"grad_norm": 0.3756317827004364, |
|
"learning_rate": 7.306067202432976e-05, |
|
"loss": 0.3908, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.3663366336633662, |
|
"grad_norm": 0.5546385290550994, |
|
"learning_rate": 7.29826555729523e-05, |
|
"loss": 0.4374, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.3712871287128712, |
|
"grad_norm": 0.4499102203305975, |
|
"learning_rate": 7.290424515700519e-05, |
|
"loss": 0.4315, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.3762376237623761, |
|
"grad_norm": 0.4163376224541207, |
|
"learning_rate": 7.282544171306933e-05, |
|
"loss": 0.384, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.381188118811881, |
|
"grad_norm": 0.495665339007046, |
|
"learning_rate": 7.274624618242022e-05, |
|
"loss": 0.4333, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.386138613861386, |
|
"grad_norm": 1.2421659532566276, |
|
"learning_rate": 7.266665951101664e-05, |
|
"loss": 0.3935, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.391089108910891, |
|
"grad_norm": 0.45672463011046993, |
|
"learning_rate": 7.258668264948941e-05, |
|
"loss": 0.3962, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.396039603960396, |
|
"grad_norm": 0.5235819000103409, |
|
"learning_rate": 7.250631655313001e-05, |
|
"loss": 0.4424, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.400990099009901, |
|
"grad_norm": 0.6014413984892761, |
|
"learning_rate": 7.242556218187919e-05, |
|
"loss": 0.4075, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.4059405940594059, |
|
"grad_norm": 0.5122752409469894, |
|
"learning_rate": 7.234442050031543e-05, |
|
"loss": 0.4478, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.4108910891089108, |
|
"grad_norm": 0.707055920708386, |
|
"learning_rate": 7.226289247764354e-05, |
|
"loss": 0.4558, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.4158415841584158, |
|
"grad_norm": 0.4096719374369129, |
|
"learning_rate": 7.2180979087683e-05, |
|
"loss": 0.3839, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.4207920792079207, |
|
"grad_norm": 0.4493556023975073, |
|
"learning_rate": 7.209868130885634e-05, |
|
"loss": 0.3971, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.4257425742574257, |
|
"grad_norm": 0.40448122098989603, |
|
"learning_rate": 7.201600012417745e-05, |
|
"loss": 0.405, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.4306930693069306, |
|
"grad_norm": 0.559370368504956, |
|
"learning_rate": 7.193293652123989e-05, |
|
"loss": 0.5013, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.4356435643564356, |
|
"grad_norm": 0.48070543113884184, |
|
"learning_rate": 7.1849491492205e-05, |
|
"loss": 0.4181, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.4405940594059405, |
|
"grad_norm": 0.4710653868134633, |
|
"learning_rate": 7.176566603379015e-05, |
|
"loss": 0.4373, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.4455445544554455, |
|
"grad_norm": 0.39331471032696247, |
|
"learning_rate": 7.168146114725673e-05, |
|
"loss": 0.4568, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.4504950495049505, |
|
"grad_norm": 0.4248344484638374, |
|
"learning_rate": 7.159687783839832e-05, |
|
"loss": 0.4448, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.4554455445544554, |
|
"grad_norm": 0.43034402237228425, |
|
"learning_rate": 7.151191711752854e-05, |
|
"loss": 0.4144, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.4603960396039604, |
|
"grad_norm": 0.40716737449552437, |
|
"learning_rate": 7.142657999946906e-05, |
|
"loss": 0.4052, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.4653465346534653, |
|
"grad_norm": 0.4458812208157574, |
|
"learning_rate": 7.134086750353747e-05, |
|
"loss": 0.4181, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.4702970297029703, |
|
"grad_norm": 0.5814322067144533, |
|
"learning_rate": 7.125478065353512e-05, |
|
"loss": 0.4153, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.4752475247524752, |
|
"grad_norm": 0.3957086261289198, |
|
"learning_rate": 7.116832047773484e-05, |
|
"loss": 0.4201, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.4801980198019802, |
|
"grad_norm": 0.4666211020909172, |
|
"learning_rate": 7.108148800886869e-05, |
|
"loss": 0.4236, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.4851485148514851, |
|
"grad_norm": 0.4369532098868193, |
|
"learning_rate": 7.09942842841156e-05, |
|
"loss": 0.425, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.49009900990099, |
|
"grad_norm": 0.4127291956437661, |
|
"learning_rate": 7.090671034508905e-05, |
|
"loss": 0.383, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.495049504950495, |
|
"grad_norm": 0.38608337390531794, |
|
"learning_rate": 7.081876723782457e-05, |
|
"loss": 0.3782, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.3999354505178847, |
|
"learning_rate": 7.073045601276723e-05, |
|
"loss": 0.4006, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.504950495049505, |
|
"grad_norm": 0.4460616369287887, |
|
"learning_rate": 7.064177772475912e-05, |
|
"loss": 0.4248, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.50990099009901, |
|
"grad_norm": 1.617368168957565, |
|
"learning_rate": 7.05527334330268e-05, |
|
"loss": 0.4058, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.5148514851485149, |
|
"grad_norm": 0.4205841237338601, |
|
"learning_rate": 7.046332420116852e-05, |
|
"loss": 0.4019, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.5198019801980198, |
|
"grad_norm": 0.4590401658758068, |
|
"learning_rate": 7.037355109714165e-05, |
|
"loss": 0.4375, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.5247524752475248, |
|
"grad_norm": 0.369456284577221, |
|
"learning_rate": 7.028341519324985e-05, |
|
"loss": 0.3981, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.5297029702970297, |
|
"grad_norm": 0.45818633341019355, |
|
"learning_rate": 7.019291756613029e-05, |
|
"loss": 0.4329, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.5346534653465347, |
|
"grad_norm": 0.41006878674759506, |
|
"learning_rate": 7.010205929674075e-05, |
|
"loss": 0.4358, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.5396039603960396, |
|
"grad_norm": 0.3660852415579547, |
|
"learning_rate": 7.001084147034676e-05, |
|
"loss": 0.3925, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.5445544554455446, |
|
"grad_norm": 0.49311989334696543, |
|
"learning_rate": 6.99192651765086e-05, |
|
"loss": 0.4045, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.5495049504950495, |
|
"grad_norm": 0.4224653494426318, |
|
"learning_rate": 6.982733150906833e-05, |
|
"loss": 0.4549, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.5544554455445545, |
|
"grad_norm": 0.441676447696254, |
|
"learning_rate": 6.973504156613666e-05, |
|
"loss": 0.4186, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.5594059405940595, |
|
"grad_norm": 0.39314186157633607, |
|
"learning_rate": 6.964239645007989e-05, |
|
"loss": 0.3917, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.5643564356435644, |
|
"grad_norm": 0.36195381366820006, |
|
"learning_rate": 6.954939726750667e-05, |
|
"loss": 0.3886, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.5693069306930694, |
|
"grad_norm": 0.38026650238359444, |
|
"learning_rate": 6.945604512925493e-05, |
|
"loss": 0.3953, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.5742574257425743, |
|
"grad_norm": 0.4727277048957753, |
|
"learning_rate": 6.936234115037842e-05, |
|
"loss": 0.4081, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.5792079207920793, |
|
"grad_norm": 0.4122641931550936, |
|
"learning_rate": 6.926828645013353e-05, |
|
"loss": 0.4096, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.5841584158415842, |
|
"grad_norm": 0.41972741676629705, |
|
"learning_rate": 6.917388215196585e-05, |
|
"loss": 0.4325, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.5891089108910892, |
|
"grad_norm": 0.37213910332961514, |
|
"learning_rate": 6.907912938349682e-05, |
|
"loss": 0.408, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.5940594059405941, |
|
"grad_norm": 0.4079206922373049, |
|
"learning_rate": 6.898402927651019e-05, |
|
"loss": 0.3932, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.599009900990099, |
|
"grad_norm": 0.5391239274082024, |
|
"learning_rate": 6.88885829669385e-05, |
|
"loss": 0.4413, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.603960396039604, |
|
"grad_norm": 0.3304796112354852, |
|
"learning_rate": 6.879279159484961e-05, |
|
"loss": 0.3793, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.608910891089109, |
|
"grad_norm": 0.4565949481665514, |
|
"learning_rate": 6.869665630443295e-05, |
|
"loss": 0.4088, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.613861386138614, |
|
"grad_norm": 0.43115710841031746, |
|
"learning_rate": 6.860017824398595e-05, |
|
"loss": 0.4024, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.618811881188119, |
|
"grad_norm": 0.4473410186927358, |
|
"learning_rate": 6.85033585659003e-05, |
|
"loss": 0.4, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.6237623762376239, |
|
"grad_norm": 0.34319850357607024, |
|
"learning_rate": 6.84061984266481e-05, |
|
"loss": 0.3749, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.6287128712871288, |
|
"grad_norm": 0.3991717496121638, |
|
"learning_rate": 6.830869898676822e-05, |
|
"loss": 0.4249, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.6336633663366338, |
|
"grad_norm": 0.3669421848461611, |
|
"learning_rate": 6.82108614108523e-05, |
|
"loss": 0.3747, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.6386138613861387, |
|
"grad_norm": 0.40366481969746204, |
|
"learning_rate": 6.811268686753086e-05, |
|
"loss": 0.4194, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.6435643564356437, |
|
"grad_norm": 0.33856718283042686, |
|
"learning_rate": 6.801417652945939e-05, |
|
"loss": 0.4048, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.6485148514851486, |
|
"grad_norm": 0.38286010896252787, |
|
"learning_rate": 6.79153315733043e-05, |
|
"loss": 0.4078, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.6534653465346536, |
|
"grad_norm": 0.41697274751788355, |
|
"learning_rate": 6.781615317972886e-05, |
|
"loss": 0.4177, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.6584158415841586, |
|
"grad_norm": 0.5184797797560337, |
|
"learning_rate": 6.771664253337916e-05, |
|
"loss": 0.4306, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.6633663366336635, |
|
"grad_norm": 0.38783387993392643, |
|
"learning_rate": 6.761680082286988e-05, |
|
"loss": 0.394, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.6683168316831685, |
|
"grad_norm": 0.3428733325169249, |
|
"learning_rate": 6.751662924077015e-05, |
|
"loss": 0.3672, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.6732673267326734, |
|
"grad_norm": 0.38619372406045765, |
|
"learning_rate": 6.741612898358924e-05, |
|
"loss": 0.4151, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.6782178217821784, |
|
"grad_norm": 0.5825910251609712, |
|
"learning_rate": 6.731530125176237e-05, |
|
"loss": 0.3999, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.6831683168316833, |
|
"grad_norm": 0.38484297742645873, |
|
"learning_rate": 6.721414724963631e-05, |
|
"loss": 0.4128, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.688118811881188, |
|
"grad_norm": 0.3374983247266092, |
|
"learning_rate": 6.711266818545494e-05, |
|
"loss": 0.4031, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.693069306930693, |
|
"grad_norm": 0.3826217100514408, |
|
"learning_rate": 6.701086527134491e-05, |
|
"loss": 0.4009, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.698019801980198, |
|
"grad_norm": 0.4591363006682774, |
|
"learning_rate": 6.690873972330116e-05, |
|
"loss": 0.4228, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.702970297029703, |
|
"grad_norm": 0.40260910474197376, |
|
"learning_rate": 6.68062927611723e-05, |
|
"loss": 0.3947, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.7079207920792079, |
|
"grad_norm": 0.4397536768863476, |
|
"learning_rate": 6.670352560864615e-05, |
|
"loss": 0.3908, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.7128712871287128, |
|
"grad_norm": 0.43329899477194356, |
|
"learning_rate": 6.660043949323505e-05, |
|
"loss": 0.4372, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.7178217821782178, |
|
"grad_norm": 0.4602501332408058, |
|
"learning_rate": 6.649703564626125e-05, |
|
"loss": 0.3841, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.7227722772277227, |
|
"grad_norm": 0.35344976361552355, |
|
"learning_rate": 6.639331530284214e-05, |
|
"loss": 0.3727, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.7277227722772277, |
|
"grad_norm": 0.45197956712151793, |
|
"learning_rate": 6.628927970187557e-05, |
|
"loss": 0.3986, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.7326732673267327, |
|
"grad_norm": 0.3319696461521242, |
|
"learning_rate": 6.618493008602496e-05, |
|
"loss": 0.416, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.7376237623762376, |
|
"grad_norm": 0.4240346650785551, |
|
"learning_rate": 6.608026770170459e-05, |
|
"loss": 0.4162, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.7425742574257426, |
|
"grad_norm": 0.37060686199685133, |
|
"learning_rate": 6.597529379906455e-05, |
|
"loss": 0.4134, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.7475247524752475, |
|
"grad_norm": 0.36029614953276784, |
|
"learning_rate": 6.587000963197598e-05, |
|
"loss": 0.4135, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.7524752475247525, |
|
"grad_norm": 0.3512525066622723, |
|
"learning_rate": 6.576441645801592e-05, |
|
"loss": 0.3958, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.7574257425742574, |
|
"grad_norm": 0.426444933184236, |
|
"learning_rate": 6.565851553845242e-05, |
|
"loss": 0.4306, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.7623762376237624, |
|
"grad_norm": 0.45810053308481247, |
|
"learning_rate": 6.555230813822942e-05, |
|
"loss": 0.4515, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.7673267326732673, |
|
"grad_norm": 0.3257359743366414, |
|
"learning_rate": 6.544579552595165e-05, |
|
"loss": 0.3882, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.7722772277227723, |
|
"grad_norm": 0.5120549397179133, |
|
"learning_rate": 6.533897897386946e-05, |
|
"loss": 0.3977, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.7772277227722773, |
|
"grad_norm": 0.3741740274690065, |
|
"learning_rate": 6.523185975786366e-05, |
|
"loss": 0.389, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.7821782178217822, |
|
"grad_norm": 0.42787211118613316, |
|
"learning_rate": 6.512443915743024e-05, |
|
"loss": 0.4786, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.7871287128712872, |
|
"grad_norm": 0.42024261733771173, |
|
"learning_rate": 6.501671845566512e-05, |
|
"loss": 0.4744, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.7920792079207921, |
|
"grad_norm": 0.33419863088736074, |
|
"learning_rate": 6.49086989392488e-05, |
|
"loss": 0.4185, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.797029702970297, |
|
"grad_norm": 0.35002307084669404, |
|
"learning_rate": 6.480038189843101e-05, |
|
"loss": 0.4146, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.801980198019802, |
|
"grad_norm": 0.35584559302997615, |
|
"learning_rate": 6.469176862701529e-05, |
|
"loss": 0.3884, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.806930693069307, |
|
"grad_norm": 0.3811447477456138, |
|
"learning_rate": 6.458286042234352e-05, |
|
"loss": 0.416, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.811881188118812, |
|
"grad_norm": 0.3853982786626955, |
|
"learning_rate": 6.447365858528046e-05, |
|
"loss": 0.4125, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.8168316831683167, |
|
"grad_norm": 0.3630198314521477, |
|
"learning_rate": 6.436416442019817e-05, |
|
"loss": 0.4156, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.8217821782178216, |
|
"grad_norm": 0.39036383937421515, |
|
"learning_rate": 6.425437923496045e-05, |
|
"loss": 0.4063, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.8267326732673266, |
|
"grad_norm": 0.34303444284998674, |
|
"learning_rate": 6.414430434090725e-05, |
|
"loss": 0.3907, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.8316831683168315, |
|
"grad_norm": 0.3721425083007295, |
|
"learning_rate": 6.403394105283897e-05, |
|
"loss": 0.3844, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.8366336633663365, |
|
"grad_norm": 0.3728755734239106, |
|
"learning_rate": 6.392329068900072e-05, |
|
"loss": 0.3786, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.8415841584158414, |
|
"grad_norm": 0.42539329737431825, |
|
"learning_rate": 6.381235457106664e-05, |
|
"loss": 0.4059, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.8465346534653464, |
|
"grad_norm": 0.3516158152148282, |
|
"learning_rate": 6.370113402412412e-05, |
|
"loss": 0.3877, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.8514851485148514, |
|
"grad_norm": 0.4715333524372133, |
|
"learning_rate": 6.358963037665787e-05, |
|
"loss": 0.408, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.8564356435643563, |
|
"grad_norm": 0.41249812553139303, |
|
"learning_rate": 6.347784496053416e-05, |
|
"loss": 0.38, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.8613861386138613, |
|
"grad_norm": 0.46013637363793264, |
|
"learning_rate": 6.336577911098493e-05, |
|
"loss": 0.3771, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.8663366336633662, |
|
"grad_norm": 0.4205308152403992, |
|
"learning_rate": 6.325343416659166e-05, |
|
"loss": 0.4055, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.8712871287128712, |
|
"grad_norm": 0.3766714102667099, |
|
"learning_rate": 6.314081146926964e-05, |
|
"loss": 0.4226, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.8762376237623761, |
|
"grad_norm": 0.39466868097270724, |
|
"learning_rate": 6.302791236425169e-05, |
|
"loss": 0.3764, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.881188118811881, |
|
"grad_norm": 0.41414658631461326, |
|
"learning_rate": 6.291473820007227e-05, |
|
"loss": 0.3937, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.886138613861386, |
|
"grad_norm": 0.4008669055354229, |
|
"learning_rate": 6.280129032855132e-05, |
|
"loss": 0.365, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.891089108910891, |
|
"grad_norm": 0.4299626259785867, |
|
"learning_rate": 6.268757010477806e-05, |
|
"loss": 0.3978, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.896039603960396, |
|
"grad_norm": 0.4070041784450541, |
|
"learning_rate": 6.257357888709492e-05, |
|
"loss": 0.4278, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.900990099009901, |
|
"grad_norm": 0.42982053279620297, |
|
"learning_rate": 6.245931803708116e-05, |
|
"loss": 0.4199, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.9059405940594059, |
|
"grad_norm": 0.4548088267032186, |
|
"learning_rate": 6.234478891953674e-05, |
|
"loss": 0.4105, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.9108910891089108, |
|
"grad_norm": 0.34938003297126835, |
|
"learning_rate": 6.222999290246595e-05, |
|
"loss": 0.3826, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.9158415841584158, |
|
"grad_norm": 0.4449921528368388, |
|
"learning_rate": 6.211493135706109e-05, |
|
"loss": 0.3915, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.9207920792079207, |
|
"grad_norm": 0.3954858861397491, |
|
"learning_rate": 6.199960565768611e-05, |
|
"loss": 0.4206, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.9257425742574257, |
|
"grad_norm": 0.3680314701826789, |
|
"learning_rate": 6.188401718186013e-05, |
|
"loss": 0.4084, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.9306930693069306, |
|
"grad_norm": 0.3616975564074992, |
|
"learning_rate": 6.17681673102411e-05, |
|
"loss": 0.3903, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.9356435643564356, |
|
"grad_norm": 0.37988021748884604, |
|
"learning_rate": 6.165205742660915e-05, |
|
"loss": 0.4257, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.9405940594059405, |
|
"grad_norm": 0.35051686029822304, |
|
"learning_rate": 6.15356889178502e-05, |
|
"loss": 0.4141, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.9455445544554455, |
|
"grad_norm": 0.36629291399470554, |
|
"learning_rate": 6.141906317393934e-05, |
|
"loss": 0.3573, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.9504950495049505, |
|
"grad_norm": 0.35465338755441933, |
|
"learning_rate": 6.130218158792421e-05, |
|
"loss": 0.3634, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.9554455445544554, |
|
"grad_norm": 0.3741800019861695, |
|
"learning_rate": 6.118504555590843e-05, |
|
"loss": 0.4103, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.9603960396039604, |
|
"grad_norm": 0.3845094498974814, |
|
"learning_rate": 6.10676564770348e-05, |
|
"loss": 0.4255, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.9653465346534653, |
|
"grad_norm": 0.3427607558490164, |
|
"learning_rate": 6.0950015753468745e-05, |
|
"loss": 0.3549, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.9702970297029703, |
|
"grad_norm": 0.38103519315598317, |
|
"learning_rate": 6.083212479038143e-05, |
|
"loss": 0.37, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.9752475247524752, |
|
"grad_norm": 0.349874345182412, |
|
"learning_rate": 6.0713984995933016e-05, |
|
"loss": 0.4148, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.9801980198019802, |
|
"grad_norm": 0.3526050935966009, |
|
"learning_rate": 6.059559778125593e-05, |
|
"loss": 0.4193, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.9851485148514851, |
|
"grad_norm": 0.41226502171796636, |
|
"learning_rate": 6.0476964560437864e-05, |
|
"loss": 0.4042, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.99009900990099, |
|
"grad_norm": 0.43735266521817034, |
|
"learning_rate": 6.035808675050497e-05, |
|
"loss": 0.4042, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.995049504950495, |
|
"grad_norm": 0.3808948433009949, |
|
"learning_rate": 6.023896577140496e-05, |
|
"loss": 0.4242, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.3753919394648301, |
|
"learning_rate": 6.011960304599003e-05, |
|
"loss": 0.3721, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.004950495049505, |
|
"grad_norm": 0.5337736086813397, |
|
"learning_rate": 6.000000000000001e-05, |
|
"loss": 0.256, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.00990099009901, |
|
"grad_norm": 0.43713310875092315, |
|
"learning_rate": 5.988015806204521e-05, |
|
"loss": 0.2488, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.014851485148515, |
|
"grad_norm": 0.4739558661506129, |
|
"learning_rate": 5.9760078663589454e-05, |
|
"loss": 0.2214, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.01980198019802, |
|
"grad_norm": 0.5637416820705298, |
|
"learning_rate": 5.9639763238932893e-05, |
|
"loss": 0.2541, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.0247524752475248, |
|
"grad_norm": 0.41314659967319184, |
|
"learning_rate": 5.9519213225194944e-05, |
|
"loss": 0.2242, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.0297029702970297, |
|
"grad_norm": 0.4476975471705969, |
|
"learning_rate": 5.9398430062297104e-05, |
|
"loss": 0.2436, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.0346534653465347, |
|
"grad_norm": 0.554086376519811, |
|
"learning_rate": 5.9277415192945707e-05, |
|
"loss": 0.2636, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.0396039603960396, |
|
"grad_norm": 0.3896325397871647, |
|
"learning_rate": 5.915617006261475e-05, |
|
"loss": 0.2163, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.0445544554455446, |
|
"grad_norm": 0.4211534758131753, |
|
"learning_rate": 5.903469611952861e-05, |
|
"loss": 0.2069, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.0495049504950495, |
|
"grad_norm": 0.4673961047376062, |
|
"learning_rate": 5.891299481464473e-05, |
|
"loss": 0.2357, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.0544554455445545, |
|
"grad_norm": 0.4699121939823616, |
|
"learning_rate": 5.8791067601636305e-05, |
|
"loss": 0.2132, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.0594059405940595, |
|
"grad_norm": 0.40250579961688465, |
|
"learning_rate": 5.866891593687492e-05, |
|
"loss": 0.2371, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.0643564356435644, |
|
"grad_norm": 0.4430137787396212, |
|
"learning_rate": 5.8546541279413094e-05, |
|
"loss": 0.2206, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.0693069306930694, |
|
"grad_norm": 0.6560412040889878, |
|
"learning_rate": 5.842394509096699e-05, |
|
"loss": 0.2548, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.0742574257425743, |
|
"grad_norm": 0.41799449349599727, |
|
"learning_rate": 5.8301128835898814e-05, |
|
"loss": 0.2098, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.0792079207920793, |
|
"grad_norm": 0.5457732239448745, |
|
"learning_rate": 5.817809398119937e-05, |
|
"loss": 0.2207, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.0841584158415842, |
|
"grad_norm": 0.4420461787241115, |
|
"learning_rate": 5.805484199647059e-05, |
|
"loss": 0.2348, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.089108910891089, |
|
"grad_norm": 0.34045786091490615, |
|
"learning_rate": 5.7931374353907904e-05, |
|
"loss": 0.237, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.094059405940594, |
|
"grad_norm": 0.5284323872556919, |
|
"learning_rate": 5.780769252828268e-05, |
|
"loss": 0.2465, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.099009900990099, |
|
"grad_norm": 0.38111779412176855, |
|
"learning_rate": 5.768379799692469e-05, |
|
"loss": 0.2203, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.103960396039604, |
|
"grad_norm": 0.48871360246565815, |
|
"learning_rate": 5.7559692239704255e-05, |
|
"loss": 0.2303, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.108910891089109, |
|
"grad_norm": 0.44331771149086596, |
|
"learning_rate": 5.743537673901485e-05, |
|
"loss": 0.2252, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.113861386138614, |
|
"grad_norm": 0.4268642412980195, |
|
"learning_rate": 5.731085297975516e-05, |
|
"loss": 0.2142, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.118811881188119, |
|
"grad_norm": 0.6275427494810922, |
|
"learning_rate": 5.718612244931146e-05, |
|
"loss": 0.2671, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.123762376237624, |
|
"grad_norm": 0.327732145674678, |
|
"learning_rate": 5.706118663753982e-05, |
|
"loss": 0.2165, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.128712871287129, |
|
"grad_norm": 0.4656441860841262, |
|
"learning_rate": 5.6936047036748335e-05, |
|
"loss": 0.2609, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.133663366336634, |
|
"grad_norm": 0.30479417092528327, |
|
"learning_rate": 5.6810705141679246e-05, |
|
"loss": 0.2076, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.1386138613861387, |
|
"grad_norm": 0.402408077735042, |
|
"learning_rate": 5.6685162449491125e-05, |
|
"loss": 0.2352, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.1435643564356437, |
|
"grad_norm": 0.3341797567078978, |
|
"learning_rate": 5.655942045974101e-05, |
|
"loss": 0.2223, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.1485148514851486, |
|
"grad_norm": 0.31991687492148513, |
|
"learning_rate": 5.643348067436644e-05, |
|
"loss": 0.2234, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.1534653465346536, |
|
"grad_norm": 0.31279738632606163, |
|
"learning_rate": 5.6307344597667555e-05, |
|
"loss": 0.2297, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.1584158415841586, |
|
"grad_norm": 0.33002843854360975, |
|
"learning_rate": 5.6181013736289114e-05, |
|
"loss": 0.2174, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.1633663366336635, |
|
"grad_norm": 0.35214742510811414, |
|
"learning_rate": 5.605448959920251e-05, |
|
"loss": 0.2256, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.1683168316831685, |
|
"grad_norm": 0.40557291840727117, |
|
"learning_rate": 5.5927773697687726e-05, |
|
"loss": 0.2428, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.1732673267326734, |
|
"grad_norm": 0.37799249434729715, |
|
"learning_rate": 5.580086754531527e-05, |
|
"loss": 0.204, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.1782178217821784, |
|
"grad_norm": 0.3420962200582764, |
|
"learning_rate": 5.567377265792819e-05, |
|
"loss": 0.2366, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.1831683168316833, |
|
"grad_norm": 0.3751006820518302, |
|
"learning_rate": 5.554649055362381e-05, |
|
"loss": 0.2337, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.1881188118811883, |
|
"grad_norm": 0.3571671609515524, |
|
"learning_rate": 5.5419022752735764e-05, |
|
"loss": 0.2266, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.1930693069306932, |
|
"grad_norm": 0.37320519253976664, |
|
"learning_rate": 5.5291370777815693e-05, |
|
"loss": 0.232, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.198019801980198, |
|
"grad_norm": 0.3161181093567384, |
|
"learning_rate": 5.5163536153615185e-05, |
|
"loss": 0.2235, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.202970297029703, |
|
"grad_norm": 0.46028647031599745, |
|
"learning_rate": 5.503552040706744e-05, |
|
"loss": 0.2503, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.207920792079208, |
|
"grad_norm": 0.3193257025863596, |
|
"learning_rate": 5.490732506726911e-05, |
|
"loss": 0.2317, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.212871287128713, |
|
"grad_norm": 0.3351952567580859, |
|
"learning_rate": 5.477895166546207e-05, |
|
"loss": 0.2301, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.217821782178218, |
|
"grad_norm": 0.46300097922283723, |
|
"learning_rate": 5.4650401735014985e-05, |
|
"loss": 0.2236, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.222772277227723, |
|
"grad_norm": 0.36846314014382436, |
|
"learning_rate": 5.452167681140515e-05, |
|
"loss": 0.2591, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.227722772277228, |
|
"grad_norm": 0.3411528537586109, |
|
"learning_rate": 5.4392778432200044e-05, |
|
"loss": 0.2232, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.232673267326733, |
|
"grad_norm": 0.40101497418695975, |
|
"learning_rate": 5.426370813703903e-05, |
|
"loss": 0.2468, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.237623762376238, |
|
"grad_norm": 0.36325004554083573, |
|
"learning_rate": 5.4134467467614945e-05, |
|
"loss": 0.2333, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.2425742574257423, |
|
"grad_norm": 0.4039573273985563, |
|
"learning_rate": 5.4005057967655634e-05, |
|
"loss": 0.2389, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.2475247524752477, |
|
"grad_norm": 0.2859750720015901, |
|
"learning_rate": 5.3875481182905595e-05, |
|
"loss": 0.2119, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.2524752475247523, |
|
"grad_norm": 0.4823602187196335, |
|
"learning_rate": 5.374573866110746e-05, |
|
"loss": 0.2547, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.2574257425742577, |
|
"grad_norm": 0.36717753790523777, |
|
"learning_rate": 5.3615831951983535e-05, |
|
"loss": 0.2157, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.262376237623762, |
|
"grad_norm": 0.356291153234771, |
|
"learning_rate": 5.348576260721725e-05, |
|
"loss": 0.2484, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.2673267326732676, |
|
"grad_norm": 0.34698712469187953, |
|
"learning_rate": 5.3355532180434696e-05, |
|
"loss": 0.2309, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.272277227722772, |
|
"grad_norm": 0.3596381562555409, |
|
"learning_rate": 5.3225142227185974e-05, |
|
"loss": 0.229, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.2772277227722775, |
|
"grad_norm": 0.32545619571413364, |
|
"learning_rate": 5.309459430492672e-05, |
|
"loss": 0.2264, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.282178217821782, |
|
"grad_norm": 0.35180510825756656, |
|
"learning_rate": 5.2963889972999384e-05, |
|
"loss": 0.2475, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.287128712871287, |
|
"grad_norm": 0.3606148958622392, |
|
"learning_rate": 5.283303079261471e-05, |
|
"loss": 0.2442, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.292079207920792, |
|
"grad_norm": 0.3563222394672158, |
|
"learning_rate": 5.2702018326833044e-05, |
|
"loss": 0.2347, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.297029702970297, |
|
"grad_norm": 0.3831444963521827, |
|
"learning_rate": 5.257085414054565e-05, |
|
"loss": 0.2747, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 2.301980198019802, |
|
"grad_norm": 0.33730785554599724, |
|
"learning_rate": 5.243953980045603e-05, |
|
"loss": 0.2388, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.3069306930693068, |
|
"grad_norm": 0.32495238050734326, |
|
"learning_rate": 5.230807687506122e-05, |
|
"loss": 0.2364, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 2.3118811881188117, |
|
"grad_norm": 0.37140573353028156, |
|
"learning_rate": 5.2176466934633045e-05, |
|
"loss": 0.2436, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 2.3168316831683167, |
|
"grad_norm": 0.36493258422574965, |
|
"learning_rate": 5.204471155119938e-05, |
|
"loss": 0.2337, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 2.3217821782178216, |
|
"grad_norm": 0.3152805218553869, |
|
"learning_rate": 5.191281229852534e-05, |
|
"loss": 0.2505, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 2.3267326732673266, |
|
"grad_norm": 0.3346938783234162, |
|
"learning_rate": 5.17807707520945e-05, |
|
"loss": 0.2129, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.3316831683168315, |
|
"grad_norm": 0.5222639196813958, |
|
"learning_rate": 5.164858848909009e-05, |
|
"loss": 0.2666, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 2.3366336633663365, |
|
"grad_norm": 0.31652796650422926, |
|
"learning_rate": 5.151626708837612e-05, |
|
"loss": 0.217, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 2.3415841584158414, |
|
"grad_norm": 0.3426513953545765, |
|
"learning_rate": 5.1383808130478605e-05, |
|
"loss": 0.2374, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 2.3465346534653464, |
|
"grad_norm": 0.36290882542143066, |
|
"learning_rate": 5.1251213197566515e-05, |
|
"loss": 0.235, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 2.3514851485148514, |
|
"grad_norm": 0.42009903492564477, |
|
"learning_rate": 5.11184838734331e-05, |
|
"loss": 0.2441, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.3564356435643563, |
|
"grad_norm": 0.3054426542267136, |
|
"learning_rate": 5.098562174347679e-05, |
|
"loss": 0.2157, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 2.3613861386138613, |
|
"grad_norm": 0.3823497680281174, |
|
"learning_rate": 5.085262839468236e-05, |
|
"loss": 0.2248, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 2.366336633663366, |
|
"grad_norm": 0.5103288301977376, |
|
"learning_rate": 5.071950541560193e-05, |
|
"loss": 0.2518, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 2.371287128712871, |
|
"grad_norm": 0.30455267923588514, |
|
"learning_rate": 5.058625439633599e-05, |
|
"loss": 0.2181, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 2.376237623762376, |
|
"grad_norm": 0.3818419897600763, |
|
"learning_rate": 5.0452876928514434e-05, |
|
"loss": 0.2285, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.381188118811881, |
|
"grad_norm": 0.3699320852480438, |
|
"learning_rate": 5.031937460527753e-05, |
|
"loss": 0.2398, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 2.386138613861386, |
|
"grad_norm": 0.3644442422492638, |
|
"learning_rate": 5.018574902125689e-05, |
|
"loss": 0.2351, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 2.391089108910891, |
|
"grad_norm": 0.35930941575494146, |
|
"learning_rate": 5.005200177255645e-05, |
|
"loss": 0.2342, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 2.396039603960396, |
|
"grad_norm": 0.37752588100484835, |
|
"learning_rate": 4.991813445673334e-05, |
|
"loss": 0.2253, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 2.400990099009901, |
|
"grad_norm": 0.32893758478857993, |
|
"learning_rate": 4.9784148672778864e-05, |
|
"loss": 0.2282, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.405940594059406, |
|
"grad_norm": 0.30149336304495533, |
|
"learning_rate": 4.965004602109938e-05, |
|
"loss": 0.2061, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 2.410891089108911, |
|
"grad_norm": 0.3219254238354535, |
|
"learning_rate": 4.95158281034972e-05, |
|
"loss": 0.2163, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 2.4158415841584158, |
|
"grad_norm": 0.31525960771805756, |
|
"learning_rate": 4.938149652315142e-05, |
|
"loss": 0.2321, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 2.4207920792079207, |
|
"grad_norm": 0.39088144618312287, |
|
"learning_rate": 4.92470528845988e-05, |
|
"loss": 0.2567, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 2.4257425742574257, |
|
"grad_norm": 0.3449499761490203, |
|
"learning_rate": 4.911249879371457e-05, |
|
"loss": 0.2372, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.4306930693069306, |
|
"grad_norm": 0.45815109391750647, |
|
"learning_rate": 4.897783585769331e-05, |
|
"loss": 0.2357, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 2.4356435643564356, |
|
"grad_norm": 0.3118857748477582, |
|
"learning_rate": 4.884306568502968e-05, |
|
"loss": 0.2327, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 2.4405940594059405, |
|
"grad_norm": 0.3386329573636345, |
|
"learning_rate": 4.870818988549923e-05, |
|
"loss": 0.2576, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 2.4455445544554455, |
|
"grad_norm": 0.33032199797382256, |
|
"learning_rate": 4.857321007013924e-05, |
|
"loss": 0.236, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 2.4504950495049505, |
|
"grad_norm": 0.30078783509268575, |
|
"learning_rate": 4.843812785122933e-05, |
|
"loss": 0.2181, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.4554455445544554, |
|
"grad_norm": 0.34354471178005036, |
|
"learning_rate": 4.830294484227236e-05, |
|
"loss": 0.2349, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 2.4603960396039604, |
|
"grad_norm": 0.32969555135194983, |
|
"learning_rate": 4.816766265797505e-05, |
|
"loss": 0.2446, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 2.4653465346534653, |
|
"grad_norm": 0.3379586969710406, |
|
"learning_rate": 4.8032282914228743e-05, |
|
"loss": 0.2402, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 2.4702970297029703, |
|
"grad_norm": 0.3148344857955309, |
|
"learning_rate": 4.78968072280901e-05, |
|
"loss": 0.2097, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 2.4752475247524752, |
|
"grad_norm": 0.31145265258949967, |
|
"learning_rate": 4.7761237217761736e-05, |
|
"loss": 0.2172, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.48019801980198, |
|
"grad_norm": 0.3017333863543826, |
|
"learning_rate": 4.7625574502572975e-05, |
|
"loss": 0.2222, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 2.485148514851485, |
|
"grad_norm": 0.35871571827001464, |
|
"learning_rate": 4.7489820702960444e-05, |
|
"loss": 0.2245, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 2.49009900990099, |
|
"grad_norm": 0.31489941844300473, |
|
"learning_rate": 4.735397744044874e-05, |
|
"loss": 0.2187, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 2.495049504950495, |
|
"grad_norm": 0.3639077058752089, |
|
"learning_rate": 4.721804633763105e-05, |
|
"loss": 0.2393, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.3193495325768187, |
|
"learning_rate": 4.7082029018149816e-05, |
|
"loss": 0.2404, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.504950495049505, |
|
"grad_norm": 0.35786177063036884, |
|
"learning_rate": 4.694592710667723e-05, |
|
"loss": 0.2563, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 2.50990099009901, |
|
"grad_norm": 0.36096478997948417, |
|
"learning_rate": 4.680974222889595e-05, |
|
"loss": 0.2461, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 2.514851485148515, |
|
"grad_norm": 0.3205057209076732, |
|
"learning_rate": 4.667347601147965e-05, |
|
"loss": 0.2146, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 2.51980198019802, |
|
"grad_norm": 0.3052920805898413, |
|
"learning_rate": 4.653713008207353e-05, |
|
"loss": 0.2008, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 2.5247524752475248, |
|
"grad_norm": 0.31137170709518075, |
|
"learning_rate": 4.640070606927497e-05, |
|
"loss": 0.2239, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.5297029702970297, |
|
"grad_norm": 0.3339899865251113, |
|
"learning_rate": 4.6264205602613944e-05, |
|
"loss": 0.2238, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 2.5346534653465347, |
|
"grad_norm": 0.2869330890283644, |
|
"learning_rate": 4.612763031253372e-05, |
|
"loss": 0.1928, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 2.5396039603960396, |
|
"grad_norm": 0.35107221358186375, |
|
"learning_rate": 4.599098183037127e-05, |
|
"loss": 0.246, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 2.5445544554455446, |
|
"grad_norm": 0.3049637104038534, |
|
"learning_rate": 4.5854261788337785e-05, |
|
"loss": 0.2132, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 2.5495049504950495, |
|
"grad_norm": 0.45177004334926596, |
|
"learning_rate": 4.571747181949928e-05, |
|
"loss": 0.2414, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.5544554455445545, |
|
"grad_norm": 0.30792835653348943, |
|
"learning_rate": 4.558061355775693e-05, |
|
"loss": 0.2018, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 2.5594059405940595, |
|
"grad_norm": 0.3621769791555683, |
|
"learning_rate": 4.5443688637827716e-05, |
|
"loss": 0.2306, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 2.5643564356435644, |
|
"grad_norm": 0.4396288298273991, |
|
"learning_rate": 4.530669869522478e-05, |
|
"loss": 0.2691, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 2.5693069306930694, |
|
"grad_norm": 0.3068970318327044, |
|
"learning_rate": 4.516964536623796e-05, |
|
"loss": 0.2474, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 2.5742574257425743, |
|
"grad_norm": 0.33600645421416364, |
|
"learning_rate": 4.503253028791422e-05, |
|
"loss": 0.2306, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.5792079207920793, |
|
"grad_norm": 0.2915774520149471, |
|
"learning_rate": 4.489535509803806e-05, |
|
"loss": 0.2332, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 2.5841584158415842, |
|
"grad_norm": 0.3036235999189328, |
|
"learning_rate": 4.475812143511202e-05, |
|
"loss": 0.2288, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 2.589108910891089, |
|
"grad_norm": 0.31154126635878154, |
|
"learning_rate": 4.4620830938337055e-05, |
|
"loss": 0.2219, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 2.594059405940594, |
|
"grad_norm": 0.32093686482922884, |
|
"learning_rate": 4.448348524759302e-05, |
|
"loss": 0.2212, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 2.599009900990099, |
|
"grad_norm": 0.325443387638589, |
|
"learning_rate": 4.4346086003418985e-05, |
|
"loss": 0.2317, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.603960396039604, |
|
"grad_norm": 0.30353414863792527, |
|
"learning_rate": 4.420863484699374e-05, |
|
"loss": 0.2262, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 2.608910891089109, |
|
"grad_norm": 0.3454386372059614, |
|
"learning_rate": 4.4071133420116106e-05, |
|
"loss": 0.2278, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 2.613861386138614, |
|
"grad_norm": 0.30102098435727376, |
|
"learning_rate": 4.3933583365185396e-05, |
|
"loss": 0.221, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 2.618811881188119, |
|
"grad_norm": 0.28474854963641383, |
|
"learning_rate": 4.379598632518175e-05, |
|
"loss": 0.2051, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 2.623762376237624, |
|
"grad_norm": 0.32500866792253486, |
|
"learning_rate": 4.365834394364653e-05, |
|
"loss": 0.2342, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.628712871287129, |
|
"grad_norm": 0.3188951539017567, |
|
"learning_rate": 4.35206578646627e-05, |
|
"loss": 0.224, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 2.633663366336634, |
|
"grad_norm": 0.28927265561371424, |
|
"learning_rate": 4.338292973283512e-05, |
|
"loss": 0.1787, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 2.6386138613861387, |
|
"grad_norm": 0.3344379754147368, |
|
"learning_rate": 4.324516119327102e-05, |
|
"loss": 0.2232, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 2.6435643564356437, |
|
"grad_norm": 0.35614810979036243, |
|
"learning_rate": 4.310735389156026e-05, |
|
"loss": 0.2458, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 2.6485148514851486, |
|
"grad_norm": 0.33358397345836344, |
|
"learning_rate": 4.296950947375566e-05, |
|
"loss": 0.2248, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.6534653465346536, |
|
"grad_norm": 0.29281872495236416, |
|
"learning_rate": 4.2831629586353446e-05, |
|
"loss": 0.23, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 2.6584158415841586, |
|
"grad_norm": 0.3567082705106451, |
|
"learning_rate": 4.269371587627346e-05, |
|
"loss": 0.245, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 2.6633663366336635, |
|
"grad_norm": 0.5510776836916321, |
|
"learning_rate": 4.255576999083956e-05, |
|
"loss": 0.2654, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 2.6683168316831685, |
|
"grad_norm": 0.3484675969209168, |
|
"learning_rate": 4.241779357775993e-05, |
|
"loss": 0.2267, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 2.6732673267326734, |
|
"grad_norm": 0.32743248333395614, |
|
"learning_rate": 4.227978828510739e-05, |
|
"loss": 0.197, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.6782178217821784, |
|
"grad_norm": 0.3698834754112533, |
|
"learning_rate": 4.214175576129972e-05, |
|
"loss": 0.2347, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 2.6831683168316833, |
|
"grad_norm": 0.3108200784170499, |
|
"learning_rate": 4.200369765507995e-05, |
|
"loss": 0.2022, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 2.6881188118811883, |
|
"grad_norm": 0.39122986107770225, |
|
"learning_rate": 4.18656156154967e-05, |
|
"loss": 0.2678, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 2.693069306930693, |
|
"grad_norm": 0.3520361936650789, |
|
"learning_rate": 4.172751129188447e-05, |
|
"loss": 0.2345, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 2.698019801980198, |
|
"grad_norm": 0.3360896398211906, |
|
"learning_rate": 4.158938633384389e-05, |
|
"loss": 0.2204, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.7029702970297027, |
|
"grad_norm": 0.3189999605477287, |
|
"learning_rate": 4.1451242391222105e-05, |
|
"loss": 0.1835, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 2.707920792079208, |
|
"grad_norm": 0.2878525852539076, |
|
"learning_rate": 4.1313081114093025e-05, |
|
"loss": 0.194, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 2.7128712871287126, |
|
"grad_norm": 0.37399660980915755, |
|
"learning_rate": 4.117490415273757e-05, |
|
"loss": 0.2429, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 2.717821782178218, |
|
"grad_norm": 0.4364451292360142, |
|
"learning_rate": 4.1036713157624045e-05, |
|
"loss": 0.272, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 2.7227722772277225, |
|
"grad_norm": 0.2885760814922969, |
|
"learning_rate": 4.089850977938836e-05, |
|
"loss": 0.2344, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.727722772277228, |
|
"grad_norm": 0.3179182415930085, |
|
"learning_rate": 4.076029566881436e-05, |
|
"loss": 0.2067, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 2.7326732673267324, |
|
"grad_norm": 0.3324523312729288, |
|
"learning_rate": 4.0622072476814045e-05, |
|
"loss": 0.2173, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 2.737623762376238, |
|
"grad_norm": 0.3463898334558332, |
|
"learning_rate": 4.0483841854407906e-05, |
|
"loss": 0.23, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 2.7425742574257423, |
|
"grad_norm": 1.349745744481094, |
|
"learning_rate": 4.0345605452705225e-05, |
|
"loss": 0.3042, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 2.7475247524752477, |
|
"grad_norm": 0.3557405479241251, |
|
"learning_rate": 4.020736492288426e-05, |
|
"loss": 0.244, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.7524752475247523, |
|
"grad_norm": 0.3325914967332085, |
|
"learning_rate": 4.006912191617259e-05, |
|
"loss": 0.2448, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 2.7574257425742577, |
|
"grad_norm": 0.3123897923148057, |
|
"learning_rate": 3.993087808382742e-05, |
|
"loss": 0.2268, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 2.762376237623762, |
|
"grad_norm": 0.3136369159963456, |
|
"learning_rate": 3.9792635077115755e-05, |
|
"loss": 0.2121, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 2.7673267326732676, |
|
"grad_norm": 0.3236652597206635, |
|
"learning_rate": 3.9654394547294775e-05, |
|
"loss": 0.2265, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 2.772277227722772, |
|
"grad_norm": 0.3053161722456095, |
|
"learning_rate": 3.9516158145592093e-05, |
|
"loss": 0.2187, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.7772277227722775, |
|
"grad_norm": 0.3305154761547749, |
|
"learning_rate": 3.937792752318597e-05, |
|
"loss": 0.2394, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 2.782178217821782, |
|
"grad_norm": 0.3637555988086301, |
|
"learning_rate": 3.923970433118566e-05, |
|
"loss": 0.2367, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 2.7871287128712874, |
|
"grad_norm": 0.3171839114698826, |
|
"learning_rate": 3.9101490220611646e-05, |
|
"loss": 0.2285, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 2.792079207920792, |
|
"grad_norm": 0.33661618797255727, |
|
"learning_rate": 3.8963286842375955e-05, |
|
"loss": 0.227, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 2.7970297029702973, |
|
"grad_norm": 0.3226709988581632, |
|
"learning_rate": 3.882509584726244e-05, |
|
"loss": 0.2186, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.801980198019802, |
|
"grad_norm": 0.34629105834430013, |
|
"learning_rate": 3.868691888590699e-05, |
|
"loss": 0.2533, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 2.806930693069307, |
|
"grad_norm": 0.6968328315911714, |
|
"learning_rate": 3.854875760877791e-05, |
|
"loss": 0.2399, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 2.8118811881188117, |
|
"grad_norm": 0.3350430446776809, |
|
"learning_rate": 3.8410613666156126e-05, |
|
"loss": 0.222, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 2.8168316831683167, |
|
"grad_norm": 0.35262490412149583, |
|
"learning_rate": 3.8272488708115536e-05, |
|
"loss": 0.2512, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 2.8217821782178216, |
|
"grad_norm": 0.30175139855193056, |
|
"learning_rate": 3.81343843845033e-05, |
|
"loss": 0.2118, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.8267326732673266, |
|
"grad_norm": 0.3224948254073621, |
|
"learning_rate": 3.7996302344920056e-05, |
|
"loss": 0.2368, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 2.8316831683168315, |
|
"grad_norm": 0.3142472270583956, |
|
"learning_rate": 3.785824423870029e-05, |
|
"loss": 0.247, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 2.8366336633663365, |
|
"grad_norm": 0.2930728895351729, |
|
"learning_rate": 3.772021171489261e-05, |
|
"loss": 0.2028, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 2.8415841584158414, |
|
"grad_norm": 0.2979919926832772, |
|
"learning_rate": 3.7582206422240073e-05, |
|
"loss": 0.213, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 2.8465346534653464, |
|
"grad_norm": 0.31269091733709203, |
|
"learning_rate": 3.744423000916045e-05, |
|
"loss": 0.2343, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.8514851485148514, |
|
"grad_norm": 0.31493725391362487, |
|
"learning_rate": 3.7306284123726545e-05, |
|
"loss": 0.2337, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 2.8564356435643563, |
|
"grad_norm": 0.29253133729107844, |
|
"learning_rate": 3.716837041364657e-05, |
|
"loss": 0.2406, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 2.8613861386138613, |
|
"grad_norm": 0.29891111766859063, |
|
"learning_rate": 3.703049052624434e-05, |
|
"loss": 0.2385, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 2.866336633663366, |
|
"grad_norm": 0.33892105214317203, |
|
"learning_rate": 3.689264610843975e-05, |
|
"loss": 0.23, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 2.871287128712871, |
|
"grad_norm": 0.3232151525300889, |
|
"learning_rate": 3.6754838806728985e-05, |
|
"loss": 0.2372, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.876237623762376, |
|
"grad_norm": 0.3185266368118756, |
|
"learning_rate": 3.6617070267164895e-05, |
|
"loss": 0.2343, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 2.881188118811881, |
|
"grad_norm": 0.29709076121109895, |
|
"learning_rate": 3.647934213533733e-05, |
|
"loss": 0.1971, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 2.886138613861386, |
|
"grad_norm": 0.3235673389076069, |
|
"learning_rate": 3.634165605635347e-05, |
|
"loss": 0.2334, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 2.891089108910891, |
|
"grad_norm": 0.28878380789158536, |
|
"learning_rate": 3.6204013674818264e-05, |
|
"loss": 0.203, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 2.896039603960396, |
|
"grad_norm": 0.2981782547905849, |
|
"learning_rate": 3.606641663481462e-05, |
|
"loss": 0.211, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.900990099009901, |
|
"grad_norm": 0.337934548527355, |
|
"learning_rate": 3.5928866579883914e-05, |
|
"loss": 0.2228, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 2.905940594059406, |
|
"grad_norm": 0.28548908884726665, |
|
"learning_rate": 3.579136515300627e-05, |
|
"loss": 0.1974, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 2.910891089108911, |
|
"grad_norm": 0.3170801695426472, |
|
"learning_rate": 3.565391399658102e-05, |
|
"loss": 0.2714, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 2.9158415841584158, |
|
"grad_norm": 0.31210516683665845, |
|
"learning_rate": 3.5516514752406996e-05, |
|
"loss": 0.1991, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 2.9207920792079207, |
|
"grad_norm": 0.3252111899517891, |
|
"learning_rate": 3.537916906166295e-05, |
|
"loss": 0.2538, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.9257425742574257, |
|
"grad_norm": 0.34206702703586866, |
|
"learning_rate": 3.5241878564888006e-05, |
|
"loss": 0.237, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 2.9306930693069306, |
|
"grad_norm": 0.30352676209933477, |
|
"learning_rate": 3.510464490196195e-05, |
|
"loss": 0.2045, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 2.9356435643564356, |
|
"grad_norm": 0.27190930148752757, |
|
"learning_rate": 3.496746971208579e-05, |
|
"loss": 0.2173, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 2.9405940594059405, |
|
"grad_norm": 0.3324654854830766, |
|
"learning_rate": 3.4830354633762044e-05, |
|
"loss": 0.2502, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 2.9455445544554455, |
|
"grad_norm": 0.3497620111470606, |
|
"learning_rate": 3.4693301304775226e-05, |
|
"loss": 0.2359, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.9504950495049505, |
|
"grad_norm": 0.27327963847153197, |
|
"learning_rate": 3.455631136217231e-05, |
|
"loss": 0.2078, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 2.9554455445544554, |
|
"grad_norm": 0.30380647947017314, |
|
"learning_rate": 3.4419386442243084e-05, |
|
"loss": 0.2269, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 2.9603960396039604, |
|
"grad_norm": 0.28430600655210836, |
|
"learning_rate": 3.428252818050074e-05, |
|
"loss": 0.2107, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 2.9653465346534653, |
|
"grad_norm": 0.26557159723378293, |
|
"learning_rate": 3.414573821166222e-05, |
|
"loss": 0.2057, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 2.9702970297029703, |
|
"grad_norm": 0.2862759138114645, |
|
"learning_rate": 3.4009018169628744e-05, |
|
"loss": 0.2087, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.9752475247524752, |
|
"grad_norm": 0.28158416627334854, |
|
"learning_rate": 3.38723696874663e-05, |
|
"loss": 0.2194, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 2.98019801980198, |
|
"grad_norm": 0.35712881847682365, |
|
"learning_rate": 3.373579439738606e-05, |
|
"loss": 0.2387, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 2.985148514851485, |
|
"grad_norm": 0.2884121129670916, |
|
"learning_rate": 3.359929393072505e-05, |
|
"loss": 0.2216, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 2.99009900990099, |
|
"grad_norm": 0.30058325322611185, |
|
"learning_rate": 3.346286991792648e-05, |
|
"loss": 0.2334, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 2.995049504950495, |
|
"grad_norm": 0.2843610698185909, |
|
"learning_rate": 3.3326523988520365e-05, |
|
"loss": 0.2067, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.34310662923578245, |
|
"learning_rate": 3.3190257771104055e-05, |
|
"loss": 0.2116, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 3.004950495049505, |
|
"grad_norm": 0.3425747462145039, |
|
"learning_rate": 3.305407289332279e-05, |
|
"loss": 0.0851, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 3.00990099009901, |
|
"grad_norm": 0.3033653125782944, |
|
"learning_rate": 3.2917970981850205e-05, |
|
"loss": 0.0897, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 3.014851485148515, |
|
"grad_norm": 0.28638471767451534, |
|
"learning_rate": 3.2781953662368954e-05, |
|
"loss": 0.0909, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 3.01980198019802, |
|
"grad_norm": 0.2844387140751838, |
|
"learning_rate": 3.264602255955127e-05, |
|
"loss": 0.0855, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.0247524752475248, |
|
"grad_norm": 0.25783024583363257, |
|
"learning_rate": 3.251017929703956e-05, |
|
"loss": 0.0739, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 3.0297029702970297, |
|
"grad_norm": 0.4455150785898187, |
|
"learning_rate": 3.237442549742704e-05, |
|
"loss": 0.08, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 3.0346534653465347, |
|
"grad_norm": 0.2649934418173023, |
|
"learning_rate": 3.223876278223828e-05, |
|
"loss": 0.0659, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 3.0396039603960396, |
|
"grad_norm": 0.28349483967189754, |
|
"learning_rate": 3.2103192771909927e-05, |
|
"loss": 0.0677, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 3.0445544554455446, |
|
"grad_norm": 0.2520579031781572, |
|
"learning_rate": 3.196771708577127e-05, |
|
"loss": 0.0675, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.0495049504950495, |
|
"grad_norm": 0.26346383406033047, |
|
"learning_rate": 3.1832337342024956e-05, |
|
"loss": 0.0774, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 3.0544554455445545, |
|
"grad_norm": 0.24009068817781143, |
|
"learning_rate": 3.1697055157727654e-05, |
|
"loss": 0.0797, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 3.0594059405940595, |
|
"grad_norm": 0.2480503867107148, |
|
"learning_rate": 3.156187214877068e-05, |
|
"loss": 0.0863, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 3.0643564356435644, |
|
"grad_norm": 0.29158321889505673, |
|
"learning_rate": 3.142678992986078e-05, |
|
"loss": 0.0744, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 3.0693069306930694, |
|
"grad_norm": 0.2367292120532588, |
|
"learning_rate": 3.129181011450077e-05, |
|
"loss": 0.0799, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 3.0742574257425743, |
|
"grad_norm": 0.23939469471682348, |
|
"learning_rate": 3.115693431497033e-05, |
|
"loss": 0.0769, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 3.0792079207920793, |
|
"grad_norm": 0.21921181988376112, |
|
"learning_rate": 3.102216414230671e-05, |
|
"loss": 0.0694, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 3.0841584158415842, |
|
"grad_norm": 0.2520551999313607, |
|
"learning_rate": 3.0887501206285436e-05, |
|
"loss": 0.0787, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 3.089108910891089, |
|
"grad_norm": 0.27900349033438354, |
|
"learning_rate": 3.075294711540123e-05, |
|
"loss": 0.0882, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 3.094059405940594, |
|
"grad_norm": 0.24848095506852333, |
|
"learning_rate": 3.061850347684859e-05, |
|
"loss": 0.0692, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.099009900990099, |
|
"grad_norm": 0.2685764262687839, |
|
"learning_rate": 3.0484171896502805e-05, |
|
"loss": 0.0828, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 3.103960396039604, |
|
"grad_norm": 0.2295364672313301, |
|
"learning_rate": 3.034995397890063e-05, |
|
"loss": 0.0685, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 3.108910891089109, |
|
"grad_norm": 0.2522857633506632, |
|
"learning_rate": 3.0215851327221163e-05, |
|
"loss": 0.0755, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 3.113861386138614, |
|
"grad_norm": 0.2340600632944488, |
|
"learning_rate": 3.0081865543266687e-05, |
|
"loss": 0.0716, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 3.118811881188119, |
|
"grad_norm": 0.2514923962812719, |
|
"learning_rate": 2.994799822744356e-05, |
|
"loss": 0.0801, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.123762376237624, |
|
"grad_norm": 0.22934807794127365, |
|
"learning_rate": 2.9814250978743115e-05, |
|
"loss": 0.0707, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 3.128712871287129, |
|
"grad_norm": 0.2322249593545606, |
|
"learning_rate": 2.9680625394722483e-05, |
|
"loss": 0.0709, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 3.133663366336634, |
|
"grad_norm": 0.2552323877342474, |
|
"learning_rate": 2.9547123071485586e-05, |
|
"loss": 0.0826, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 3.1386138613861387, |
|
"grad_norm": 0.229864941042402, |
|
"learning_rate": 2.9413745603664023e-05, |
|
"loss": 0.0635, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 3.1435643564356437, |
|
"grad_norm": 0.24313717952069136, |
|
"learning_rate": 2.928049458439808e-05, |
|
"loss": 0.0838, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 3.1485148514851486, |
|
"grad_norm": 0.25035045583769344, |
|
"learning_rate": 2.914737160531765e-05, |
|
"loss": 0.0722, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 3.1534653465346536, |
|
"grad_norm": 0.256842768073215, |
|
"learning_rate": 2.9014378256523218e-05, |
|
"loss": 0.086, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 3.1584158415841586, |
|
"grad_norm": 0.27803315855835337, |
|
"learning_rate": 2.888151612656692e-05, |
|
"loss": 0.0777, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 3.1633663366336635, |
|
"grad_norm": 0.2642554173662771, |
|
"learning_rate": 2.874878680243349e-05, |
|
"loss": 0.0748, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 3.1683168316831685, |
|
"grad_norm": 0.26609660151383924, |
|
"learning_rate": 2.8616191869521412e-05, |
|
"loss": 0.0941, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.1732673267326734, |
|
"grad_norm": 0.2507627298480842, |
|
"learning_rate": 2.8483732911623882e-05, |
|
"loss": 0.0705, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 3.1782178217821784, |
|
"grad_norm": 0.25655475416599705, |
|
"learning_rate": 2.8351411510909926e-05, |
|
"loss": 0.0811, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 3.1831683168316833, |
|
"grad_norm": 0.26109475686466127, |
|
"learning_rate": 2.821922924790552e-05, |
|
"loss": 0.0877, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 3.1881188118811883, |
|
"grad_norm": 0.23733552033709807, |
|
"learning_rate": 2.8087187701474667e-05, |
|
"loss": 0.0819, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 3.1930693069306932, |
|
"grad_norm": 0.23699501435988432, |
|
"learning_rate": 2.7955288448800628e-05, |
|
"loss": 0.0731, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 3.198019801980198, |
|
"grad_norm": 0.24748871746240322, |
|
"learning_rate": 2.7823533065366965e-05, |
|
"loss": 0.0763, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 3.202970297029703, |
|
"grad_norm": 0.25641166163891144, |
|
"learning_rate": 2.7691923124938794e-05, |
|
"loss": 0.0862, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 3.207920792079208, |
|
"grad_norm": 0.25223945193358493, |
|
"learning_rate": 2.756046019954398e-05, |
|
"loss": 0.0783, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 3.212871287128713, |
|
"grad_norm": 0.22946599813308205, |
|
"learning_rate": 2.742914585945436e-05, |
|
"loss": 0.0684, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 3.217821782178218, |
|
"grad_norm": 0.23227418913106662, |
|
"learning_rate": 2.7297981673166963e-05, |
|
"loss": 0.0726, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.222772277227723, |
|
"grad_norm": 0.2708641105053437, |
|
"learning_rate": 2.71669692073853e-05, |
|
"loss": 0.0809, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 3.227722772277228, |
|
"grad_norm": 0.23941294277593003, |
|
"learning_rate": 2.7036110027000636e-05, |
|
"loss": 0.0744, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 3.232673267326733, |
|
"grad_norm": 0.22427334271376922, |
|
"learning_rate": 2.690540569507329e-05, |
|
"loss": 0.0562, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 3.237623762376238, |
|
"grad_norm": 0.2397564104418557, |
|
"learning_rate": 2.677485777281403e-05, |
|
"loss": 0.0709, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 3.2425742574257423, |
|
"grad_norm": 0.24228483254952923, |
|
"learning_rate": 2.6644467819565317e-05, |
|
"loss": 0.0675, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 3.2475247524752477, |
|
"grad_norm": 0.2778549352377007, |
|
"learning_rate": 2.651423739278276e-05, |
|
"loss": 0.0832, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 3.2524752475247523, |
|
"grad_norm": 0.2510031448101173, |
|
"learning_rate": 2.638416804801648e-05, |
|
"loss": 0.0789, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 3.2574257425742577, |
|
"grad_norm": 0.23230623243606166, |
|
"learning_rate": 2.6254261338892536e-05, |
|
"loss": 0.0685, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 3.262376237623762, |
|
"grad_norm": 0.23265962042829946, |
|
"learning_rate": 2.6124518817094418e-05, |
|
"loss": 0.0707, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 3.2673267326732676, |
|
"grad_norm": 0.2698161518899995, |
|
"learning_rate": 2.5994942032344376e-05, |
|
"loss": 0.0712, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.272277227722772, |
|
"grad_norm": 0.23919947212956175, |
|
"learning_rate": 2.5865532532385072e-05, |
|
"loss": 0.0735, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 3.2772277227722775, |
|
"grad_norm": 0.2338979262555169, |
|
"learning_rate": 2.573629186296097e-05, |
|
"loss": 0.079, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 3.282178217821782, |
|
"grad_norm": 0.24198283638959564, |
|
"learning_rate": 2.560722156779996e-05, |
|
"loss": 0.0652, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 3.287128712871287, |
|
"grad_norm": 0.2678959132988273, |
|
"learning_rate": 2.547832318859487e-05, |
|
"loss": 0.0806, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 3.292079207920792, |
|
"grad_norm": 0.24369334210701515, |
|
"learning_rate": 2.5349598264985028e-05, |
|
"loss": 0.0801, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 3.297029702970297, |
|
"grad_norm": 0.2516534708453375, |
|
"learning_rate": 2.5221048334537952e-05, |
|
"loss": 0.0729, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 3.301980198019802, |
|
"grad_norm": 0.26191397567125363, |
|
"learning_rate": 2.5092674932730886e-05, |
|
"loss": 0.0817, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 3.3069306930693068, |
|
"grad_norm": 0.2311417870965065, |
|
"learning_rate": 2.4964479592932574e-05, |
|
"loss": 0.0668, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 3.3118811881188117, |
|
"grad_norm": 0.22453138314695728, |
|
"learning_rate": 2.4836463846384832e-05, |
|
"loss": 0.0679, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 3.3168316831683167, |
|
"grad_norm": 0.25117156700585896, |
|
"learning_rate": 2.470862922218431e-05, |
|
"loss": 0.0726, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.3217821782178216, |
|
"grad_norm": 0.26705255539786693, |
|
"learning_rate": 2.4580977247264253e-05, |
|
"loss": 0.0757, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 3.3267326732673266, |
|
"grad_norm": 0.26514359221062156, |
|
"learning_rate": 2.4453509446376192e-05, |
|
"loss": 0.0798, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 3.3316831683168315, |
|
"grad_norm": 0.2671109496025883, |
|
"learning_rate": 2.432622734207182e-05, |
|
"loss": 0.078, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 3.3366336633663365, |
|
"grad_norm": 0.24536817866890306, |
|
"learning_rate": 2.4199132454684736e-05, |
|
"loss": 0.0729, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 3.3415841584158414, |
|
"grad_norm": 0.25557331335860106, |
|
"learning_rate": 2.40722263023123e-05, |
|
"loss": 0.0784, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.3465346534653464, |
|
"grad_norm": 0.2546554341285905, |
|
"learning_rate": 2.3945510400797485e-05, |
|
"loss": 0.0722, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 3.3514851485148514, |
|
"grad_norm": 0.22489537362915024, |
|
"learning_rate": 2.3818986263710886e-05, |
|
"loss": 0.0701, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 3.3564356435643563, |
|
"grad_norm": 0.23800748533645866, |
|
"learning_rate": 2.3692655402332455e-05, |
|
"loss": 0.0694, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 3.3613861386138613, |
|
"grad_norm": 0.21671549307844398, |
|
"learning_rate": 2.3566519325633567e-05, |
|
"loss": 0.0555, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 3.366336633663366, |
|
"grad_norm": 0.2583657442869358, |
|
"learning_rate": 2.3440579540259006e-05, |
|
"loss": 0.071, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.371287128712871, |
|
"grad_norm": 0.2573251426343712, |
|
"learning_rate": 2.3314837550508875e-05, |
|
"loss": 0.0698, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 3.376237623762376, |
|
"grad_norm": 0.2404906641580492, |
|
"learning_rate": 2.3189294858320768e-05, |
|
"loss": 0.0721, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 3.381188118811881, |
|
"grad_norm": 0.24817378991212172, |
|
"learning_rate": 2.3063952963251682e-05, |
|
"loss": 0.0665, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 3.386138613861386, |
|
"grad_norm": 0.278544909227608, |
|
"learning_rate": 2.2938813362460198e-05, |
|
"loss": 0.0796, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 3.391089108910891, |
|
"grad_norm": 0.23587919321962164, |
|
"learning_rate": 2.2813877550688553e-05, |
|
"loss": 0.0638, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.396039603960396, |
|
"grad_norm": 0.26054805369939354, |
|
"learning_rate": 2.2689147020244848e-05, |
|
"loss": 0.0829, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 3.400990099009901, |
|
"grad_norm": 0.2600430379659956, |
|
"learning_rate": 2.256462326098516e-05, |
|
"loss": 0.0751, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 3.405940594059406, |
|
"grad_norm": 0.251402994649795, |
|
"learning_rate": 2.2440307760295755e-05, |
|
"loss": 0.0768, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 3.410891089108911, |
|
"grad_norm": 0.2513808234935709, |
|
"learning_rate": 2.2316202003075347e-05, |
|
"loss": 0.0794, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 3.4158415841584158, |
|
"grad_norm": 0.2394850075046582, |
|
"learning_rate": 2.2192307471717324e-05, |
|
"loss": 0.0678, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.4207920792079207, |
|
"grad_norm": 0.2270966635982115, |
|
"learning_rate": 2.2068625646092103e-05, |
|
"loss": 0.0603, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 3.4257425742574257, |
|
"grad_norm": 0.230349969817333, |
|
"learning_rate": 2.194515800352942e-05, |
|
"loss": 0.0575, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 3.4306930693069306, |
|
"grad_norm": 0.2151090758971661, |
|
"learning_rate": 2.1821906018800643e-05, |
|
"loss": 0.0617, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 3.4356435643564356, |
|
"grad_norm": 0.26279226131219013, |
|
"learning_rate": 2.169887116410121e-05, |
|
"loss": 0.0684, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 3.4405940594059405, |
|
"grad_norm": 0.23698343268856573, |
|
"learning_rate": 2.1576054909033014e-05, |
|
"loss": 0.0673, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 3.4455445544554455, |
|
"grad_norm": 0.21782279464252616, |
|
"learning_rate": 2.1453458720586902e-05, |
|
"loss": 0.0596, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 3.4504950495049505, |
|
"grad_norm": 0.2413629900614204, |
|
"learning_rate": 2.13310840631251e-05, |
|
"loss": 0.066, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 3.4554455445544554, |
|
"grad_norm": 0.2590724577362002, |
|
"learning_rate": 2.1208932398363712e-05, |
|
"loss": 0.085, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 3.4603960396039604, |
|
"grad_norm": 0.23854018641540192, |
|
"learning_rate": 2.1087005185355292e-05, |
|
"loss": 0.0578, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 3.4653465346534653, |
|
"grad_norm": 0.2617124689921388, |
|
"learning_rate": 2.0965303880471405e-05, |
|
"loss": 0.0683, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.4702970297029703, |
|
"grad_norm": 0.25853795971879695, |
|
"learning_rate": 2.0843829937385255e-05, |
|
"loss": 0.0685, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 3.4752475247524752, |
|
"grad_norm": 0.21533646187557015, |
|
"learning_rate": 2.072258480705431e-05, |
|
"loss": 0.0612, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 3.48019801980198, |
|
"grad_norm": 0.2398974299806927, |
|
"learning_rate": 2.0601569937702913e-05, |
|
"loss": 0.0767, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 3.485148514851485, |
|
"grad_norm": 0.2825099006591305, |
|
"learning_rate": 2.048078677480507e-05, |
|
"loss": 0.0713, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 3.49009900990099, |
|
"grad_norm": 0.2374986457390262, |
|
"learning_rate": 2.0360236761067117e-05, |
|
"loss": 0.0614, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.495049504950495, |
|
"grad_norm": 0.24928209932734513, |
|
"learning_rate": 2.023992133641055e-05, |
|
"loss": 0.0715, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.24602624092887262, |
|
"learning_rate": 2.0119841937954794e-05, |
|
"loss": 0.0776, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 3.504950495049505, |
|
"grad_norm": 0.26151726202365333, |
|
"learning_rate": 2.0000000000000012e-05, |
|
"loss": 0.0834, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 3.50990099009901, |
|
"grad_norm": 0.22791863144384952, |
|
"learning_rate": 1.9880396954009976e-05, |
|
"loss": 0.0645, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 3.514851485148515, |
|
"grad_norm": 0.27852434371052076, |
|
"learning_rate": 1.976103422859506e-05, |
|
"loss": 0.0615, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.51980198019802, |
|
"grad_norm": 0.4658704014321942, |
|
"learning_rate": 1.9641913249495026e-05, |
|
"loss": 0.1114, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 3.5247524752475248, |
|
"grad_norm": 0.2388977822102273, |
|
"learning_rate": 1.9523035439562146e-05, |
|
"loss": 0.0636, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 3.5297029702970297, |
|
"grad_norm": 0.2431114346820817, |
|
"learning_rate": 1.9404402218744086e-05, |
|
"loss": 0.072, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 3.5346534653465347, |
|
"grad_norm": 0.2263409426285513, |
|
"learning_rate": 1.9286015004066984e-05, |
|
"loss": 0.0618, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 3.5396039603960396, |
|
"grad_norm": 0.25538479665632796, |
|
"learning_rate": 1.9167875209618592e-05, |
|
"loss": 0.0713, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.5445544554455446, |
|
"grad_norm": 0.26107712164158786, |
|
"learning_rate": 1.9049984246531255e-05, |
|
"loss": 0.0803, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 3.5495049504950495, |
|
"grad_norm": 0.25722416216963295, |
|
"learning_rate": 1.8932343522965205e-05, |
|
"loss": 0.0793, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 3.5544554455445545, |
|
"grad_norm": 0.2129150003741417, |
|
"learning_rate": 1.8814954444091595e-05, |
|
"loss": 0.0602, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 3.5594059405940595, |
|
"grad_norm": 0.2965098118005929, |
|
"learning_rate": 1.8697818412075794e-05, |
|
"loss": 0.0756, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 3.5643564356435644, |
|
"grad_norm": 0.24815189554867753, |
|
"learning_rate": 1.8580936826060685e-05, |
|
"loss": 0.072, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.5693069306930694, |
|
"grad_norm": 0.23537016156276444, |
|
"learning_rate": 1.846431108214981e-05, |
|
"loss": 0.0632, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 3.5742574257425743, |
|
"grad_norm": 0.25177412841655517, |
|
"learning_rate": 1.8347942573390865e-05, |
|
"loss": 0.0703, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 3.5792079207920793, |
|
"grad_norm": 0.2225737099451813, |
|
"learning_rate": 1.8231832689758903e-05, |
|
"loss": 0.0623, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 3.5841584158415842, |
|
"grad_norm": 0.27105752420141427, |
|
"learning_rate": 1.8115982818139862e-05, |
|
"loss": 0.0794, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 3.589108910891089, |
|
"grad_norm": 0.2817083321886372, |
|
"learning_rate": 1.80003943423139e-05, |
|
"loss": 0.0689, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.594059405940594, |
|
"grad_norm": 0.2285885257732442, |
|
"learning_rate": 1.7885068642938924e-05, |
|
"loss": 0.0567, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 3.599009900990099, |
|
"grad_norm": 0.24563843948778483, |
|
"learning_rate": 1.7770007097534062e-05, |
|
"loss": 0.0753, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 3.603960396039604, |
|
"grad_norm": 0.2302620192686851, |
|
"learning_rate": 1.7655211080463265e-05, |
|
"loss": 0.0615, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 3.608910891089109, |
|
"grad_norm": 0.2080881341242689, |
|
"learning_rate": 1.754068196291885e-05, |
|
"loss": 0.0505, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 3.613861386138614, |
|
"grad_norm": 0.2218095852880323, |
|
"learning_rate": 1.7426421112905095e-05, |
|
"loss": 0.0608, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.618811881188119, |
|
"grad_norm": 0.2310947814386969, |
|
"learning_rate": 1.731242989522195e-05, |
|
"loss": 0.0663, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 3.623762376237624, |
|
"grad_norm": 0.2545723904232327, |
|
"learning_rate": 1.7198709671448696e-05, |
|
"loss": 0.0759, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 3.628712871287129, |
|
"grad_norm": 0.2211042721278219, |
|
"learning_rate": 1.7085261799927738e-05, |
|
"loss": 0.0583, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 3.633663366336634, |
|
"grad_norm": 0.2509514455369346, |
|
"learning_rate": 1.697208763574833e-05, |
|
"loss": 0.0782, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 3.6386138613861387, |
|
"grad_norm": 0.22667085651403643, |
|
"learning_rate": 1.6859188530730387e-05, |
|
"loss": 0.0664, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 3.6435643564356437, |
|
"grad_norm": 0.22712863713521322, |
|
"learning_rate": 1.6746565833408352e-05, |
|
"loss": 0.0623, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 3.6485148514851486, |
|
"grad_norm": 0.22499251373801385, |
|
"learning_rate": 1.6634220889015087e-05, |
|
"loss": 0.0644, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 3.6534653465346536, |
|
"grad_norm": 0.20110354191475177, |
|
"learning_rate": 1.652215503946583e-05, |
|
"loss": 0.0565, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 3.6584158415841586, |
|
"grad_norm": 0.22315963225154622, |
|
"learning_rate": 1.6410369623342144e-05, |
|
"loss": 0.0661, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 3.6633663366336635, |
|
"grad_norm": 0.22793408178521282, |
|
"learning_rate": 1.6298865975875903e-05, |
|
"loss": 0.0625, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.6683168316831685, |
|
"grad_norm": 0.23014768345985898, |
|
"learning_rate": 1.6187645428933372e-05, |
|
"loss": 0.0674, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 3.6732673267326734, |
|
"grad_norm": 0.2253800846374502, |
|
"learning_rate": 1.607670931099929e-05, |
|
"loss": 0.0639, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 3.6782178217821784, |
|
"grad_norm": 0.24360567586454138, |
|
"learning_rate": 1.5966058947161035e-05, |
|
"loss": 0.07, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 3.6831683168316833, |
|
"grad_norm": 0.21918551794623176, |
|
"learning_rate": 1.5855695659092746e-05, |
|
"loss": 0.0618, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 3.6881188118811883, |
|
"grad_norm": 0.23923527333760677, |
|
"learning_rate": 1.5745620765039564e-05, |
|
"loss": 0.0716, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 3.693069306930693, |
|
"grad_norm": 0.2475994058430268, |
|
"learning_rate": 1.563583557980186e-05, |
|
"loss": 0.0737, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 3.698019801980198, |
|
"grad_norm": 0.23662288045909508, |
|
"learning_rate": 1.5526341414719565e-05, |
|
"loss": 0.069, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 3.7029702970297027, |
|
"grad_norm": 0.23536533447903255, |
|
"learning_rate": 1.541713957765649e-05, |
|
"loss": 0.0634, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 3.707920792079208, |
|
"grad_norm": 0.23553261160197025, |
|
"learning_rate": 1.5308231372984723e-05, |
|
"loss": 0.074, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 3.7128712871287126, |
|
"grad_norm": 0.22659254762767775, |
|
"learning_rate": 1.5199618101569003e-05, |
|
"loss": 0.059, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.717821782178218, |
|
"grad_norm": 0.22831907929393108, |
|
"learning_rate": 1.5091301060751207e-05, |
|
"loss": 0.0642, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 3.7227722772277225, |
|
"grad_norm": 0.22033603995618226, |
|
"learning_rate": 1.4983281544334896e-05, |
|
"loss": 0.0644, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 3.727722772277228, |
|
"grad_norm": 0.22136073260599423, |
|
"learning_rate": 1.4875560842569767e-05, |
|
"loss": 0.0659, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 3.7326732673267324, |
|
"grad_norm": 0.2152678182143572, |
|
"learning_rate": 1.4768140242136353e-05, |
|
"loss": 0.0634, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 3.737623762376238, |
|
"grad_norm": 0.21396497300394157, |
|
"learning_rate": 1.4661021026130553e-05, |
|
"loss": 0.0528, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 3.7425742574257423, |
|
"grad_norm": 0.2114883903251707, |
|
"learning_rate": 1.4554204474048357e-05, |
|
"loss": 0.0561, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 3.7475247524752477, |
|
"grad_norm": 0.2117441225032521, |
|
"learning_rate": 1.4447691861770591e-05, |
|
"loss": 0.0628, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 3.7524752475247523, |
|
"grad_norm": 0.2323094346863045, |
|
"learning_rate": 1.4341484461547585e-05, |
|
"loss": 0.0655, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 3.7574257425742577, |
|
"grad_norm": 0.22902723203887707, |
|
"learning_rate": 1.4235583541984092e-05, |
|
"loss": 0.0713, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 3.762376237623762, |
|
"grad_norm": 0.2968751987123677, |
|
"learning_rate": 1.412999036802404e-05, |
|
"loss": 0.0703, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.7673267326732676, |
|
"grad_norm": 0.22977435481121689, |
|
"learning_rate": 1.4024706200935452e-05, |
|
"loss": 0.072, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 3.772277227722772, |
|
"grad_norm": 0.23461646531105834, |
|
"learning_rate": 1.3919732298295431e-05, |
|
"loss": 0.0607, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 3.7772277227722775, |
|
"grad_norm": 0.21663372881824905, |
|
"learning_rate": 1.3815069913975045e-05, |
|
"loss": 0.0614, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 3.782178217821782, |
|
"grad_norm": 0.23095070576662677, |
|
"learning_rate": 1.3710720298124454e-05, |
|
"loss": 0.0676, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 3.7871287128712874, |
|
"grad_norm": 0.2305740011310324, |
|
"learning_rate": 1.3606684697157876e-05, |
|
"loss": 0.0679, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 3.792079207920792, |
|
"grad_norm": 0.217941058133159, |
|
"learning_rate": 1.350296435373876e-05, |
|
"loss": 0.0617, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 3.7970297029702973, |
|
"grad_norm": 0.23172469767040707, |
|
"learning_rate": 1.3399560506764959e-05, |
|
"loss": 0.0711, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 3.801980198019802, |
|
"grad_norm": 0.2569635240774621, |
|
"learning_rate": 1.3296474391353854e-05, |
|
"loss": 0.0813, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 3.806930693069307, |
|
"grad_norm": 0.3217073053960941, |
|
"learning_rate": 1.3193707238827714e-05, |
|
"loss": 0.0751, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 3.8118811881188117, |
|
"grad_norm": 0.2229682377437594, |
|
"learning_rate": 1.3091260276698847e-05, |
|
"loss": 0.06, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.8168316831683167, |
|
"grad_norm": 0.5823009036335607, |
|
"learning_rate": 1.2989134728655097e-05, |
|
"loss": 0.1029, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 3.8217821782178216, |
|
"grad_norm": 0.2196594147367011, |
|
"learning_rate": 1.288733181454508e-05, |
|
"loss": 0.0603, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 3.8267326732673266, |
|
"grad_norm": 0.23552052956281094, |
|
"learning_rate": 1.2785852750363716e-05, |
|
"loss": 0.0656, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 3.8316831683168315, |
|
"grad_norm": 0.2241024828767437, |
|
"learning_rate": 1.2684698748237633e-05, |
|
"loss": 0.0632, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 3.8366336633663365, |
|
"grad_norm": 0.24561519046281696, |
|
"learning_rate": 1.2583871016410764e-05, |
|
"loss": 0.0741, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.8415841584158414, |
|
"grad_norm": 0.22027356130191578, |
|
"learning_rate": 1.2483370759229874e-05, |
|
"loss": 0.0602, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 3.8465346534653464, |
|
"grad_norm": 0.21085250976290754, |
|
"learning_rate": 1.2383199177130135e-05, |
|
"loss": 0.0547, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 3.8514851485148514, |
|
"grad_norm": 0.20808078642947644, |
|
"learning_rate": 1.228335746662086e-05, |
|
"loss": 0.0563, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 3.8564356435643563, |
|
"grad_norm": 0.2198172661766001, |
|
"learning_rate": 1.2183846820271147e-05, |
|
"loss": 0.0632, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 3.8613861386138613, |
|
"grad_norm": 0.22050042718742202, |
|
"learning_rate": 1.2084668426695712e-05, |
|
"loss": 0.0608, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.866336633663366, |
|
"grad_norm": 0.22382400683228384, |
|
"learning_rate": 1.198582347054062e-05, |
|
"loss": 0.0622, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 3.871287128712871, |
|
"grad_norm": 0.221034670098815, |
|
"learning_rate": 1.1887313132469154e-05, |
|
"loss": 0.0686, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 3.876237623762376, |
|
"grad_norm": 0.2178480835513662, |
|
"learning_rate": 1.178913858914772e-05, |
|
"loss": 0.0656, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 3.881188118811881, |
|
"grad_norm": 0.23113866160558683, |
|
"learning_rate": 1.1691301013231788e-05, |
|
"loss": 0.0716, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 3.886138613861386, |
|
"grad_norm": 0.24146387892581606, |
|
"learning_rate": 1.1593801573351908e-05, |
|
"loss": 0.076, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 3.891089108910891, |
|
"grad_norm": 0.279931142015236, |
|
"learning_rate": 1.1496641434099725e-05, |
|
"loss": 0.0795, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 3.896039603960396, |
|
"grad_norm": 0.22282099068800784, |
|
"learning_rate": 1.1399821756014058e-05, |
|
"loss": 0.0647, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 3.900990099009901, |
|
"grad_norm": 0.22759349001620766, |
|
"learning_rate": 1.1303343695567066e-05, |
|
"loss": 0.0709, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 3.905940594059406, |
|
"grad_norm": 0.21740738469087242, |
|
"learning_rate": 1.1207208405150397e-05, |
|
"loss": 0.0582, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 3.910891089108911, |
|
"grad_norm": 0.23489806348416206, |
|
"learning_rate": 1.1111417033061498e-05, |
|
"loss": 0.0749, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.9158415841584158, |
|
"grad_norm": 0.21888448469819882, |
|
"learning_rate": 1.1015970723489828e-05, |
|
"loss": 0.0631, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 3.9207920792079207, |
|
"grad_norm": 0.20856864084290314, |
|
"learning_rate": 1.0920870616503194e-05, |
|
"loss": 0.0534, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 3.9257425742574257, |
|
"grad_norm": 0.2949758911019223, |
|
"learning_rate": 1.082611784803417e-05, |
|
"loss": 0.0637, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 3.9306930693069306, |
|
"grad_norm": 0.22197761683092598, |
|
"learning_rate": 1.0731713549866494e-05, |
|
"loss": 0.0679, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 3.9356435643564356, |
|
"grad_norm": 0.46875314512876026, |
|
"learning_rate": 1.0637658849621593e-05, |
|
"loss": 0.0832, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.9405940594059405, |
|
"grad_norm": 0.2293499010106679, |
|
"learning_rate": 1.0543954870745088e-05, |
|
"loss": 0.0668, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 3.9455445544554455, |
|
"grad_norm": 0.2032480907519156, |
|
"learning_rate": 1.0450602732493337e-05, |
|
"loss": 0.0567, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 3.9504950495049505, |
|
"grad_norm": 0.21842971126240499, |
|
"learning_rate": 1.0357603549920129e-05, |
|
"loss": 0.0572, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 3.9554455445544554, |
|
"grad_norm": 0.22221759651106548, |
|
"learning_rate": 1.0264958433863353e-05, |
|
"loss": 0.0577, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 3.9603960396039604, |
|
"grad_norm": 0.23181702880658006, |
|
"learning_rate": 1.0172668490931673e-05, |
|
"loss": 0.0698, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.9653465346534653, |
|
"grad_norm": 0.20252785677004106, |
|
"learning_rate": 1.0080734823491402e-05, |
|
"loss": 0.0573, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 3.9702970297029703, |
|
"grad_norm": 0.22486180252186383, |
|
"learning_rate": 9.989158529653257e-06, |
|
"loss": 0.0687, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 3.9752475247524752, |
|
"grad_norm": 0.2276919286466179, |
|
"learning_rate": 9.897940703259264e-06, |
|
"loss": 0.0595, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 3.98019801980198, |
|
"grad_norm": 0.24721806064919266, |
|
"learning_rate": 9.807082433869727e-06, |
|
"loss": 0.0684, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 3.985148514851485, |
|
"grad_norm": 0.20707237969343867, |
|
"learning_rate": 9.716584806750151e-06, |
|
"loss": 0.0495, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 3.99009900990099, |
|
"grad_norm": 0.2636796510612892, |
|
"learning_rate": 9.626448902858359e-06, |
|
"loss": 0.0895, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 3.995049504950495, |
|
"grad_norm": 0.20717337091681312, |
|
"learning_rate": 9.536675798831499e-06, |
|
"loss": 0.0635, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.2307254007299234, |
|
"learning_rate": 9.447266566973211e-06, |
|
"loss": 0.0602, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 4.0049504950495045, |
|
"grad_norm": 0.12624509852601035, |
|
"learning_rate": 9.358222275240884e-06, |
|
"loss": 0.0145, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 4.00990099009901, |
|
"grad_norm": 0.16507407293813872, |
|
"learning_rate": 9.26954398723278e-06, |
|
"loss": 0.0177, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 4.014851485148514, |
|
"grad_norm": 0.1292728036609501, |
|
"learning_rate": 9.181232762175435e-06, |
|
"loss": 0.0165, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 4.01980198019802, |
|
"grad_norm": 0.1080103691046681, |
|
"learning_rate": 9.093289654910946e-06, |
|
"loss": 0.0107, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 4.024752475247524, |
|
"grad_norm": 0.12607608036523452, |
|
"learning_rate": 9.005715715884409e-06, |
|
"loss": 0.0151, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 4.02970297029703, |
|
"grad_norm": 0.1304597478576506, |
|
"learning_rate": 8.918511991131335e-06, |
|
"loss": 0.0156, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 4.034653465346534, |
|
"grad_norm": 0.11336223348572144, |
|
"learning_rate": 8.831679522265167e-06, |
|
"loss": 0.0137, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 4.03960396039604, |
|
"grad_norm": 0.10738022397375838, |
|
"learning_rate": 8.745219346464884e-06, |
|
"loss": 0.0112, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 4.044554455445544, |
|
"grad_norm": 0.10565152711917576, |
|
"learning_rate": 8.659132496462521e-06, |
|
"loss": 0.0122, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 4.0495049504950495, |
|
"grad_norm": 0.10963944208831582, |
|
"learning_rate": 8.57342000053095e-06, |
|
"loss": 0.0132, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 4.054455445544554, |
|
"grad_norm": 0.11269131050613809, |
|
"learning_rate": 8.488082882471476e-06, |
|
"loss": 0.0119, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 4.0594059405940595, |
|
"grad_norm": 0.12401055933444247, |
|
"learning_rate": 8.403122161601699e-06, |
|
"loss": 0.0109, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 4.064356435643564, |
|
"grad_norm": 0.13083619123039256, |
|
"learning_rate": 8.318538852743275e-06, |
|
"loss": 0.0136, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 4.069306930693069, |
|
"grad_norm": 0.11500742537186857, |
|
"learning_rate": 8.23433396620986e-06, |
|
"loss": 0.0117, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 4.074257425742574, |
|
"grad_norm": 0.12961481225832067, |
|
"learning_rate": 8.150508507795005e-06, |
|
"loss": 0.0137, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 4.079207920792079, |
|
"grad_norm": 0.12055499601120245, |
|
"learning_rate": 8.067063478760127e-06, |
|
"loss": 0.0107, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 4.084158415841584, |
|
"grad_norm": 0.15932784631531496, |
|
"learning_rate": 7.983999875822563e-06, |
|
"loss": 0.0156, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 4.089108910891089, |
|
"grad_norm": 0.11910916796855137, |
|
"learning_rate": 7.901318691143678e-06, |
|
"loss": 0.0131, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 4.094059405940594, |
|
"grad_norm": 0.13705672850707565, |
|
"learning_rate": 7.819020912317011e-06, |
|
"loss": 0.0132, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 4.099009900990099, |
|
"grad_norm": 0.13359626054483767, |
|
"learning_rate": 7.73710752235647e-06, |
|
"loss": 0.0135, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 4.103960396039604, |
|
"grad_norm": 0.14419301245049312, |
|
"learning_rate": 7.65557949968459e-06, |
|
"loss": 0.0133, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 4.108910891089109, |
|
"grad_norm": 0.12203061347827363, |
|
"learning_rate": 7.574437818120839e-06, |
|
"loss": 0.01, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 4.1138613861386135, |
|
"grad_norm": 0.1185999849541631, |
|
"learning_rate": 7.4936834468699945e-06, |
|
"loss": 0.01, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 4.118811881188119, |
|
"grad_norm": 0.1420252022264759, |
|
"learning_rate": 7.413317350510589e-06, |
|
"loss": 0.0111, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 4.123762376237623, |
|
"grad_norm": 0.12871738126470406, |
|
"learning_rate": 7.333340488983363e-06, |
|
"loss": 0.0119, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 4.128712871287129, |
|
"grad_norm": 0.09804785229838152, |
|
"learning_rate": 7.253753817579792e-06, |
|
"loss": 0.0062, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 4.133663366336633, |
|
"grad_norm": 0.1500135953230511, |
|
"learning_rate": 7.174558286930682e-06, |
|
"loss": 0.0123, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 4.138613861386139, |
|
"grad_norm": 0.13688770404452, |
|
"learning_rate": 7.095754842994824e-06, |
|
"loss": 0.0145, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 4.143564356435643, |
|
"grad_norm": 0.13128710417128772, |
|
"learning_rate": 7.0173444270477075e-06, |
|
"loss": 0.0134, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 4.148514851485149, |
|
"grad_norm": 0.12492637806478936, |
|
"learning_rate": 6.939327975670256e-06, |
|
"loss": 0.0087, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 4.153465346534653, |
|
"grad_norm": 0.13106643311603597, |
|
"learning_rate": 6.861706420737628e-06, |
|
"loss": 0.0097, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 4.158415841584159, |
|
"grad_norm": 0.1103294994245557, |
|
"learning_rate": 6.784480689408099e-06, |
|
"loss": 0.0075, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.163366336633663, |
|
"grad_norm": 0.14473315697134212, |
|
"learning_rate": 6.707651704112028e-06, |
|
"loss": 0.0139, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 4.1683168316831685, |
|
"grad_norm": 0.1166093187701074, |
|
"learning_rate": 6.631220382540755e-06, |
|
"loss": 0.0102, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 4.173267326732673, |
|
"grad_norm": 0.10511608560955692, |
|
"learning_rate": 6.555187637635727e-06, |
|
"loss": 0.0079, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 4.178217821782178, |
|
"grad_norm": 0.11453792840078524, |
|
"learning_rate": 6.479554377577528e-06, |
|
"loss": 0.0098, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 4.183168316831683, |
|
"grad_norm": 0.12632446631314592, |
|
"learning_rate": 6.404321505775053e-06, |
|
"loss": 0.0114, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 4.188118811881188, |
|
"grad_norm": 0.13650562617278209, |
|
"learning_rate": 6.329489920854745e-06, |
|
"loss": 0.0127, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 4.193069306930693, |
|
"grad_norm": 0.12800825431726343, |
|
"learning_rate": 6.255060516649809e-06, |
|
"loss": 0.0121, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 4.198019801980198, |
|
"grad_norm": 0.12058629163517937, |
|
"learning_rate": 6.181034182189592e-06, |
|
"loss": 0.0104, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 4.202970297029703, |
|
"grad_norm": 0.1351110698360575, |
|
"learning_rate": 6.107411801688905e-06, |
|
"loss": 0.0138, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 4.207920792079208, |
|
"grad_norm": 0.12674244407405744, |
|
"learning_rate": 6.034194254537502e-06, |
|
"loss": 0.0087, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 4.212871287128713, |
|
"grad_norm": 0.15735576903425058, |
|
"learning_rate": 5.9613824152895765e-06, |
|
"loss": 0.0112, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 4.217821782178218, |
|
"grad_norm": 0.11717446858945191, |
|
"learning_rate": 5.8889771536532855e-06, |
|
"loss": 0.0104, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 4.2227722772277225, |
|
"grad_norm": 0.13212471056877662, |
|
"learning_rate": 5.8169793344804085e-06, |
|
"loss": 0.0153, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 4.227722772277228, |
|
"grad_norm": 0.13863455505878444, |
|
"learning_rate": 5.7453898177559505e-06, |
|
"loss": 0.0144, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 4.232673267326732, |
|
"grad_norm": 0.11440426947284903, |
|
"learning_rate": 5.674209458587929e-06, |
|
"loss": 0.0107, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 4.237623762376238, |
|
"grad_norm": 0.13893105515641654, |
|
"learning_rate": 5.603439107197149e-06, |
|
"loss": 0.0113, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 4.242574257425742, |
|
"grad_norm": 0.09755126038986718, |
|
"learning_rate": 5.5330796089070064e-06, |
|
"loss": 0.0086, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 4.247524752475248, |
|
"grad_norm": 0.1012621696476356, |
|
"learning_rate": 5.463131804133461e-06, |
|
"loss": 0.0064, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 4.252475247524752, |
|
"grad_norm": 0.11007678351244354, |
|
"learning_rate": 5.393596528374923e-06, |
|
"loss": 0.0092, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 4.257425742574258, |
|
"grad_norm": 0.09888652684365734, |
|
"learning_rate": 5.324474612202335e-06, |
|
"loss": 0.0069, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 4.262376237623762, |
|
"grad_norm": 0.12286862253049159, |
|
"learning_rate": 5.255766881249212e-06, |
|
"loss": 0.01, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 4.267326732673268, |
|
"grad_norm": 0.10142434831826229, |
|
"learning_rate": 5.187474156201786e-06, |
|
"loss": 0.0088, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 4.272277227722772, |
|
"grad_norm": 0.10702805040029474, |
|
"learning_rate": 5.119597252789237e-06, |
|
"loss": 0.0086, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 4.2772277227722775, |
|
"grad_norm": 0.11943904582684445, |
|
"learning_rate": 5.052136981773892e-06, |
|
"loss": 0.0094, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 4.282178217821782, |
|
"grad_norm": 0.1028717674245101, |
|
"learning_rate": 4.9850941489415985e-06, |
|
"loss": 0.0093, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 4.287128712871287, |
|
"grad_norm": 0.11112579540598307, |
|
"learning_rate": 4.918469555092049e-06, |
|
"loss": 0.011, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 4.292079207920792, |
|
"grad_norm": 0.10114297312691053, |
|
"learning_rate": 4.852263996029259e-06, |
|
"loss": 0.0092, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 4.297029702970297, |
|
"grad_norm": 0.097220137032453, |
|
"learning_rate": 4.786478262552012e-06, |
|
"loss": 0.0099, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 4.301980198019802, |
|
"grad_norm": 0.10180171384782043, |
|
"learning_rate": 4.7211131404444825e-06, |
|
"loss": 0.0083, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 4.306930693069307, |
|
"grad_norm": 0.09810672490833108, |
|
"learning_rate": 4.656169410466795e-06, |
|
"loss": 0.0092, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 4.311881188118812, |
|
"grad_norm": 0.10340416632521776, |
|
"learning_rate": 4.591647848345711e-06, |
|
"loss": 0.0083, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 4.316831683168317, |
|
"grad_norm": 0.09960308482221131, |
|
"learning_rate": 4.527549224765362e-06, |
|
"loss": 0.0104, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 4.321782178217822, |
|
"grad_norm": 0.11986640194637614, |
|
"learning_rate": 4.463874305358045e-06, |
|
"loss": 0.0098, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 4.326732673267327, |
|
"grad_norm": 0.1310677721590091, |
|
"learning_rate": 4.400623850695103e-06, |
|
"loss": 0.0112, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 4.3316831683168315, |
|
"grad_norm": 0.11364550376478703, |
|
"learning_rate": 4.337798616277806e-06, |
|
"loss": 0.0098, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 4.336633663366337, |
|
"grad_norm": 0.10732139674973952, |
|
"learning_rate": 4.275399352528342e-06, |
|
"loss": 0.0096, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 4.341584158415841, |
|
"grad_norm": 0.11726008537633273, |
|
"learning_rate": 4.213426804780838e-06, |
|
"loss": 0.0099, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 4.346534653465347, |
|
"grad_norm": 0.10648615788140584, |
|
"learning_rate": 4.151881713272472e-06, |
|
"loss": 0.0081, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 4.351485148514851, |
|
"grad_norm": 0.14887235960616102, |
|
"learning_rate": 4.090764813134644e-06, |
|
"loss": 0.0141, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 4.356435643564357, |
|
"grad_norm": 0.11352930689713439, |
|
"learning_rate": 4.0300768343841805e-06, |
|
"loss": 0.0109, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 4.361386138613861, |
|
"grad_norm": 0.1588437384116596, |
|
"learning_rate": 3.969818501914597e-06, |
|
"loss": 0.0101, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 4.366336633663367, |
|
"grad_norm": 0.12435479835383088, |
|
"learning_rate": 3.909990535487472e-06, |
|
"loss": 0.0102, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 4.371287128712871, |
|
"grad_norm": 0.09983693257790452, |
|
"learning_rate": 3.850593649723804e-06, |
|
"loss": 0.0089, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 4.376237623762377, |
|
"grad_norm": 0.1000375497881339, |
|
"learning_rate": 3.7916285540955566e-06, |
|
"loss": 0.0093, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 4.381188118811881, |
|
"grad_norm": 0.10545145468420694, |
|
"learning_rate": 3.733095952917101e-06, |
|
"loss": 0.0093, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 4.3861386138613865, |
|
"grad_norm": 0.12375538991124814, |
|
"learning_rate": 3.6749965453368375e-06, |
|
"loss": 0.0129, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 4.391089108910891, |
|
"grad_norm": 0.11154078992980383, |
|
"learning_rate": 3.617331025328845e-06, |
|
"loss": 0.0109, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 4.396039603960396, |
|
"grad_norm": 0.12282743124663045, |
|
"learning_rate": 3.5601000816846053e-06, |
|
"loss": 0.0117, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 4.400990099009901, |
|
"grad_norm": 0.09350300949252467, |
|
"learning_rate": 3.50330439800473e-06, |
|
"loss": 0.0061, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 4.405940594059406, |
|
"grad_norm": 0.12085328848569887, |
|
"learning_rate": 3.4469446526908555e-06, |
|
"loss": 0.0117, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 4.410891089108911, |
|
"grad_norm": 0.10985567703832641, |
|
"learning_rate": 3.3910215189374916e-06, |
|
"loss": 0.0095, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 4.415841584158416, |
|
"grad_norm": 0.1315796533550786, |
|
"learning_rate": 3.3355356647239987e-06, |
|
"loss": 0.0113, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 4.420792079207921, |
|
"grad_norm": 0.10069554863342839, |
|
"learning_rate": 3.2804877528066225e-06, |
|
"loss": 0.0092, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 4.425742574257426, |
|
"grad_norm": 0.10433578338492921, |
|
"learning_rate": 3.225878440710544e-06, |
|
"loss": 0.0082, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 4.430693069306931, |
|
"grad_norm": 0.12234230112930741, |
|
"learning_rate": 3.171708380722072e-06, |
|
"loss": 0.0106, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 4.435643564356436, |
|
"grad_norm": 0.11967807029664655, |
|
"learning_rate": 3.1179782198807973e-06, |
|
"loss": 0.0109, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 4.4405940594059405, |
|
"grad_norm": 0.12548981747614735, |
|
"learning_rate": 3.064688599971901e-06, |
|
"loss": 0.0104, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 4.445544554455446, |
|
"grad_norm": 0.1410817044638762, |
|
"learning_rate": 3.011840157518493e-06, |
|
"loss": 0.0133, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 4.4504950495049505, |
|
"grad_norm": 0.10380510689456897, |
|
"learning_rate": 2.9594335237739778e-06, |
|
"loss": 0.0082, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 4.455445544554456, |
|
"grad_norm": 0.11959316610225865, |
|
"learning_rate": 2.9074693247145513e-06, |
|
"loss": 0.0088, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.46039603960396, |
|
"grad_norm": 0.1603922624135695, |
|
"learning_rate": 2.85594818103168e-06, |
|
"loss": 0.0153, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 4.465346534653466, |
|
"grad_norm": 0.10661183180213293, |
|
"learning_rate": 2.804870708124745e-06, |
|
"loss": 0.0098, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 4.47029702970297, |
|
"grad_norm": 0.1293618057772319, |
|
"learning_rate": 2.754237516093623e-06, |
|
"loss": 0.0108, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 4.475247524752476, |
|
"grad_norm": 0.17181592274571184, |
|
"learning_rate": 2.7040492097314498e-06, |
|
"loss": 0.012, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 4.48019801980198, |
|
"grad_norm": 0.1289378035263025, |
|
"learning_rate": 2.6543063885173936e-06, |
|
"loss": 0.0128, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 4.485148514851485, |
|
"grad_norm": 0.09592619922978449, |
|
"learning_rate": 2.605009646609453e-06, |
|
"loss": 0.0079, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 4.49009900990099, |
|
"grad_norm": 0.10097183688451279, |
|
"learning_rate": 2.556159572837422e-06, |
|
"loss": 0.0091, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 4.4950495049504955, |
|
"grad_norm": 0.11735122065163751, |
|
"learning_rate": 2.5077567506957977e-06, |
|
"loss": 0.0086, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.10580716713405154, |
|
"learning_rate": 2.459801758336835e-06, |
|
"loss": 0.0099, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 4.5049504950495045, |
|
"grad_norm": 0.097587996056804, |
|
"learning_rate": 2.4122951685636674e-06, |
|
"loss": 0.0072, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 4.50990099009901, |
|
"grad_norm": 0.10194905262661175, |
|
"learning_rate": 2.3652375488234114e-06, |
|
"loss": 0.0088, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 4.514851485148515, |
|
"grad_norm": 0.11494423366378297, |
|
"learning_rate": 2.3186294612004365e-06, |
|
"loss": 0.0089, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 4.51980198019802, |
|
"grad_norm": 0.10539132079737316, |
|
"learning_rate": 2.272471462409622e-06, |
|
"loss": 0.008, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 4.524752475247524, |
|
"grad_norm": 0.13942440167736256, |
|
"learning_rate": 2.226764103789716e-06, |
|
"loss": 0.0107, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 4.52970297029703, |
|
"grad_norm": 0.133810524249386, |
|
"learning_rate": 2.181507931296749e-06, |
|
"loss": 0.0102, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 4.534653465346535, |
|
"grad_norm": 0.10558494444651857, |
|
"learning_rate": 2.136703485497531e-06, |
|
"loss": 0.0085, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 4.53960396039604, |
|
"grad_norm": 0.10838629888708354, |
|
"learning_rate": 2.0923513015631646e-06, |
|
"loss": 0.0092, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 4.544554455445544, |
|
"grad_norm": 0.11934575207962728, |
|
"learning_rate": 2.0484519092626652e-06, |
|
"loss": 0.0102, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 4.5495049504950495, |
|
"grad_norm": 0.11420310289601006, |
|
"learning_rate": 2.0050058329566367e-06, |
|
"loss": 0.0109, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 4.554455445544555, |
|
"grad_norm": 0.1288165937047373, |
|
"learning_rate": 1.9620135915909968e-06, |
|
"loss": 0.012, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.5594059405940595, |
|
"grad_norm": 0.12492928381215326, |
|
"learning_rate": 1.9194756986908025e-06, |
|
"loss": 0.012, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 4.564356435643564, |
|
"grad_norm": 0.1562283004243408, |
|
"learning_rate": 1.8773926623541028e-06, |
|
"loss": 0.013, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 4.569306930693069, |
|
"grad_norm": 0.11262911274840161, |
|
"learning_rate": 1.835764985245856e-06, |
|
"loss": 0.0106, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 4.574257425742574, |
|
"grad_norm": 0.10018479525655571, |
|
"learning_rate": 1.7945931645919358e-06, |
|
"loss": 0.0085, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 4.579207920792079, |
|
"grad_norm": 0.12831476700417802, |
|
"learning_rate": 1.7538776921731937e-06, |
|
"loss": 0.0127, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 4.584158415841584, |
|
"grad_norm": 0.10899789389203231, |
|
"learning_rate": 1.713619054319593e-06, |
|
"loss": 0.0102, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 4.589108910891089, |
|
"grad_norm": 0.12180796544381997, |
|
"learning_rate": 1.6738177319044036e-06, |
|
"loss": 0.0103, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 4.594059405940594, |
|
"grad_norm": 0.1304139455450118, |
|
"learning_rate": 1.6344742003384161e-06, |
|
"loss": 0.0117, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 4.599009900990099, |
|
"grad_norm": 0.09331975478034672, |
|
"learning_rate": 1.5955889295643111e-06, |
|
"loss": 0.0075, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 4.603960396039604, |
|
"grad_norm": 0.12135625400911418, |
|
"learning_rate": 1.5571623840510185e-06, |
|
"loss": 0.0091, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 4.608910891089109, |
|
"grad_norm": 0.13446080469374425, |
|
"learning_rate": 1.519195022788198e-06, |
|
"loss": 0.0127, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 4.6138613861386135, |
|
"grad_norm": 0.11265755081466036, |
|
"learning_rate": 1.481687299280723e-06, |
|
"loss": 0.011, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 4.618811881188119, |
|
"grad_norm": 0.1012739526325802, |
|
"learning_rate": 1.4446396615432855e-06, |
|
"loss": 0.0069, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 4.623762376237623, |
|
"grad_norm": 0.09877519513950658, |
|
"learning_rate": 1.4080525520950184e-06, |
|
"loss": 0.0085, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 4.628712871287129, |
|
"grad_norm": 0.10904407230005678, |
|
"learning_rate": 1.3719264079542628e-06, |
|
"loss": 0.0105, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 4.633663366336633, |
|
"grad_norm": 0.09084607268010596, |
|
"learning_rate": 1.33626166063328e-06, |
|
"loss": 0.0064, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 4.638613861386139, |
|
"grad_norm": 0.10777246325097957, |
|
"learning_rate": 1.3010587361331673e-06, |
|
"loss": 0.0094, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 4.643564356435643, |
|
"grad_norm": 0.1210887592148449, |
|
"learning_rate": 1.2663180549387e-06, |
|
"loss": 0.0125, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 4.648514851485149, |
|
"grad_norm": 0.11166984220404819, |
|
"learning_rate": 1.2320400320133551e-06, |
|
"loss": 0.0079, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 4.653465346534653, |
|
"grad_norm": 0.12155380749860617, |
|
"learning_rate": 1.1982250767943593e-06, |
|
"loss": 0.0121, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.658415841584159, |
|
"grad_norm": 0.09874444900173716, |
|
"learning_rate": 1.1648735931877543e-06, |
|
"loss": 0.0075, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 4.663366336633663, |
|
"grad_norm": 0.12797079584414423, |
|
"learning_rate": 1.131985979563619e-06, |
|
"loss": 0.0084, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 4.6683168316831685, |
|
"grad_norm": 0.12083572862418303, |
|
"learning_rate": 1.0995626287512828e-06, |
|
"loss": 0.0101, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 4.673267326732673, |
|
"grad_norm": 0.11920811138204533, |
|
"learning_rate": 1.0676039280346439e-06, |
|
"loss": 0.0088, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 4.678217821782178, |
|
"grad_norm": 0.120811785785541, |
|
"learning_rate": 1.036110259147547e-06, |
|
"loss": 0.0103, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 4.683168316831683, |
|
"grad_norm": 0.09314579688215502, |
|
"learning_rate": 1.0050819982692083e-06, |
|
"loss": 0.0087, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 4.688118811881188, |
|
"grad_norm": 0.11433215178887399, |
|
"learning_rate": 9.745195160197452e-07, |
|
"loss": 0.008, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 4.693069306930693, |
|
"grad_norm": 0.11421090935566788, |
|
"learning_rate": 9.444231774557199e-07, |
|
"loss": 0.0098, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 4.698019801980198, |
|
"grad_norm": 0.13351330986822083, |
|
"learning_rate": 9.147933420658117e-07, |
|
"loss": 0.0126, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 4.702970297029703, |
|
"grad_norm": 0.12319551924430684, |
|
"learning_rate": 8.856303637664987e-07, |
|
"loss": 0.0112, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.707920792079208, |
|
"grad_norm": 0.11647737457481774, |
|
"learning_rate": 8.569345908978355e-07, |
|
"loss": 0.0089, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 4.712871287128713, |
|
"grad_norm": 0.08526467868390036, |
|
"learning_rate": 8.287063662193095e-07, |
|
"loss": 0.0065, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 4.717821782178218, |
|
"grad_norm": 0.10819662235450227, |
|
"learning_rate": 8.009460269057156e-07, |
|
"loss": 0.0093, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 4.7227722772277225, |
|
"grad_norm": 0.10795810812921569, |
|
"learning_rate": 7.736539045431634e-07, |
|
"loss": 0.0083, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 4.727722772277228, |
|
"grad_norm": 0.13790878328791822, |
|
"learning_rate": 7.468303251250764e-07, |
|
"loss": 0.0117, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 4.732673267326732, |
|
"grad_norm": 0.12340124149908364, |
|
"learning_rate": 7.204756090483411e-07, |
|
"loss": 0.0131, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 4.737623762376238, |
|
"grad_norm": 0.11659198110649899, |
|
"learning_rate": 6.945900711094534e-07, |
|
"loss": 0.0091, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 4.742574257425742, |
|
"grad_norm": 0.1303246749548437, |
|
"learning_rate": 6.691740205007602e-07, |
|
"loss": 0.0105, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 4.747524752475248, |
|
"grad_norm": 0.12015398011009239, |
|
"learning_rate": 6.442277608067838e-07, |
|
"loss": 0.0091, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 4.752475247524752, |
|
"grad_norm": 0.11438450355082716, |
|
"learning_rate": 6.197515900005613e-07, |
|
"loss": 0.0086, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.757425742574258, |
|
"grad_norm": 0.10035792670493265, |
|
"learning_rate": 5.957458004401328e-07, |
|
"loss": 0.009, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 4.762376237623762, |
|
"grad_norm": 0.12945304146742145, |
|
"learning_rate": 5.722106788649928e-07, |
|
"loss": 0.0086, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 4.767326732673268, |
|
"grad_norm": 0.1313966501532762, |
|
"learning_rate": 5.491465063927282e-07, |
|
"loss": 0.016, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 4.772277227722772, |
|
"grad_norm": 0.09991929616458903, |
|
"learning_rate": 5.265535585156079e-07, |
|
"loss": 0.0077, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 4.7772277227722775, |
|
"grad_norm": 0.1272495031762234, |
|
"learning_rate": 5.044321050973189e-07, |
|
"loss": 0.0105, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 4.782178217821782, |
|
"grad_norm": 0.11503642147592588, |
|
"learning_rate": 4.827824103697332e-07, |
|
"loss": 0.0092, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 4.787128712871287, |
|
"grad_norm": 0.12710578688405658, |
|
"learning_rate": 4.616047329297546e-07, |
|
"loss": 0.0125, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 4.792079207920792, |
|
"grad_norm": 0.11374035594717975, |
|
"learning_rate": 4.408993257362282e-07, |
|
"loss": 0.0104, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 4.797029702970297, |
|
"grad_norm": 0.08858002009294087, |
|
"learning_rate": 4.206664361069379e-07, |
|
"loss": 0.0076, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 4.801980198019802, |
|
"grad_norm": 0.11134716407300935, |
|
"learning_rate": 4.0090630571560927e-07, |
|
"loss": 0.0115, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.806930693069307, |
|
"grad_norm": 0.12525044878492964, |
|
"learning_rate": 3.8161917058906706e-07, |
|
"loss": 0.012, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 4.811881188118812, |
|
"grad_norm": 0.10392944064091289, |
|
"learning_rate": 3.628052611043842e-07, |
|
"loss": 0.01, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 4.816831683168317, |
|
"grad_norm": 0.10164873644042355, |
|
"learning_rate": 3.444648019861552e-07, |
|
"loss": 0.0076, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 4.821782178217822, |
|
"grad_norm": 0.10039228066726238, |
|
"learning_rate": 3.265980123038004e-07, |
|
"loss": 0.0078, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 4.826732673267327, |
|
"grad_norm": 0.14843542740351492, |
|
"learning_rate": 3.0920510546894156e-07, |
|
"loss": 0.0135, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 4.8316831683168315, |
|
"grad_norm": 0.10601558319793576, |
|
"learning_rate": 2.9228628923285705e-07, |
|
"loss": 0.0095, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 4.836633663366337, |
|
"grad_norm": 0.1244248427623547, |
|
"learning_rate": 2.7584176568401734e-07, |
|
"loss": 0.0115, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 4.841584158415841, |
|
"grad_norm": 0.13885166923612954, |
|
"learning_rate": 2.5987173124564224e-07, |
|
"loss": 0.0124, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 4.846534653465347, |
|
"grad_norm": 0.11860523889858633, |
|
"learning_rate": 2.4437637667338754e-07, |
|
"loss": 0.0106, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 4.851485148514851, |
|
"grad_norm": 0.11637752351297623, |
|
"learning_rate": 2.2935588705302658e-07, |
|
"loss": 0.0087, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.856435643564357, |
|
"grad_norm": 0.0970118081249502, |
|
"learning_rate": 2.148104417982788e-07, |
|
"loss": 0.0069, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 4.861386138613861, |
|
"grad_norm": 0.10050712126829177, |
|
"learning_rate": 2.0074021464864702e-07, |
|
"loss": 0.0084, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 4.866336633663367, |
|
"grad_norm": 0.09418273967101161, |
|
"learning_rate": 1.871453736673301e-07, |
|
"loss": 0.0069, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 4.871287128712871, |
|
"grad_norm": 0.09839098778033135, |
|
"learning_rate": 1.740260812392558e-07, |
|
"loss": 0.0075, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 4.876237623762377, |
|
"grad_norm": 0.11625610481951301, |
|
"learning_rate": 1.6138249406909558e-07, |
|
"loss": 0.0097, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 4.881188118811881, |
|
"grad_norm": 0.10209939301826869, |
|
"learning_rate": 1.4921476317941719e-07, |
|
"loss": 0.0068, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 4.8861386138613865, |
|
"grad_norm": 0.10859072250356432, |
|
"learning_rate": 1.3752303390887733e-07, |
|
"loss": 0.0094, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 4.891089108910891, |
|
"grad_norm": 0.10996490648405818, |
|
"learning_rate": 1.2630744591048516e-07, |
|
"loss": 0.0083, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 4.896039603960396, |
|
"grad_norm": 0.11600550545963038, |
|
"learning_rate": 1.1556813314993698e-07, |
|
"loss": 0.0101, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 4.900990099009901, |
|
"grad_norm": 0.12735173508972217, |
|
"learning_rate": 1.0530522390400422e-07, |
|
"loss": 0.0099, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.905940594059406, |
|
"grad_norm": 0.12475506745382753, |
|
"learning_rate": 9.551884075901463e-08, |
|
"loss": 0.0111, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 4.910891089108911, |
|
"grad_norm": 0.09638370643324946, |
|
"learning_rate": 8.620910060938681e-08, |
|
"loss": 0.008, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 4.915841584158416, |
|
"grad_norm": 0.11118069759156585, |
|
"learning_rate": 7.737611465622686e-08, |
|
"loss": 0.0084, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 4.920792079207921, |
|
"grad_norm": 0.09421039333272324, |
|
"learning_rate": 6.901998840600055e-08, |
|
"loss": 0.008, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 4.925742574257426, |
|
"grad_norm": 0.08785979683980723, |
|
"learning_rate": 6.11408216692766e-08, |
|
"loss": 0.0077, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 4.930693069306931, |
|
"grad_norm": 0.11723033297350285, |
|
"learning_rate": 5.373870855954089e-08, |
|
"loss": 0.0111, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 4.935643564356436, |
|
"grad_norm": 0.09947063813063768, |
|
"learning_rate": 4.681373749205964e-08, |
|
"loss": 0.0095, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 4.9405940594059405, |
|
"grad_norm": 0.134222756516452, |
|
"learning_rate": 4.036599118282691e-08, |
|
"loss": 0.016, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 4.945544554455445, |
|
"grad_norm": 0.12883240275435368, |
|
"learning_rate": 3.439554664758316e-08, |
|
"loss": 0.0104, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 4.9504950495049505, |
|
"grad_norm": 0.12150651151338142, |
|
"learning_rate": 2.890247520089151e-08, |
|
"loss": 0.0093, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.955445544554456, |
|
"grad_norm": 0.10677161131156719, |
|
"learning_rate": 2.3886842455285166e-08, |
|
"loss": 0.0101, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 4.96039603960396, |
|
"grad_norm": 0.1394432398003884, |
|
"learning_rate": 1.934870832047686e-08, |
|
"loss": 0.0108, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 4.965346534653465, |
|
"grad_norm": 0.10205391962906493, |
|
"learning_rate": 1.528812700266169e-08, |
|
"loss": 0.0071, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 4.97029702970297, |
|
"grad_norm": 0.14245472257343503, |
|
"learning_rate": 1.1705147003842065e-08, |
|
"loss": 0.0125, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 4.975247524752476, |
|
"grad_norm": 0.12217355514292874, |
|
"learning_rate": 8.59981112128594e-09, |
|
"loss": 0.011, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 4.98019801980198, |
|
"grad_norm": 0.11487251605220974, |
|
"learning_rate": 5.972156446980571e-09, |
|
"loss": 0.0104, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 4.985148514851485, |
|
"grad_norm": 0.11454656357310349, |
|
"learning_rate": 3.822214367197319e-09, |
|
"loss": 0.0099, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 4.99009900990099, |
|
"grad_norm": 0.11034869889608963, |
|
"learning_rate": 2.150010562140814e-09, |
|
"loss": 0.0077, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 4.9950495049504955, |
|
"grad_norm": 0.11755320903989616, |
|
"learning_rate": 9.555650056070065e-10, |
|
"loss": 0.0111, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.12274589377449001, |
|
"learning_rate": 2.3889196477000497e-10, |
|
"loss": 0.0082, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 1010, |
|
"total_flos": 889001858826240.0, |
|
"train_loss": 0.25745383046733417, |
|
"train_runtime": 97110.6799, |
|
"train_samples_per_second": 0.083, |
|
"train_steps_per_second": 0.01 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1010, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 889001858826240.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|