{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 404, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024752475247524753, "grad_norm": 0.35921207070350647, "learning_rate": 1e-05, "loss": 1.8027, "step": 1 }, { "epoch": 0.0049504950495049506, "grad_norm": 0.39715632796287537, "learning_rate": 9.975247524752477e-06, "loss": 1.9376, "step": 2 }, { "epoch": 0.007425742574257425, "grad_norm": 0.4083423614501953, "learning_rate": 9.950495049504951e-06, "loss": 1.9841, "step": 3 }, { "epoch": 0.009900990099009901, "grad_norm": 0.40308135747909546, "learning_rate": 9.925742574257427e-06, "loss": 1.9444, "step": 4 }, { "epoch": 0.012376237623762377, "grad_norm": 0.370197057723999, "learning_rate": 9.900990099009901e-06, "loss": 1.7989, "step": 5 }, { "epoch": 0.01485148514851485, "grad_norm": 0.3891890347003937, "learning_rate": 9.876237623762377e-06, "loss": 1.8703, "step": 6 }, { "epoch": 0.017326732673267328, "grad_norm": 0.39329928159713745, "learning_rate": 9.851485148514852e-06, "loss": 1.9423, "step": 7 }, { "epoch": 0.019801980198019802, "grad_norm": 0.38817495107650757, "learning_rate": 9.826732673267328e-06, "loss": 1.8705, "step": 8 }, { "epoch": 0.022277227722772276, "grad_norm": 0.3646544814109802, "learning_rate": 9.801980198019802e-06, "loss": 1.7702, "step": 9 }, { "epoch": 0.024752475247524754, "grad_norm": 0.3860485851764679, "learning_rate": 9.777227722772278e-06, "loss": 1.863, "step": 10 }, { "epoch": 0.027227722772277228, "grad_norm": 0.34650540351867676, "learning_rate": 9.752475247524754e-06, "loss": 1.7751, "step": 11 }, { "epoch": 0.0297029702970297, "grad_norm": 0.3583658039569855, "learning_rate": 9.727722772277228e-06, "loss": 1.8129, "step": 12 }, { "epoch": 0.03217821782178218, "grad_norm": 0.3476351797580719, "learning_rate": 9.702970297029704e-06, "loss": 1.8005, "step": 13 }, { "epoch": 0.034653465346534656, "grad_norm": 0.34600403904914856, "learning_rate": 9.678217821782178e-06, "loss": 1.8254, "step": 14 }, { "epoch": 0.03712871287128713, "grad_norm": 0.3049982786178589, "learning_rate": 9.653465346534654e-06, "loss": 1.6756, "step": 15 }, { "epoch": 0.039603960396039604, "grad_norm": 0.3400605618953705, "learning_rate": 9.628712871287129e-06, "loss": 1.8066, "step": 16 }, { "epoch": 0.04207920792079208, "grad_norm": 0.29243192076683044, "learning_rate": 9.603960396039604e-06, "loss": 1.7034, "step": 17 }, { "epoch": 0.04455445544554455, "grad_norm": 0.3139927089214325, "learning_rate": 9.579207920792079e-06, "loss": 1.7889, "step": 18 }, { "epoch": 0.04702970297029703, "grad_norm": 0.29722219705581665, "learning_rate": 9.554455445544555e-06, "loss": 1.7024, "step": 19 }, { "epoch": 0.04950495049504951, "grad_norm": 0.29992449283599854, "learning_rate": 9.52970297029703e-06, "loss": 1.7444, "step": 20 }, { "epoch": 0.05198019801980198, "grad_norm": 0.3000761568546295, "learning_rate": 9.504950495049505e-06, "loss": 1.6878, "step": 21 }, { "epoch": 0.054455445544554455, "grad_norm": 0.2957460284233093, "learning_rate": 9.480198019801981e-06, "loss": 1.7008, "step": 22 }, { "epoch": 0.05693069306930693, "grad_norm": 0.2921546995639801, "learning_rate": 9.455445544554455e-06, "loss": 1.749, "step": 23 }, { "epoch": 0.0594059405940594, "grad_norm": 0.27504685521125793, "learning_rate": 9.430693069306931e-06, "loss": 1.6845, "step": 24 }, { "epoch": 0.06188118811881188, "grad_norm": 0.2481079399585724, "learning_rate": 9.405940594059405e-06, "loss": 1.5942, "step": 25 }, { "epoch": 0.06435643564356436, "grad_norm": 0.2514517903327942, "learning_rate": 9.381188118811881e-06, "loss": 1.6142, "step": 26 }, { "epoch": 0.06683168316831684, "grad_norm": 0.23834620416164398, "learning_rate": 9.356435643564357e-06, "loss": 1.5875, "step": 27 }, { "epoch": 0.06930693069306931, "grad_norm": 0.246104896068573, "learning_rate": 9.331683168316833e-06, "loss": 1.6119, "step": 28 }, { "epoch": 0.07178217821782178, "grad_norm": 0.21701523661613464, "learning_rate": 9.306930693069308e-06, "loss": 1.531, "step": 29 }, { "epoch": 0.07425742574257425, "grad_norm": 0.26559269428253174, "learning_rate": 9.282178217821784e-06, "loss": 1.6806, "step": 30 }, { "epoch": 0.07673267326732673, "grad_norm": 0.23772238194942474, "learning_rate": 9.257425742574258e-06, "loss": 1.5922, "step": 31 }, { "epoch": 0.07920792079207921, "grad_norm": 0.23874586820602417, "learning_rate": 9.232673267326734e-06, "loss": 1.567, "step": 32 }, { "epoch": 0.08168316831683169, "grad_norm": 0.2239362597465515, "learning_rate": 9.20792079207921e-06, "loss": 1.5437, "step": 33 }, { "epoch": 0.08415841584158416, "grad_norm": 0.21884024143218994, "learning_rate": 9.183168316831684e-06, "loss": 1.485, "step": 34 }, { "epoch": 0.08663366336633663, "grad_norm": 0.22681422531604767, "learning_rate": 9.15841584158416e-06, "loss": 1.5176, "step": 35 }, { "epoch": 0.0891089108910891, "grad_norm": 0.23015110194683075, "learning_rate": 9.133663366336634e-06, "loss": 1.5312, "step": 36 }, { "epoch": 0.09158415841584158, "grad_norm": 0.21569128334522247, "learning_rate": 9.10891089108911e-06, "loss": 1.4879, "step": 37 }, { "epoch": 0.09405940594059406, "grad_norm": 0.247847318649292, "learning_rate": 9.084158415841585e-06, "loss": 1.5873, "step": 38 }, { "epoch": 0.09653465346534654, "grad_norm": 0.20600910484790802, "learning_rate": 9.05940594059406e-06, "loss": 1.476, "step": 39 }, { "epoch": 0.09900990099009901, "grad_norm": 0.23611964285373688, "learning_rate": 9.034653465346535e-06, "loss": 1.5556, "step": 40 }, { "epoch": 0.10148514851485149, "grad_norm": 0.22053782641887665, "learning_rate": 9.009900990099011e-06, "loss": 1.5271, "step": 41 }, { "epoch": 0.10396039603960396, "grad_norm": 0.21321026980876923, "learning_rate": 8.985148514851487e-06, "loss": 1.47, "step": 42 }, { "epoch": 0.10643564356435643, "grad_norm": 0.20851102471351624, "learning_rate": 8.960396039603961e-06, "loss": 1.4747, "step": 43 }, { "epoch": 0.10891089108910891, "grad_norm": 0.20043537020683289, "learning_rate": 8.935643564356437e-06, "loss": 1.4561, "step": 44 }, { "epoch": 0.11138613861386139, "grad_norm": 0.2246498167514801, "learning_rate": 8.910891089108911e-06, "loss": 1.5265, "step": 45 }, { "epoch": 0.11386138613861387, "grad_norm": 0.19988679885864258, "learning_rate": 8.886138613861387e-06, "loss": 1.4764, "step": 46 }, { "epoch": 0.11633663366336634, "grad_norm": 0.1900709569454193, "learning_rate": 8.861386138613862e-06, "loss": 1.3958, "step": 47 }, { "epoch": 0.1188118811881188, "grad_norm": 0.18632960319519043, "learning_rate": 8.836633663366338e-06, "loss": 1.3834, "step": 48 }, { "epoch": 0.12128712871287128, "grad_norm": 0.18428552150726318, "learning_rate": 8.811881188118812e-06, "loss": 1.3753, "step": 49 }, { "epoch": 0.12376237623762376, "grad_norm": 0.2110532969236374, "learning_rate": 8.787128712871288e-06, "loss": 1.3957, "step": 50 }, { "epoch": 0.12623762376237624, "grad_norm": 0.17746232450008392, "learning_rate": 8.762376237623764e-06, "loss": 1.3436, "step": 51 }, { "epoch": 0.12871287128712872, "grad_norm": 0.16532298922538757, "learning_rate": 8.737623762376238e-06, "loss": 1.3575, "step": 52 }, { "epoch": 0.1311881188118812, "grad_norm": 0.1659340113401413, "learning_rate": 8.712871287128714e-06, "loss": 1.3793, "step": 53 }, { "epoch": 0.13366336633663367, "grad_norm": 0.16310887038707733, "learning_rate": 8.688118811881188e-06, "loss": 1.3364, "step": 54 }, { "epoch": 0.13613861386138615, "grad_norm": 0.17784424126148224, "learning_rate": 8.663366336633664e-06, "loss": 1.3824, "step": 55 }, { "epoch": 0.13861386138613863, "grad_norm": 0.1612381488084793, "learning_rate": 8.638613861386139e-06, "loss": 1.3519, "step": 56 }, { "epoch": 0.14108910891089108, "grad_norm": 0.16395767033100128, "learning_rate": 8.613861386138615e-06, "loss": 1.3209, "step": 57 }, { "epoch": 0.14356435643564355, "grad_norm": 0.15274788439273834, "learning_rate": 8.58910891089109e-06, "loss": 1.3076, "step": 58 }, { "epoch": 0.14603960396039603, "grad_norm": 0.26918065547943115, "learning_rate": 8.564356435643565e-06, "loss": 1.3272, "step": 59 }, { "epoch": 0.1485148514851485, "grad_norm": 0.1536840796470642, "learning_rate": 8.53960396039604e-06, "loss": 1.2732, "step": 60 }, { "epoch": 0.15099009900990099, "grad_norm": 0.15451501309871674, "learning_rate": 8.514851485148515e-06, "loss": 1.3142, "step": 61 }, { "epoch": 0.15346534653465346, "grad_norm": 0.15294960141181946, "learning_rate": 8.490099009900991e-06, "loss": 1.3307, "step": 62 }, { "epoch": 0.15594059405940594, "grad_norm": 0.1928381323814392, "learning_rate": 8.465346534653465e-06, "loss": 1.311, "step": 63 }, { "epoch": 0.15841584158415842, "grad_norm": 0.15054042637348175, "learning_rate": 8.440594059405941e-06, "loss": 1.3191, "step": 64 }, { "epoch": 0.1608910891089109, "grad_norm": 0.13962224125862122, "learning_rate": 8.415841584158416e-06, "loss": 1.2381, "step": 65 }, { "epoch": 0.16336633663366337, "grad_norm": 0.1442529261112213, "learning_rate": 8.391089108910891e-06, "loss": 1.2732, "step": 66 }, { "epoch": 0.16584158415841585, "grad_norm": 0.14938265085220337, "learning_rate": 8.366336633663367e-06, "loss": 1.2835, "step": 67 }, { "epoch": 0.16831683168316833, "grad_norm": 0.17844296991825104, "learning_rate": 8.341584158415842e-06, "loss": 1.3027, "step": 68 }, { "epoch": 0.1707920792079208, "grad_norm": 0.14419397711753845, "learning_rate": 8.316831683168318e-06, "loss": 1.3032, "step": 69 }, { "epoch": 0.17326732673267325, "grad_norm": 0.16270500421524048, "learning_rate": 8.292079207920792e-06, "loss": 1.2799, "step": 70 }, { "epoch": 0.17574257425742573, "grad_norm": 0.1458730697631836, "learning_rate": 8.267326732673268e-06, "loss": 1.2522, "step": 71 }, { "epoch": 0.1782178217821782, "grad_norm": 0.14367038011550903, "learning_rate": 8.242574257425742e-06, "loss": 1.2369, "step": 72 }, { "epoch": 0.1806930693069307, "grad_norm": 0.1553303301334381, "learning_rate": 8.217821782178218e-06, "loss": 1.3366, "step": 73 }, { "epoch": 0.18316831683168316, "grad_norm": 0.14889273047447205, "learning_rate": 8.193069306930692e-06, "loss": 1.3084, "step": 74 }, { "epoch": 0.18564356435643564, "grad_norm": 0.14996132254600525, "learning_rate": 8.168316831683168e-06, "loss": 1.2643, "step": 75 }, { "epoch": 0.18811881188118812, "grad_norm": 0.14123128354549408, "learning_rate": 8.143564356435644e-06, "loss": 1.2621, "step": 76 }, { "epoch": 0.1905940594059406, "grad_norm": 0.1484861671924591, "learning_rate": 8.11881188118812e-06, "loss": 1.3082, "step": 77 }, { "epoch": 0.19306930693069307, "grad_norm": 0.1556919664144516, "learning_rate": 8.094059405940595e-06, "loss": 1.2785, "step": 78 }, { "epoch": 0.19554455445544555, "grad_norm": 0.1459795981645584, "learning_rate": 8.06930693069307e-06, "loss": 1.1944, "step": 79 }, { "epoch": 0.19801980198019803, "grad_norm": 0.13556146621704102, "learning_rate": 8.044554455445545e-06, "loss": 1.2475, "step": 80 }, { "epoch": 0.2004950495049505, "grad_norm": 0.14718809723854065, "learning_rate": 8.019801980198021e-06, "loss": 1.2353, "step": 81 }, { "epoch": 0.20297029702970298, "grad_norm": 0.14329920709133148, "learning_rate": 7.995049504950497e-06, "loss": 1.252, "step": 82 }, { "epoch": 0.20544554455445543, "grad_norm": 0.14620834589004517, "learning_rate": 7.970297029702971e-06, "loss": 1.2394, "step": 83 }, { "epoch": 0.2079207920792079, "grad_norm": 0.14883743226528168, "learning_rate": 7.945544554455447e-06, "loss": 1.2461, "step": 84 }, { "epoch": 0.2103960396039604, "grad_norm": 0.14748890697956085, "learning_rate": 7.920792079207921e-06, "loss": 1.2049, "step": 85 }, { "epoch": 0.21287128712871287, "grad_norm": 0.1422479897737503, "learning_rate": 7.896039603960397e-06, "loss": 1.2621, "step": 86 }, { "epoch": 0.21534653465346534, "grad_norm": 0.1533108800649643, "learning_rate": 7.871287128712872e-06, "loss": 1.2211, "step": 87 }, { "epoch": 0.21782178217821782, "grad_norm": 0.14151449501514435, "learning_rate": 7.846534653465348e-06, "loss": 1.2094, "step": 88 }, { "epoch": 0.2202970297029703, "grad_norm": 0.14519892632961273, "learning_rate": 7.821782178217822e-06, "loss": 1.2665, "step": 89 }, { "epoch": 0.22277227722772278, "grad_norm": 0.15295307338237762, "learning_rate": 7.797029702970298e-06, "loss": 1.2438, "step": 90 }, { "epoch": 0.22524752475247525, "grad_norm": 0.14052022993564606, "learning_rate": 7.772277227722774e-06, "loss": 1.2013, "step": 91 }, { "epoch": 0.22772277227722773, "grad_norm": 0.208185613155365, "learning_rate": 7.747524752475248e-06, "loss": 1.2212, "step": 92 }, { "epoch": 0.2301980198019802, "grad_norm": 0.14823101460933685, "learning_rate": 7.722772277227724e-06, "loss": 1.1976, "step": 93 }, { "epoch": 0.23267326732673269, "grad_norm": 0.22633716464042664, "learning_rate": 7.698019801980198e-06, "loss": 1.2397, "step": 94 }, { "epoch": 0.23514851485148514, "grad_norm": 0.15597891807556152, "learning_rate": 7.673267326732674e-06, "loss": 1.2253, "step": 95 }, { "epoch": 0.2376237623762376, "grad_norm": 0.14791743457317352, "learning_rate": 7.648514851485149e-06, "loss": 1.1459, "step": 96 }, { "epoch": 0.2400990099009901, "grad_norm": 0.14784985780715942, "learning_rate": 7.6237623762376246e-06, "loss": 1.2072, "step": 97 }, { "epoch": 0.24257425742574257, "grad_norm": 0.1492685228586197, "learning_rate": 7.5990099009901e-06, "loss": 1.2038, "step": 98 }, { "epoch": 0.24504950495049505, "grad_norm": 0.14586341381072998, "learning_rate": 7.574257425742575e-06, "loss": 1.1988, "step": 99 }, { "epoch": 0.24752475247524752, "grad_norm": 0.15133561193943024, "learning_rate": 7.54950495049505e-06, "loss": 1.217, "step": 100 }, { "epoch": 0.25, "grad_norm": 0.14834076166152954, "learning_rate": 7.524752475247525e-06, "loss": 1.1654, "step": 101 }, { "epoch": 0.2524752475247525, "grad_norm": 0.1549738198518753, "learning_rate": 7.500000000000001e-06, "loss": 1.2182, "step": 102 }, { "epoch": 0.25495049504950495, "grad_norm": 0.14767996966838837, "learning_rate": 7.475247524752476e-06, "loss": 1.1842, "step": 103 }, { "epoch": 0.25742574257425743, "grad_norm": 0.16162672638893127, "learning_rate": 7.450495049504951e-06, "loss": 1.1801, "step": 104 }, { "epoch": 0.2599009900990099, "grad_norm": 0.1689019352197647, "learning_rate": 7.425742574257426e-06, "loss": 1.1873, "step": 105 }, { "epoch": 0.2623762376237624, "grad_norm": 0.18430471420288086, "learning_rate": 7.4009900990099015e-06, "loss": 1.2109, "step": 106 }, { "epoch": 0.26485148514851486, "grad_norm": 0.15747009217739105, "learning_rate": 7.376237623762377e-06, "loss": 1.196, "step": 107 }, { "epoch": 0.26732673267326734, "grad_norm": 0.16308680176734924, "learning_rate": 7.351485148514852e-06, "loss": 1.1703, "step": 108 }, { "epoch": 0.2698019801980198, "grad_norm": 0.1602238267660141, "learning_rate": 7.326732673267327e-06, "loss": 1.2002, "step": 109 }, { "epoch": 0.2722772277227723, "grad_norm": 0.15777747333049774, "learning_rate": 7.301980198019802e-06, "loss": 1.1728, "step": 110 }, { "epoch": 0.2747524752475248, "grad_norm": 0.1543913632631302, "learning_rate": 7.277227722772278e-06, "loss": 1.1419, "step": 111 }, { "epoch": 0.27722772277227725, "grad_norm": 0.1703396588563919, "learning_rate": 7.252475247524753e-06, "loss": 1.2023, "step": 112 }, { "epoch": 0.27970297029702973, "grad_norm": 0.16434207558631897, "learning_rate": 7.227722772277228e-06, "loss": 1.2137, "step": 113 }, { "epoch": 0.28217821782178215, "grad_norm": 0.17134983837604523, "learning_rate": 7.202970297029703e-06, "loss": 1.2105, "step": 114 }, { "epoch": 0.28465346534653463, "grad_norm": 0.19009283185005188, "learning_rate": 7.1782178217821785e-06, "loss": 1.1768, "step": 115 }, { "epoch": 0.2871287128712871, "grad_norm": 0.16358834505081177, "learning_rate": 7.153465346534654e-06, "loss": 1.1668, "step": 116 }, { "epoch": 0.2896039603960396, "grad_norm": 0.2351948767900467, "learning_rate": 7.128712871287129e-06, "loss": 1.1706, "step": 117 }, { "epoch": 0.29207920792079206, "grad_norm": 0.18060773611068726, "learning_rate": 7.103960396039604e-06, "loss": 1.1659, "step": 118 }, { "epoch": 0.29455445544554454, "grad_norm": 0.19292417168617249, "learning_rate": 7.079207920792079e-06, "loss": 1.1685, "step": 119 }, { "epoch": 0.297029702970297, "grad_norm": 0.18404839932918549, "learning_rate": 7.054455445544555e-06, "loss": 1.133, "step": 120 }, { "epoch": 0.2995049504950495, "grad_norm": 0.17319351434707642, "learning_rate": 7.02970297029703e-06, "loss": 1.1565, "step": 121 }, { "epoch": 0.30198019801980197, "grad_norm": 0.17463432252407074, "learning_rate": 7.004950495049505e-06, "loss": 1.1614, "step": 122 }, { "epoch": 0.30445544554455445, "grad_norm": 0.17117349803447723, "learning_rate": 6.98019801980198e-06, "loss": 1.1595, "step": 123 }, { "epoch": 0.3069306930693069, "grad_norm": 0.17456741631031036, "learning_rate": 6.9554455445544555e-06, "loss": 1.1249, "step": 124 }, { "epoch": 0.3094059405940594, "grad_norm": 0.2406443953514099, "learning_rate": 6.930693069306931e-06, "loss": 1.1676, "step": 125 }, { "epoch": 0.3118811881188119, "grad_norm": 0.16312997043132782, "learning_rate": 6.905940594059406e-06, "loss": 1.1581, "step": 126 }, { "epoch": 0.31435643564356436, "grad_norm": 0.17027762532234192, "learning_rate": 6.881188118811881e-06, "loss": 1.1484, "step": 127 }, { "epoch": 0.31683168316831684, "grad_norm": 0.17338208854198456, "learning_rate": 6.856435643564358e-06, "loss": 1.1782, "step": 128 }, { "epoch": 0.3193069306930693, "grad_norm": 0.16886372864246368, "learning_rate": 6.831683168316833e-06, "loss": 1.1233, "step": 129 }, { "epoch": 0.3217821782178218, "grad_norm": 0.1658564656972885, "learning_rate": 6.806930693069308e-06, "loss": 1.1433, "step": 130 }, { "epoch": 0.32425742574257427, "grad_norm": 0.2258491814136505, "learning_rate": 6.782178217821783e-06, "loss": 1.1599, "step": 131 }, { "epoch": 0.32673267326732675, "grad_norm": 0.16513760387897491, "learning_rate": 6.757425742574258e-06, "loss": 1.1104, "step": 132 }, { "epoch": 0.3292079207920792, "grad_norm": 0.16575312614440918, "learning_rate": 6.732673267326733e-06, "loss": 1.1428, "step": 133 }, { "epoch": 0.3316831683168317, "grad_norm": 0.17043238878250122, "learning_rate": 6.707920792079209e-06, "loss": 1.1618, "step": 134 }, { "epoch": 0.3341584158415842, "grad_norm": 0.22997120022773743, "learning_rate": 6.683168316831684e-06, "loss": 1.1464, "step": 135 }, { "epoch": 0.33663366336633666, "grad_norm": 0.16962502896785736, "learning_rate": 6.6584158415841595e-06, "loss": 1.134, "step": 136 }, { "epoch": 0.33910891089108913, "grad_norm": 0.17967693507671356, "learning_rate": 6.633663366336635e-06, "loss": 1.1329, "step": 137 }, { "epoch": 0.3415841584158416, "grad_norm": 0.17692621052265167, "learning_rate": 6.60891089108911e-06, "loss": 1.1281, "step": 138 }, { "epoch": 0.34405940594059403, "grad_norm": 0.19165442883968353, "learning_rate": 6.584158415841585e-06, "loss": 1.1643, "step": 139 }, { "epoch": 0.3465346534653465, "grad_norm": 0.1611143797636032, "learning_rate": 6.55940594059406e-06, "loss": 1.099, "step": 140 }, { "epoch": 0.349009900990099, "grad_norm": 0.170062854886055, "learning_rate": 6.534653465346535e-06, "loss": 1.1317, "step": 141 }, { "epoch": 0.35148514851485146, "grad_norm": 0.1633332520723343, "learning_rate": 6.509900990099011e-06, "loss": 1.1481, "step": 142 }, { "epoch": 0.35396039603960394, "grad_norm": 0.16619764268398285, "learning_rate": 6.485148514851486e-06, "loss": 1.1216, "step": 143 }, { "epoch": 0.3564356435643564, "grad_norm": 0.16496872901916504, "learning_rate": 6.460396039603961e-06, "loss": 1.1103, "step": 144 }, { "epoch": 0.3589108910891089, "grad_norm": 0.16046729683876038, "learning_rate": 6.4356435643564364e-06, "loss": 1.1273, "step": 145 }, { "epoch": 0.3613861386138614, "grad_norm": 0.15890191495418549, "learning_rate": 6.4108910891089116e-06, "loss": 1.0927, "step": 146 }, { "epoch": 0.36386138613861385, "grad_norm": 0.15759097039699554, "learning_rate": 6.386138613861387e-06, "loss": 1.1565, "step": 147 }, { "epoch": 0.36633663366336633, "grad_norm": 0.21185743808746338, "learning_rate": 6.361386138613862e-06, "loss": 1.0925, "step": 148 }, { "epoch": 0.3688118811881188, "grad_norm": 0.15979894995689392, "learning_rate": 6.336633663366337e-06, "loss": 1.1229, "step": 149 }, { "epoch": 0.3712871287128713, "grad_norm": 0.16039994359016418, "learning_rate": 6.311881188118812e-06, "loss": 1.1225, "step": 150 }, { "epoch": 0.37376237623762376, "grad_norm": 0.16320356726646423, "learning_rate": 6.287128712871288e-06, "loss": 1.1513, "step": 151 }, { "epoch": 0.37623762376237624, "grad_norm": 0.1686553657054901, "learning_rate": 6.262376237623763e-06, "loss": 1.1172, "step": 152 }, { "epoch": 0.3787128712871287, "grad_norm": 0.164052814245224, "learning_rate": 6.237623762376238e-06, "loss": 1.1046, "step": 153 }, { "epoch": 0.3811881188118812, "grad_norm": 0.15217240154743195, "learning_rate": 6.212871287128713e-06, "loss": 1.1111, "step": 154 }, { "epoch": 0.38366336633663367, "grad_norm": 0.1576368808746338, "learning_rate": 6.1881188118811885e-06, "loss": 1.0526, "step": 155 }, { "epoch": 0.38613861386138615, "grad_norm": 0.1566316783428192, "learning_rate": 6.163366336633664e-06, "loss": 1.1092, "step": 156 }, { "epoch": 0.3886138613861386, "grad_norm": 0.1590651422739029, "learning_rate": 6.138613861386139e-06, "loss": 1.0949, "step": 157 }, { "epoch": 0.3910891089108911, "grad_norm": 0.15860077738761902, "learning_rate": 6.113861386138614e-06, "loss": 1.0919, "step": 158 }, { "epoch": 0.3935643564356436, "grad_norm": 0.1543784886598587, "learning_rate": 6.08910891089109e-06, "loss": 1.1081, "step": 159 }, { "epoch": 0.39603960396039606, "grad_norm": 0.17123788595199585, "learning_rate": 6.064356435643565e-06, "loss": 1.1478, "step": 160 }, { "epoch": 0.39851485148514854, "grad_norm": 0.16677404940128326, "learning_rate": 6.03960396039604e-06, "loss": 1.1128, "step": 161 }, { "epoch": 0.400990099009901, "grad_norm": 0.21054880321025848, "learning_rate": 6.014851485148515e-06, "loss": 1.1037, "step": 162 }, { "epoch": 0.4034653465346535, "grad_norm": 0.15982946753501892, "learning_rate": 5.99009900990099e-06, "loss": 1.0831, "step": 163 }, { "epoch": 0.40594059405940597, "grad_norm": 0.14396493136882782, "learning_rate": 5.9653465346534655e-06, "loss": 1.0845, "step": 164 }, { "epoch": 0.4084158415841584, "grad_norm": 0.16700232028961182, "learning_rate": 5.940594059405941e-06, "loss": 1.0954, "step": 165 }, { "epoch": 0.41089108910891087, "grad_norm": 0.15970200300216675, "learning_rate": 5.915841584158416e-06, "loss": 1.0977, "step": 166 }, { "epoch": 0.41336633663366334, "grad_norm": 0.15419846773147583, "learning_rate": 5.891089108910891e-06, "loss": 1.1063, "step": 167 }, { "epoch": 0.4158415841584158, "grad_norm": 0.16501763463020325, "learning_rate": 5.866336633663367e-06, "loss": 1.1508, "step": 168 }, { "epoch": 0.4183168316831683, "grad_norm": 0.15167059004306793, "learning_rate": 5.841584158415842e-06, "loss": 1.092, "step": 169 }, { "epoch": 0.4207920792079208, "grad_norm": 0.15721780061721802, "learning_rate": 5.816831683168317e-06, "loss": 1.0915, "step": 170 }, { "epoch": 0.42326732673267325, "grad_norm": 0.15296679735183716, "learning_rate": 5.792079207920792e-06, "loss": 1.0453, "step": 171 }, { "epoch": 0.42574257425742573, "grad_norm": 0.16495901346206665, "learning_rate": 5.767326732673267e-06, "loss": 1.1493, "step": 172 }, { "epoch": 0.4282178217821782, "grad_norm": 0.1588982343673706, "learning_rate": 5.7425742574257425e-06, "loss": 1.0593, "step": 173 }, { "epoch": 0.4306930693069307, "grad_norm": 0.15599201619625092, "learning_rate": 5.717821782178218e-06, "loss": 1.0433, "step": 174 }, { "epoch": 0.43316831683168316, "grad_norm": 0.1536027193069458, "learning_rate": 5.693069306930693e-06, "loss": 1.1101, "step": 175 }, { "epoch": 0.43564356435643564, "grad_norm": 0.1521487981081009, "learning_rate": 5.668316831683169e-06, "loss": 1.0905, "step": 176 }, { "epoch": 0.4381188118811881, "grad_norm": 0.21931779384613037, "learning_rate": 5.643564356435644e-06, "loss": 1.1008, "step": 177 }, { "epoch": 0.4405940594059406, "grad_norm": 0.1681768000125885, "learning_rate": 5.61881188118812e-06, "loss": 1.0724, "step": 178 }, { "epoch": 0.4430693069306931, "grad_norm": 0.16810664534568787, "learning_rate": 5.594059405940595e-06, "loss": 1.1137, "step": 179 }, { "epoch": 0.44554455445544555, "grad_norm": 0.15952704846858978, "learning_rate": 5.56930693069307e-06, "loss": 1.0913, "step": 180 }, { "epoch": 0.44801980198019803, "grad_norm": 0.25783032178878784, "learning_rate": 5.544554455445545e-06, "loss": 1.1057, "step": 181 }, { "epoch": 0.4504950495049505, "grad_norm": 0.1551341712474823, "learning_rate": 5.519801980198021e-06, "loss": 1.0744, "step": 182 }, { "epoch": 0.452970297029703, "grad_norm": 0.16667252779006958, "learning_rate": 5.495049504950496e-06, "loss": 1.066, "step": 183 }, { "epoch": 0.45544554455445546, "grad_norm": 0.16125273704528809, "learning_rate": 5.470297029702971e-06, "loss": 1.1511, "step": 184 }, { "epoch": 0.45792079207920794, "grad_norm": 0.16407977044582367, "learning_rate": 5.4455445544554465e-06, "loss": 1.0719, "step": 185 }, { "epoch": 0.4603960396039604, "grad_norm": 0.17295385897159576, "learning_rate": 5.420792079207922e-06, "loss": 1.1174, "step": 186 }, { "epoch": 0.4628712871287129, "grad_norm": 0.18729552626609802, "learning_rate": 5.396039603960397e-06, "loss": 1.0352, "step": 187 }, { "epoch": 0.46534653465346537, "grad_norm": 0.1539359837770462, "learning_rate": 5.371287128712872e-06, "loss": 1.0807, "step": 188 }, { "epoch": 0.46782178217821785, "grad_norm": 0.16208483278751373, "learning_rate": 5.346534653465347e-06, "loss": 1.08, "step": 189 }, { "epoch": 0.47029702970297027, "grad_norm": 0.1604655236005783, "learning_rate": 5.321782178217822e-06, "loss": 1.0715, "step": 190 }, { "epoch": 0.47277227722772275, "grad_norm": 0.20206867158412933, "learning_rate": 5.297029702970298e-06, "loss": 1.0588, "step": 191 }, { "epoch": 0.4752475247524752, "grad_norm": 0.15372274816036224, "learning_rate": 5.272277227722773e-06, "loss": 1.0771, "step": 192 }, { "epoch": 0.4777227722772277, "grad_norm": 0.15498343110084534, "learning_rate": 5.247524752475248e-06, "loss": 1.06, "step": 193 }, { "epoch": 0.4801980198019802, "grad_norm": 0.15957452356815338, "learning_rate": 5.2227722772277234e-06, "loss": 1.0731, "step": 194 }, { "epoch": 0.48267326732673266, "grad_norm": 0.1622665375471115, "learning_rate": 5.1980198019801986e-06, "loss": 1.0962, "step": 195 }, { "epoch": 0.48514851485148514, "grad_norm": 0.1674077957868576, "learning_rate": 5.173267326732674e-06, "loss": 1.105, "step": 196 }, { "epoch": 0.4876237623762376, "grad_norm": 0.169521763920784, "learning_rate": 5.148514851485149e-06, "loss": 1.0811, "step": 197 }, { "epoch": 0.4900990099009901, "grad_norm": 0.1688235104084015, "learning_rate": 5.123762376237624e-06, "loss": 1.0708, "step": 198 }, { "epoch": 0.49257425742574257, "grad_norm": 0.15499871969223022, "learning_rate": 5.0990099009901e-06, "loss": 1.0947, "step": 199 }, { "epoch": 0.49504950495049505, "grad_norm": 0.15326109528541565, "learning_rate": 5.074257425742575e-06, "loss": 1.0676, "step": 200 }, { "epoch": 0.4975247524752475, "grad_norm": 0.16365967690944672, "learning_rate": 5.04950495049505e-06, "loss": 1.0792, "step": 201 }, { "epoch": 0.5, "grad_norm": 0.1605796068906784, "learning_rate": 5.024752475247525e-06, "loss": 1.0939, "step": 202 }, { "epoch": 0.5024752475247525, "grad_norm": 0.16988039016723633, "learning_rate": 5e-06, "loss": 1.0996, "step": 203 }, { "epoch": 0.504950495049505, "grad_norm": 0.16347338259220123, "learning_rate": 4.9752475247524755e-06, "loss": 1.0603, "step": 204 }, { "epoch": 0.5074257425742574, "grad_norm": 0.16206030547618866, "learning_rate": 4.950495049504951e-06, "loss": 1.0675, "step": 205 }, { "epoch": 0.5099009900990099, "grad_norm": 0.18767081201076508, "learning_rate": 4.925742574257426e-06, "loss": 1.0597, "step": 206 }, { "epoch": 0.5123762376237624, "grad_norm": 0.1522037833929062, "learning_rate": 4.900990099009901e-06, "loss": 1.0673, "step": 207 }, { "epoch": 0.5148514851485149, "grad_norm": 0.15843883156776428, "learning_rate": 4.876237623762377e-06, "loss": 1.0544, "step": 208 }, { "epoch": 0.5173267326732673, "grad_norm": 0.1723291128873825, "learning_rate": 4.851485148514852e-06, "loss": 1.0837, "step": 209 }, { "epoch": 0.5198019801980198, "grad_norm": 0.16350848972797394, "learning_rate": 4.826732673267327e-06, "loss": 1.0666, "step": 210 }, { "epoch": 0.5222772277227723, "grad_norm": 0.15768328309059143, "learning_rate": 4.801980198019802e-06, "loss": 1.0723, "step": 211 }, { "epoch": 0.5247524752475248, "grad_norm": 0.17364096641540527, "learning_rate": 4.777227722772277e-06, "loss": 1.0667, "step": 212 }, { "epoch": 0.5272277227722773, "grad_norm": 0.16839410364627838, "learning_rate": 4.7524752475247525e-06, "loss": 1.0728, "step": 213 }, { "epoch": 0.5297029702970297, "grad_norm": 0.16369131207466125, "learning_rate": 4.727722772277228e-06, "loss": 1.0732, "step": 214 }, { "epoch": 0.5321782178217822, "grad_norm": 0.1695587933063507, "learning_rate": 4.702970297029703e-06, "loss": 1.0637, "step": 215 }, { "epoch": 0.5346534653465347, "grad_norm": 0.16182969510555267, "learning_rate": 4.678217821782179e-06, "loss": 1.0429, "step": 216 }, { "epoch": 0.5371287128712872, "grad_norm": 0.15863147377967834, "learning_rate": 4.653465346534654e-06, "loss": 1.0736, "step": 217 }, { "epoch": 0.5396039603960396, "grad_norm": 0.15679685771465302, "learning_rate": 4.628712871287129e-06, "loss": 1.0781, "step": 218 }, { "epoch": 0.5420792079207921, "grad_norm": 0.16381986439228058, "learning_rate": 4.603960396039605e-06, "loss": 1.063, "step": 219 }, { "epoch": 0.5445544554455446, "grad_norm": 0.1749560832977295, "learning_rate": 4.57920792079208e-06, "loss": 1.0588, "step": 220 }, { "epoch": 0.5470297029702971, "grad_norm": 0.19300749897956848, "learning_rate": 4.554455445544555e-06, "loss": 1.1099, "step": 221 }, { "epoch": 0.5495049504950495, "grad_norm": 0.16599427163600922, "learning_rate": 4.52970297029703e-06, "loss": 1.0517, "step": 222 }, { "epoch": 0.551980198019802, "grad_norm": 0.1542525589466095, "learning_rate": 4.5049504950495054e-06, "loss": 1.0474, "step": 223 }, { "epoch": 0.5544554455445545, "grad_norm": 0.15763726830482483, "learning_rate": 4.4801980198019806e-06, "loss": 0.9993, "step": 224 }, { "epoch": 0.556930693069307, "grad_norm": 0.1836528331041336, "learning_rate": 4.455445544554456e-06, "loss": 1.1125, "step": 225 }, { "epoch": 0.5594059405940595, "grad_norm": 0.16592352092266083, "learning_rate": 4.430693069306931e-06, "loss": 1.0154, "step": 226 }, { "epoch": 0.5618811881188119, "grad_norm": 0.16697534918785095, "learning_rate": 4.405940594059406e-06, "loss": 1.0328, "step": 227 }, { "epoch": 0.5643564356435643, "grad_norm": 0.16123178601264954, "learning_rate": 4.381188118811882e-06, "loss": 1.0859, "step": 228 }, { "epoch": 0.5668316831683168, "grad_norm": 0.17404749989509583, "learning_rate": 4.356435643564357e-06, "loss": 1.0121, "step": 229 }, { "epoch": 0.5693069306930693, "grad_norm": 0.17720270156860352, "learning_rate": 4.331683168316832e-06, "loss": 1.0757, "step": 230 }, { "epoch": 0.5717821782178217, "grad_norm": 0.1586427241563797, "learning_rate": 4.306930693069307e-06, "loss": 1.0551, "step": 231 }, { "epoch": 0.5742574257425742, "grad_norm": 0.1774415373802185, "learning_rate": 4.282178217821782e-06, "loss": 1.0333, "step": 232 }, { "epoch": 0.5767326732673267, "grad_norm": 0.16705653071403503, "learning_rate": 4.2574257425742575e-06, "loss": 1.0601, "step": 233 }, { "epoch": 0.5792079207920792, "grad_norm": 0.18242689967155457, "learning_rate": 4.232673267326733e-06, "loss": 1.0476, "step": 234 }, { "epoch": 0.5816831683168316, "grad_norm": 0.17149695754051208, "learning_rate": 4.207920792079208e-06, "loss": 1.0589, "step": 235 }, { "epoch": 0.5841584158415841, "grad_norm": 0.1735452264547348, "learning_rate": 4.183168316831684e-06, "loss": 1.0479, "step": 236 }, { "epoch": 0.5866336633663366, "grad_norm": 0.1712096780538559, "learning_rate": 4.158415841584159e-06, "loss": 1.0727, "step": 237 }, { "epoch": 0.5891089108910891, "grad_norm": 0.1779618263244629, "learning_rate": 4.133663366336634e-06, "loss": 1.0847, "step": 238 }, { "epoch": 0.5915841584158416, "grad_norm": 0.165634423494339, "learning_rate": 4.108910891089109e-06, "loss": 1.0318, "step": 239 }, { "epoch": 0.594059405940594, "grad_norm": 0.17693260312080383, "learning_rate": 4.084158415841584e-06, "loss": 1.0602, "step": 240 }, { "epoch": 0.5965346534653465, "grad_norm": 0.16443394124507904, "learning_rate": 4.05940594059406e-06, "loss": 1.0693, "step": 241 }, { "epoch": 0.599009900990099, "grad_norm": 0.1829027682542801, "learning_rate": 4.034653465346535e-06, "loss": 1.0597, "step": 242 }, { "epoch": 0.6014851485148515, "grad_norm": 0.17347212135791779, "learning_rate": 4.0099009900990104e-06, "loss": 1.0901, "step": 243 }, { "epoch": 0.6039603960396039, "grad_norm": 0.1738901287317276, "learning_rate": 3.9851485148514856e-06, "loss": 1.0378, "step": 244 }, { "epoch": 0.6064356435643564, "grad_norm": 0.17123974859714508, "learning_rate": 3.960396039603961e-06, "loss": 1.0449, "step": 245 }, { "epoch": 0.6089108910891089, "grad_norm": 0.16725413501262665, "learning_rate": 3.935643564356436e-06, "loss": 1.07, "step": 246 }, { "epoch": 0.6113861386138614, "grad_norm": 0.16899654269218445, "learning_rate": 3.910891089108911e-06, "loss": 1.0491, "step": 247 }, { "epoch": 0.6138613861386139, "grad_norm": 0.16935132443904877, "learning_rate": 3.886138613861387e-06, "loss": 1.0099, "step": 248 }, { "epoch": 0.6163366336633663, "grad_norm": 0.1622443050146103, "learning_rate": 3.861386138613862e-06, "loss": 1.0529, "step": 249 }, { "epoch": 0.6188118811881188, "grad_norm": 0.16781330108642578, "learning_rate": 3.836633663366337e-06, "loss": 1.0629, "step": 250 }, { "epoch": 0.6212871287128713, "grad_norm": 0.17032338678836823, "learning_rate": 3.8118811881188123e-06, "loss": 1.0624, "step": 251 }, { "epoch": 0.6237623762376238, "grad_norm": 0.17583905160427094, "learning_rate": 3.7871287128712874e-06, "loss": 1.0191, "step": 252 }, { "epoch": 0.6262376237623762, "grad_norm": 0.1608443707227707, "learning_rate": 3.7623762376237625e-06, "loss": 1.0459, "step": 253 }, { "epoch": 0.6287128712871287, "grad_norm": 0.16449084877967834, "learning_rate": 3.737623762376238e-06, "loss": 1.0341, "step": 254 }, { "epoch": 0.6311881188118812, "grad_norm": 0.17335855960845947, "learning_rate": 3.712871287128713e-06, "loss": 1.0492, "step": 255 }, { "epoch": 0.6336633663366337, "grad_norm": 0.17544226348400116, "learning_rate": 3.6881188118811883e-06, "loss": 1.0682, "step": 256 }, { "epoch": 0.6361386138613861, "grad_norm": 0.16722352802753448, "learning_rate": 3.6633663366336635e-06, "loss": 1.0557, "step": 257 }, { "epoch": 0.6386138613861386, "grad_norm": 0.19356003403663635, "learning_rate": 3.638613861386139e-06, "loss": 1.06, "step": 258 }, { "epoch": 0.6410891089108911, "grad_norm": 0.18645772337913513, "learning_rate": 3.613861386138614e-06, "loss": 1.0467, "step": 259 }, { "epoch": 0.6435643564356436, "grad_norm": 0.18059493601322174, "learning_rate": 3.5891089108910892e-06, "loss": 1.0736, "step": 260 }, { "epoch": 0.6460396039603961, "grad_norm": 0.19914749264717102, "learning_rate": 3.5643564356435644e-06, "loss": 1.0792, "step": 261 }, { "epoch": 0.6485148514851485, "grad_norm": 0.16404379904270172, "learning_rate": 3.5396039603960395e-06, "loss": 1.0253, "step": 262 }, { "epoch": 0.650990099009901, "grad_norm": 0.16826879978179932, "learning_rate": 3.514851485148515e-06, "loss": 1.0458, "step": 263 }, { "epoch": 0.6534653465346535, "grad_norm": 0.17538203299045563, "learning_rate": 3.49009900990099e-06, "loss": 1.0224, "step": 264 }, { "epoch": 0.655940594059406, "grad_norm": 0.18196958303451538, "learning_rate": 3.4653465346534653e-06, "loss": 1.0475, "step": 265 }, { "epoch": 0.6584158415841584, "grad_norm": 0.1760399043560028, "learning_rate": 3.4405940594059404e-06, "loss": 1.045, "step": 266 }, { "epoch": 0.6608910891089109, "grad_norm": 0.17527367174625397, "learning_rate": 3.4158415841584164e-06, "loss": 1.0634, "step": 267 }, { "epoch": 0.6633663366336634, "grad_norm": 0.18751391768455505, "learning_rate": 3.3910891089108915e-06, "loss": 1.0073, "step": 268 }, { "epoch": 0.6658415841584159, "grad_norm": 0.16496875882148743, "learning_rate": 3.3663366336633666e-06, "loss": 1.0435, "step": 269 }, { "epoch": 0.6683168316831684, "grad_norm": 0.16843590140342712, "learning_rate": 3.341584158415842e-06, "loss": 1.035, "step": 270 }, { "epoch": 0.6707920792079208, "grad_norm": 0.19264374673366547, "learning_rate": 3.3168316831683173e-06, "loss": 1.0538, "step": 271 }, { "epoch": 0.6732673267326733, "grad_norm": 0.19670583307743073, "learning_rate": 3.2920792079207924e-06, "loss": 1.041, "step": 272 }, { "epoch": 0.6757425742574258, "grad_norm": 0.1634487509727478, "learning_rate": 3.2673267326732676e-06, "loss": 1.0131, "step": 273 }, { "epoch": 0.6782178217821783, "grad_norm": 0.1698472797870636, "learning_rate": 3.242574257425743e-06, "loss": 1.0293, "step": 274 }, { "epoch": 0.6806930693069307, "grad_norm": 0.16460570693016052, "learning_rate": 3.2178217821782182e-06, "loss": 1.0078, "step": 275 }, { "epoch": 0.6831683168316832, "grad_norm": 0.17161054909229279, "learning_rate": 3.1930693069306933e-06, "loss": 1.0465, "step": 276 }, { "epoch": 0.6856435643564357, "grad_norm": 0.1728292554616928, "learning_rate": 3.1683168316831685e-06, "loss": 1.0593, "step": 277 }, { "epoch": 0.6881188118811881, "grad_norm": 0.16688892245292664, "learning_rate": 3.143564356435644e-06, "loss": 1.0343, "step": 278 }, { "epoch": 0.6905940594059405, "grad_norm": 0.17009975016117096, "learning_rate": 3.118811881188119e-06, "loss": 1.0686, "step": 279 }, { "epoch": 0.693069306930693, "grad_norm": 0.16594161093235016, "learning_rate": 3.0940594059405943e-06, "loss": 1.0239, "step": 280 }, { "epoch": 0.6955445544554455, "grad_norm": 0.17890632152557373, "learning_rate": 3.0693069306930694e-06, "loss": 1.079, "step": 281 }, { "epoch": 0.698019801980198, "grad_norm": 0.16677552461624146, "learning_rate": 3.044554455445545e-06, "loss": 0.999, "step": 282 }, { "epoch": 0.7004950495049505, "grad_norm": 0.20281478762626648, "learning_rate": 3.01980198019802e-06, "loss": 1.0477, "step": 283 }, { "epoch": 0.7029702970297029, "grad_norm": 0.1814507395029068, "learning_rate": 2.995049504950495e-06, "loss": 1.0537, "step": 284 }, { "epoch": 0.7054455445544554, "grad_norm": 0.2223450392484665, "learning_rate": 2.9702970297029703e-06, "loss": 1.0815, "step": 285 }, { "epoch": 0.7079207920792079, "grad_norm": 0.17586183547973633, "learning_rate": 2.9455445544554454e-06, "loss": 1.0277, "step": 286 }, { "epoch": 0.7103960396039604, "grad_norm": 0.17505447566509247, "learning_rate": 2.920792079207921e-06, "loss": 1.0055, "step": 287 }, { "epoch": 0.7128712871287128, "grad_norm": 0.17361949384212494, "learning_rate": 2.896039603960396e-06, "loss": 1.0465, "step": 288 }, { "epoch": 0.7153465346534653, "grad_norm": 0.1845312863588333, "learning_rate": 2.8712871287128712e-06, "loss": 1.0676, "step": 289 }, { "epoch": 0.7178217821782178, "grad_norm": 0.1974944770336151, "learning_rate": 2.8465346534653464e-06, "loss": 1.0766, "step": 290 }, { "epoch": 0.7202970297029703, "grad_norm": 0.17243847250938416, "learning_rate": 2.821782178217822e-06, "loss": 1.0387, "step": 291 }, { "epoch": 0.7227722772277227, "grad_norm": 0.18797056376934052, "learning_rate": 2.7970297029702974e-06, "loss": 1.048, "step": 292 }, { "epoch": 0.7252475247524752, "grad_norm": 0.1894993633031845, "learning_rate": 2.7722772277227726e-06, "loss": 1.0352, "step": 293 }, { "epoch": 0.7277227722772277, "grad_norm": 0.16774602234363556, "learning_rate": 2.747524752475248e-06, "loss": 1.052, "step": 294 }, { "epoch": 0.7301980198019802, "grad_norm": 0.17973656952381134, "learning_rate": 2.7227722772277232e-06, "loss": 1.0066, "step": 295 }, { "epoch": 0.7326732673267327, "grad_norm": 0.17266245186328888, "learning_rate": 2.6980198019801984e-06, "loss": 1.0327, "step": 296 }, { "epoch": 0.7351485148514851, "grad_norm": 0.19428043067455292, "learning_rate": 2.6732673267326735e-06, "loss": 1.0479, "step": 297 }, { "epoch": 0.7376237623762376, "grad_norm": 0.16848322749137878, "learning_rate": 2.648514851485149e-06, "loss": 1.0665, "step": 298 }, { "epoch": 0.7400990099009901, "grad_norm": 0.1717626452445984, "learning_rate": 2.623762376237624e-06, "loss": 1.042, "step": 299 }, { "epoch": 0.7425742574257426, "grad_norm": 0.16830983757972717, "learning_rate": 2.5990099009900993e-06, "loss": 1.0313, "step": 300 }, { "epoch": 0.745049504950495, "grad_norm": 0.1700887829065323, "learning_rate": 2.5742574257425744e-06, "loss": 1.0512, "step": 301 }, { "epoch": 0.7475247524752475, "grad_norm": 0.17589855194091797, "learning_rate": 2.54950495049505e-06, "loss": 1.0411, "step": 302 }, { "epoch": 0.75, "grad_norm": 0.20275838673114777, "learning_rate": 2.524752475247525e-06, "loss": 1.0412, "step": 303 }, { "epoch": 0.7524752475247525, "grad_norm": 0.17624107003211975, "learning_rate": 2.5e-06, "loss": 1.0484, "step": 304 }, { "epoch": 0.754950495049505, "grad_norm": 0.16618701815605164, "learning_rate": 2.4752475247524753e-06, "loss": 1.0444, "step": 305 }, { "epoch": 0.7574257425742574, "grad_norm": 0.17324908077716827, "learning_rate": 2.4504950495049505e-06, "loss": 1.0126, "step": 306 }, { "epoch": 0.7599009900990099, "grad_norm": 0.1791515350341797, "learning_rate": 2.425742574257426e-06, "loss": 1.0513, "step": 307 }, { "epoch": 0.7623762376237624, "grad_norm": 0.16968585550785065, "learning_rate": 2.400990099009901e-06, "loss": 1.0458, "step": 308 }, { "epoch": 0.7648514851485149, "grad_norm": 0.19068896770477295, "learning_rate": 2.3762376237623762e-06, "loss": 1.0307, "step": 309 }, { "epoch": 0.7673267326732673, "grad_norm": 0.1762915998697281, "learning_rate": 2.3514851485148514e-06, "loss": 1.0266, "step": 310 }, { "epoch": 0.7698019801980198, "grad_norm": 0.16937871277332306, "learning_rate": 2.326732673267327e-06, "loss": 1.0397, "step": 311 }, { "epoch": 0.7722772277227723, "grad_norm": 0.17128786444664001, "learning_rate": 2.3019801980198025e-06, "loss": 1.0479, "step": 312 }, { "epoch": 0.7747524752475248, "grad_norm": 0.17716889083385468, "learning_rate": 2.2772277227722776e-06, "loss": 0.9953, "step": 313 }, { "epoch": 0.7772277227722773, "grad_norm": 0.18644240498542786, "learning_rate": 2.2524752475247527e-06, "loss": 1.0266, "step": 314 }, { "epoch": 0.7797029702970297, "grad_norm": 0.18382704257965088, "learning_rate": 2.227722772277228e-06, "loss": 1.0472, "step": 315 }, { "epoch": 0.7821782178217822, "grad_norm": 0.17966952919960022, "learning_rate": 2.202970297029703e-06, "loss": 1.03, "step": 316 }, { "epoch": 0.7846534653465347, "grad_norm": 0.17476266622543335, "learning_rate": 2.1782178217821785e-06, "loss": 1.0197, "step": 317 }, { "epoch": 0.7871287128712872, "grad_norm": 0.16993242502212524, "learning_rate": 2.1534653465346536e-06, "loss": 1.0365, "step": 318 }, { "epoch": 0.7896039603960396, "grad_norm": 0.17047357559204102, "learning_rate": 2.1287128712871288e-06, "loss": 1.0412, "step": 319 }, { "epoch": 0.7920792079207921, "grad_norm": 0.16769839823246002, "learning_rate": 2.103960396039604e-06, "loss": 1.0082, "step": 320 }, { "epoch": 0.7945544554455446, "grad_norm": 0.17701376974582672, "learning_rate": 2.0792079207920794e-06, "loss": 1.0574, "step": 321 }, { "epoch": 0.7970297029702971, "grad_norm": 0.1958608329296112, "learning_rate": 2.0544554455445546e-06, "loss": 1.0122, "step": 322 }, { "epoch": 0.7995049504950495, "grad_norm": 0.19034409523010254, "learning_rate": 2.02970297029703e-06, "loss": 0.9904, "step": 323 }, { "epoch": 0.801980198019802, "grad_norm": 0.17253142595291138, "learning_rate": 2.0049504950495052e-06, "loss": 1.0229, "step": 324 }, { "epoch": 0.8044554455445545, "grad_norm": 0.1845494955778122, "learning_rate": 1.9801980198019803e-06, "loss": 1.0537, "step": 325 }, { "epoch": 0.806930693069307, "grad_norm": 0.17464852333068848, "learning_rate": 1.9554455445544555e-06, "loss": 1.0293, "step": 326 }, { "epoch": 0.8094059405940595, "grad_norm": 0.18944846093654633, "learning_rate": 1.930693069306931e-06, "loss": 1.0342, "step": 327 }, { "epoch": 0.8118811881188119, "grad_norm": 0.20734351873397827, "learning_rate": 1.9059405940594061e-06, "loss": 1.0481, "step": 328 }, { "epoch": 0.8143564356435643, "grad_norm": 0.18267051875591278, "learning_rate": 1.8811881188118813e-06, "loss": 1.017, "step": 329 }, { "epoch": 0.8168316831683168, "grad_norm": 0.16905878484249115, "learning_rate": 1.8564356435643566e-06, "loss": 0.9996, "step": 330 }, { "epoch": 0.8193069306930693, "grad_norm": 0.18889805674552917, "learning_rate": 1.8316831683168317e-06, "loss": 1.0487, "step": 331 }, { "epoch": 0.8217821782178217, "grad_norm": 0.1784394234418869, "learning_rate": 1.806930693069307e-06, "loss": 1.0301, "step": 332 }, { "epoch": 0.8242574257425742, "grad_norm": 0.1732165515422821, "learning_rate": 1.7821782178217822e-06, "loss": 0.9989, "step": 333 }, { "epoch": 0.8267326732673267, "grad_norm": 0.19555354118347168, "learning_rate": 1.7574257425742575e-06, "loss": 1.0195, "step": 334 }, { "epoch": 0.8292079207920792, "grad_norm": 0.17240861058235168, "learning_rate": 1.7326732673267326e-06, "loss": 1.0076, "step": 335 }, { "epoch": 0.8316831683168316, "grad_norm": 0.1675492227077484, "learning_rate": 1.7079207920792082e-06, "loss": 1.0393, "step": 336 }, { "epoch": 0.8341584158415841, "grad_norm": 0.19649028778076172, "learning_rate": 1.6831683168316833e-06, "loss": 1.0467, "step": 337 }, { "epoch": 0.8366336633663366, "grad_norm": 0.1702542006969452, "learning_rate": 1.6584158415841587e-06, "loss": 1.0208, "step": 338 }, { "epoch": 0.8391089108910891, "grad_norm": 0.16906598210334778, "learning_rate": 1.6336633663366338e-06, "loss": 1.0268, "step": 339 }, { "epoch": 0.8415841584158416, "grad_norm": 0.18987883627414703, "learning_rate": 1.6089108910891091e-06, "loss": 1.0566, "step": 340 }, { "epoch": 0.844059405940594, "grad_norm": 0.1939912885427475, "learning_rate": 1.5841584158415842e-06, "loss": 1.0871, "step": 341 }, { "epoch": 0.8465346534653465, "grad_norm": 0.18526384234428406, "learning_rate": 1.5594059405940596e-06, "loss": 1.0179, "step": 342 }, { "epoch": 0.849009900990099, "grad_norm": 0.17309053242206573, "learning_rate": 1.5346534653465347e-06, "loss": 1.0509, "step": 343 }, { "epoch": 0.8514851485148515, "grad_norm": 0.16723985970020294, "learning_rate": 1.50990099009901e-06, "loss": 1.0034, "step": 344 }, { "epoch": 0.8539603960396039, "grad_norm": 0.17107772827148438, "learning_rate": 1.4851485148514852e-06, "loss": 0.9677, "step": 345 }, { "epoch": 0.8564356435643564, "grad_norm": 0.17373721301555634, "learning_rate": 1.4603960396039605e-06, "loss": 1.0245, "step": 346 }, { "epoch": 0.8589108910891089, "grad_norm": 0.1878427267074585, "learning_rate": 1.4356435643564356e-06, "loss": 1.023, "step": 347 }, { "epoch": 0.8613861386138614, "grad_norm": 0.17384839057922363, "learning_rate": 1.410891089108911e-06, "loss": 1.0414, "step": 348 }, { "epoch": 0.8638613861386139, "grad_norm": 0.17436939477920532, "learning_rate": 1.3861386138613863e-06, "loss": 1.0371, "step": 349 }, { "epoch": 0.8663366336633663, "grad_norm": 0.18823057413101196, "learning_rate": 1.3613861386138616e-06, "loss": 1.0286, "step": 350 }, { "epoch": 0.8688118811881188, "grad_norm": 0.17299430072307587, "learning_rate": 1.3366336633663367e-06, "loss": 1.0117, "step": 351 }, { "epoch": 0.8712871287128713, "grad_norm": 0.2107045203447342, "learning_rate": 1.311881188118812e-06, "loss": 1.0331, "step": 352 }, { "epoch": 0.8737623762376238, "grad_norm": 0.17237113416194916, "learning_rate": 1.2871287128712872e-06, "loss": 1.0022, "step": 353 }, { "epoch": 0.8762376237623762, "grad_norm": 0.17842179536819458, "learning_rate": 1.2623762376237625e-06, "loss": 1.0496, "step": 354 }, { "epoch": 0.8787128712871287, "grad_norm": 0.17555266618728638, "learning_rate": 1.2376237623762377e-06, "loss": 1.0082, "step": 355 }, { "epoch": 0.8811881188118812, "grad_norm": 0.22252587974071503, "learning_rate": 1.212871287128713e-06, "loss": 1.0386, "step": 356 }, { "epoch": 0.8836633663366337, "grad_norm": 0.1819303333759308, "learning_rate": 1.1881188118811881e-06, "loss": 1.0468, "step": 357 }, { "epoch": 0.8861386138613861, "grad_norm": 0.18028509616851807, "learning_rate": 1.1633663366336635e-06, "loss": 0.972, "step": 358 }, { "epoch": 0.8886138613861386, "grad_norm": 0.1826130598783493, "learning_rate": 1.1386138613861388e-06, "loss": 1.0503, "step": 359 }, { "epoch": 0.8910891089108911, "grad_norm": 0.17441998422145844, "learning_rate": 1.113861386138614e-06, "loss": 1.0267, "step": 360 }, { "epoch": 0.8935643564356436, "grad_norm": 0.20245307683944702, "learning_rate": 1.0891089108910893e-06, "loss": 1.0261, "step": 361 }, { "epoch": 0.8960396039603961, "grad_norm": 0.170221745967865, "learning_rate": 1.0643564356435644e-06, "loss": 1.0217, "step": 362 }, { "epoch": 0.8985148514851485, "grad_norm": 0.1835111379623413, "learning_rate": 1.0396039603960397e-06, "loss": 1.0677, "step": 363 }, { "epoch": 0.900990099009901, "grad_norm": 0.17305181920528412, "learning_rate": 1.014851485148515e-06, "loss": 1.0372, "step": 364 }, { "epoch": 0.9034653465346535, "grad_norm": 0.17221209406852722, "learning_rate": 9.900990099009902e-07, "loss": 1.0351, "step": 365 }, { "epoch": 0.905940594059406, "grad_norm": 0.2017694115638733, "learning_rate": 9.653465346534655e-07, "loss": 1.0641, "step": 366 }, { "epoch": 0.9084158415841584, "grad_norm": 0.16452528536319733, "learning_rate": 9.405940594059406e-07, "loss": 1.0297, "step": 367 }, { "epoch": 0.9108910891089109, "grad_norm": 0.1696237474679947, "learning_rate": 9.158415841584159e-07, "loss": 1.0272, "step": 368 }, { "epoch": 0.9133663366336634, "grad_norm": 0.17816710472106934, "learning_rate": 8.910891089108911e-07, "loss": 0.9946, "step": 369 }, { "epoch": 0.9158415841584159, "grad_norm": 0.22065281867980957, "learning_rate": 8.663366336633663e-07, "loss": 1.0485, "step": 370 }, { "epoch": 0.9183168316831684, "grad_norm": 0.1876746118068695, "learning_rate": 8.415841584158417e-07, "loss": 1.0179, "step": 371 }, { "epoch": 0.9207920792079208, "grad_norm": 0.28578758239746094, "learning_rate": 8.168316831683169e-07, "loss": 1.0024, "step": 372 }, { "epoch": 0.9232673267326733, "grad_norm": 0.2457706332206726, "learning_rate": 7.920792079207921e-07, "loss": 0.9973, "step": 373 }, { "epoch": 0.9257425742574258, "grad_norm": 0.17008252441883087, "learning_rate": 7.673267326732673e-07, "loss": 0.9994, "step": 374 }, { "epoch": 0.9282178217821783, "grad_norm": 0.1781049519777298, "learning_rate": 7.425742574257426e-07, "loss": 1.061, "step": 375 }, { "epoch": 0.9306930693069307, "grad_norm": 0.18529166281223297, "learning_rate": 7.178217821782178e-07, "loss": 1.0428, "step": 376 }, { "epoch": 0.9331683168316832, "grad_norm": 0.19657132029533386, "learning_rate": 6.930693069306931e-07, "loss": 1.0448, "step": 377 }, { "epoch": 0.9356435643564357, "grad_norm": 0.17185792326927185, "learning_rate": 6.683168316831684e-07, "loss": 1.0306, "step": 378 }, { "epoch": 0.9381188118811881, "grad_norm": 0.18214091658592224, "learning_rate": 6.435643564356436e-07, "loss": 1.0433, "step": 379 }, { "epoch": 0.9405940594059405, "grad_norm": 0.23452341556549072, "learning_rate": 6.188118811881188e-07, "loss": 0.9751, "step": 380 }, { "epoch": 0.943069306930693, "grad_norm": 0.16824421286582947, "learning_rate": 5.940594059405941e-07, "loss": 1.032, "step": 381 }, { "epoch": 0.9455445544554455, "grad_norm": 0.2157716602087021, "learning_rate": 5.693069306930694e-07, "loss": 1.0361, "step": 382 }, { "epoch": 0.948019801980198, "grad_norm": 0.1715572625398636, "learning_rate": 5.445544554455446e-07, "loss": 1.0135, "step": 383 }, { "epoch": 0.9504950495049505, "grad_norm": 0.18207526206970215, "learning_rate": 5.198019801980199e-07, "loss": 1.008, "step": 384 }, { "epoch": 0.9529702970297029, "grad_norm": 0.17518338561058044, "learning_rate": 4.950495049504951e-07, "loss": 1.0558, "step": 385 }, { "epoch": 0.9554455445544554, "grad_norm": 0.16482017934322357, "learning_rate": 4.702970297029703e-07, "loss": 1.0176, "step": 386 }, { "epoch": 0.9579207920792079, "grad_norm": 0.1714717000722885, "learning_rate": 4.4554455445544555e-07, "loss": 1.0052, "step": 387 }, { "epoch": 0.9603960396039604, "grad_norm": 0.2134282886981964, "learning_rate": 4.2079207920792083e-07, "loss": 0.9946, "step": 388 }, { "epoch": 0.9628712871287128, "grad_norm": 0.1706077754497528, "learning_rate": 3.9603960396039606e-07, "loss": 1.0198, "step": 389 }, { "epoch": 0.9653465346534653, "grad_norm": 0.17025628685951233, "learning_rate": 3.712871287128713e-07, "loss": 1.0387, "step": 390 }, { "epoch": 0.9678217821782178, "grad_norm": 0.20762069523334503, "learning_rate": 3.4653465346534657e-07, "loss": 1.0192, "step": 391 }, { "epoch": 0.9702970297029703, "grad_norm": 0.1808956265449524, "learning_rate": 3.217821782178218e-07, "loss": 1.0094, "step": 392 }, { "epoch": 0.9727722772277227, "grad_norm": 0.18868137896060944, "learning_rate": 2.9702970297029703e-07, "loss": 1.0505, "step": 393 }, { "epoch": 0.9752475247524752, "grad_norm": 0.17066539824008942, "learning_rate": 2.722772277227723e-07, "loss": 1.0439, "step": 394 }, { "epoch": 0.9777227722772277, "grad_norm": 0.19031685590744019, "learning_rate": 2.4752475247524754e-07, "loss": 1.0565, "step": 395 }, { "epoch": 0.9801980198019802, "grad_norm": 0.175362229347229, "learning_rate": 2.2277227722772277e-07, "loss": 1.0129, "step": 396 }, { "epoch": 0.9826732673267327, "grad_norm": 0.17563587427139282, "learning_rate": 1.9801980198019803e-07, "loss": 1.0256, "step": 397 }, { "epoch": 0.9851485148514851, "grad_norm": 0.18792986869812012, "learning_rate": 1.7326732673267329e-07, "loss": 0.9857, "step": 398 }, { "epoch": 0.9876237623762376, "grad_norm": 0.16679790616035461, "learning_rate": 1.4851485148514852e-07, "loss": 1.0258, "step": 399 }, { "epoch": 0.9900990099009901, "grad_norm": 0.17717471718788147, "learning_rate": 1.2376237623762377e-07, "loss": 1.0558, "step": 400 }, { "epoch": 0.9925742574257426, "grad_norm": 0.17575222253799438, "learning_rate": 9.900990099009901e-08, "loss": 1.0095, "step": 401 }, { "epoch": 0.995049504950495, "grad_norm": 0.17630964517593384, "learning_rate": 7.425742574257426e-08, "loss": 1.0431, "step": 402 }, { "epoch": 0.9975247524752475, "grad_norm": 0.18719862401485443, "learning_rate": 4.950495049504951e-08, "loss": 1.0469, "step": 403 }, { "epoch": 1.0, "grad_norm": 0.1757795810699463, "learning_rate": 2.4752475247524754e-08, "loss": 1.0161, "step": 404 } ], "logging_steps": 1.0, "max_steps": 404, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.023190572360008e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }