{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 351, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002849002849002849, "grad_norm": 1.4838141202926636, "learning_rate": 1e-05, "loss": 2.366, "step": 1 }, { "epoch": 0.005698005698005698, "grad_norm": 1.4262256622314453, "learning_rate": 9.971509971509972e-06, "loss": 2.4139, "step": 2 }, { "epoch": 0.008547008547008548, "grad_norm": 1.3603845834732056, "learning_rate": 9.943019943019944e-06, "loss": 2.2811, "step": 3 }, { "epoch": 0.011396011396011397, "grad_norm": 1.307405710220337, "learning_rate": 9.914529914529915e-06, "loss": 2.2546, "step": 4 }, { "epoch": 0.014245014245014245, "grad_norm": 1.2675021886825562, "learning_rate": 9.886039886039887e-06, "loss": 2.2322, "step": 5 }, { "epoch": 0.017094017094017096, "grad_norm": 1.2081060409545898, "learning_rate": 9.857549857549858e-06, "loss": 2.2255, "step": 6 }, { "epoch": 0.019943019943019943, "grad_norm": 1.104535698890686, "learning_rate": 9.82905982905983e-06, "loss": 2.2139, "step": 7 }, { "epoch": 0.022792022792022793, "grad_norm": 1.0799970626831055, "learning_rate": 9.800569800569801e-06, "loss": 2.2049, "step": 8 }, { "epoch": 0.02564102564102564, "grad_norm": 0.9630372524261475, "learning_rate": 9.772079772079773e-06, "loss": 2.1045, "step": 9 }, { "epoch": 0.02849002849002849, "grad_norm": 0.9740710854530334, "learning_rate": 9.743589743589744e-06, "loss": 2.1669, "step": 10 }, { "epoch": 0.03133903133903134, "grad_norm": 0.9871430397033691, "learning_rate": 9.715099715099716e-06, "loss": 2.1666, "step": 11 }, { "epoch": 0.03418803418803419, "grad_norm": 0.9979017376899719, "learning_rate": 9.686609686609687e-06, "loss": 2.2347, "step": 12 }, { "epoch": 0.037037037037037035, "grad_norm": 0.921946108341217, "learning_rate": 9.658119658119659e-06, "loss": 2.1309, "step": 13 }, { "epoch": 0.039886039886039885, "grad_norm": 0.9126842617988586, "learning_rate": 9.62962962962963e-06, "loss": 2.0519, "step": 14 }, { "epoch": 0.042735042735042736, "grad_norm": 0.8587276935577393, "learning_rate": 9.601139601139601e-06, "loss": 2.0816, "step": 15 }, { "epoch": 0.045584045584045586, "grad_norm": 0.8564528822898865, "learning_rate": 9.572649572649575e-06, "loss": 2.0918, "step": 16 }, { "epoch": 0.04843304843304843, "grad_norm": 0.8116742968559265, "learning_rate": 9.544159544159544e-06, "loss": 1.9883, "step": 17 }, { "epoch": 0.05128205128205128, "grad_norm": 0.7653638124465942, "learning_rate": 9.515669515669516e-06, "loss": 2.008, "step": 18 }, { "epoch": 0.05413105413105413, "grad_norm": 0.758541464805603, "learning_rate": 9.487179487179487e-06, "loss": 2.0232, "step": 19 }, { "epoch": 0.05698005698005698, "grad_norm": 0.7756889462471008, "learning_rate": 9.458689458689459e-06, "loss": 2.0479, "step": 20 }, { "epoch": 0.05982905982905983, "grad_norm": 0.8094788789749146, "learning_rate": 9.430199430199432e-06, "loss": 2.0904, "step": 21 }, { "epoch": 0.06267806267806268, "grad_norm": 0.6886956691741943, "learning_rate": 9.401709401709402e-06, "loss": 1.9566, "step": 22 }, { "epoch": 0.06552706552706553, "grad_norm": 0.6763948798179626, "learning_rate": 9.373219373219375e-06, "loss": 1.9283, "step": 23 }, { "epoch": 0.06837606837606838, "grad_norm": 0.6754049062728882, "learning_rate": 9.344729344729345e-06, "loss": 1.9353, "step": 24 }, { "epoch": 0.07122507122507123, "grad_norm": 0.6518625617027283, "learning_rate": 9.316239316239318e-06, "loss": 1.9165, "step": 25 }, { "epoch": 0.07407407407407407, "grad_norm": 0.6429179906845093, "learning_rate": 9.287749287749288e-06, "loss": 1.9261, "step": 26 }, { "epoch": 0.07692307692307693, "grad_norm": 0.6808933019638062, "learning_rate": 9.25925925925926e-06, "loss": 1.9784, "step": 27 }, { "epoch": 0.07977207977207977, "grad_norm": 0.7099093198776245, "learning_rate": 9.230769230769232e-06, "loss": 1.9884, "step": 28 }, { "epoch": 0.08262108262108261, "grad_norm": 0.6004197597503662, "learning_rate": 9.202279202279202e-06, "loss": 1.8912, "step": 29 }, { "epoch": 0.08547008547008547, "grad_norm": 0.5550093650817871, "learning_rate": 9.173789173789175e-06, "loss": 1.7948, "step": 30 }, { "epoch": 0.08831908831908832, "grad_norm": 0.6177744269371033, "learning_rate": 9.145299145299145e-06, "loss": 1.9004, "step": 31 }, { "epoch": 0.09116809116809117, "grad_norm": 0.5736203789710999, "learning_rate": 9.116809116809118e-06, "loss": 1.8579, "step": 32 }, { "epoch": 0.09401709401709402, "grad_norm": 0.5455344915390015, "learning_rate": 9.088319088319088e-06, "loss": 1.8232, "step": 33 }, { "epoch": 0.09686609686609686, "grad_norm": 0.5457695126533508, "learning_rate": 9.059829059829061e-06, "loss": 1.8387, "step": 34 }, { "epoch": 0.09971509971509972, "grad_norm": 0.6495256423950195, "learning_rate": 9.031339031339033e-06, "loss": 1.9007, "step": 35 }, { "epoch": 0.10256410256410256, "grad_norm": 0.553978443145752, "learning_rate": 9.002849002849004e-06, "loss": 1.7967, "step": 36 }, { "epoch": 0.10541310541310542, "grad_norm": 0.6648301482200623, "learning_rate": 8.974358974358976e-06, "loss": 1.8203, "step": 37 }, { "epoch": 0.10826210826210826, "grad_norm": 0.604141354560852, "learning_rate": 8.945868945868947e-06, "loss": 1.8569, "step": 38 }, { "epoch": 0.1111111111111111, "grad_norm": 0.5134737491607666, "learning_rate": 8.917378917378919e-06, "loss": 1.7601, "step": 39 }, { "epoch": 0.11396011396011396, "grad_norm": 0.5309232473373413, "learning_rate": 8.888888888888888e-06, "loss": 1.8382, "step": 40 }, { "epoch": 0.1168091168091168, "grad_norm": 0.5077832937240601, "learning_rate": 8.860398860398861e-06, "loss": 1.7595, "step": 41 }, { "epoch": 0.11965811965811966, "grad_norm": 0.511060357093811, "learning_rate": 8.831908831908833e-06, "loss": 1.7981, "step": 42 }, { "epoch": 0.1225071225071225, "grad_norm": 0.48027244210243225, "learning_rate": 8.803418803418804e-06, "loss": 1.726, "step": 43 }, { "epoch": 0.12535612535612536, "grad_norm": 0.4738457202911377, "learning_rate": 8.774928774928776e-06, "loss": 1.7552, "step": 44 }, { "epoch": 0.1282051282051282, "grad_norm": 0.4702482223510742, "learning_rate": 8.746438746438747e-06, "loss": 1.7324, "step": 45 }, { "epoch": 0.13105413105413105, "grad_norm": 0.48187750577926636, "learning_rate": 8.717948717948719e-06, "loss": 1.7393, "step": 46 }, { "epoch": 0.1339031339031339, "grad_norm": 0.46382951736450195, "learning_rate": 8.68945868945869e-06, "loss": 1.7103, "step": 47 }, { "epoch": 0.13675213675213677, "grad_norm": 0.5777999758720398, "learning_rate": 8.660968660968662e-06, "loss": 1.7991, "step": 48 }, { "epoch": 0.1396011396011396, "grad_norm": 0.46543341875076294, "learning_rate": 8.632478632478633e-06, "loss": 1.7483, "step": 49 }, { "epoch": 0.14245014245014245, "grad_norm": 0.5707411766052246, "learning_rate": 8.603988603988605e-06, "loss": 1.7243, "step": 50 }, { "epoch": 0.1452991452991453, "grad_norm": 0.5121602416038513, "learning_rate": 8.575498575498576e-06, "loss": 1.7487, "step": 51 }, { "epoch": 0.14814814814814814, "grad_norm": 0.45368504524230957, "learning_rate": 8.547008547008548e-06, "loss": 1.7515, "step": 52 }, { "epoch": 0.150997150997151, "grad_norm": 0.44115832448005676, "learning_rate": 8.518518518518519e-06, "loss": 1.7165, "step": 53 }, { "epoch": 0.15384615384615385, "grad_norm": 0.43293115496635437, "learning_rate": 8.49002849002849e-06, "loss": 1.7247, "step": 54 }, { "epoch": 0.15669515669515668, "grad_norm": 0.4369884431362152, "learning_rate": 8.461538461538462e-06, "loss": 1.7046, "step": 55 }, { "epoch": 0.15954415954415954, "grad_norm": 0.44155994057655334, "learning_rate": 8.433048433048434e-06, "loss": 1.7397, "step": 56 }, { "epoch": 0.1623931623931624, "grad_norm": 0.4158068895339966, "learning_rate": 8.404558404558405e-06, "loss": 1.7038, "step": 57 }, { "epoch": 0.16524216524216523, "grad_norm": 0.4057186245918274, "learning_rate": 8.376068376068377e-06, "loss": 1.6994, "step": 58 }, { "epoch": 0.16809116809116809, "grad_norm": 0.4907611906528473, "learning_rate": 8.347578347578348e-06, "loss": 1.7246, "step": 59 }, { "epoch": 0.17094017094017094, "grad_norm": 0.4695189595222473, "learning_rate": 8.31908831908832e-06, "loss": 1.7315, "step": 60 }, { "epoch": 0.1737891737891738, "grad_norm": 0.40382280945777893, "learning_rate": 8.290598290598293e-06, "loss": 1.6904, "step": 61 }, { "epoch": 0.17663817663817663, "grad_norm": 0.42537087202072144, "learning_rate": 8.262108262108262e-06, "loss": 1.7413, "step": 62 }, { "epoch": 0.1794871794871795, "grad_norm": 0.45500096678733826, "learning_rate": 8.233618233618234e-06, "loss": 1.687, "step": 63 }, { "epoch": 0.18233618233618235, "grad_norm": 0.5165032148361206, "learning_rate": 8.205128205128205e-06, "loss": 1.6565, "step": 64 }, { "epoch": 0.18518518518518517, "grad_norm": 0.4045052230358124, "learning_rate": 8.176638176638177e-06, "loss": 1.679, "step": 65 }, { "epoch": 0.18803418803418803, "grad_norm": 0.5608129501342773, "learning_rate": 8.148148148148148e-06, "loss": 1.6782, "step": 66 }, { "epoch": 0.1908831908831909, "grad_norm": 0.42527124285697937, "learning_rate": 8.11965811965812e-06, "loss": 1.6164, "step": 67 }, { "epoch": 0.19373219373219372, "grad_norm": 0.39863091707229614, "learning_rate": 8.091168091168093e-06, "loss": 1.6564, "step": 68 }, { "epoch": 0.19658119658119658, "grad_norm": 0.40516364574432373, "learning_rate": 8.062678062678063e-06, "loss": 1.5941, "step": 69 }, { "epoch": 0.19943019943019943, "grad_norm": 0.42938536405563354, "learning_rate": 8.034188034188036e-06, "loss": 1.6471, "step": 70 }, { "epoch": 0.2022792022792023, "grad_norm": 0.3754700720310211, "learning_rate": 8.005698005698006e-06, "loss": 1.637, "step": 71 }, { "epoch": 0.20512820512820512, "grad_norm": 0.4259706735610962, "learning_rate": 7.977207977207979e-06, "loss": 1.6063, "step": 72 }, { "epoch": 0.20797720797720798, "grad_norm": 0.41146427392959595, "learning_rate": 7.948717948717949e-06, "loss": 1.6559, "step": 73 }, { "epoch": 0.21082621082621084, "grad_norm": 0.3858882486820221, "learning_rate": 7.92022792022792e-06, "loss": 1.6355, "step": 74 }, { "epoch": 0.21367521367521367, "grad_norm": 0.46363890171051025, "learning_rate": 7.891737891737893e-06, "loss": 1.7049, "step": 75 }, { "epoch": 0.21652421652421652, "grad_norm": 0.40698277950286865, "learning_rate": 7.863247863247863e-06, "loss": 1.6436, "step": 76 }, { "epoch": 0.21937321937321938, "grad_norm": 0.3834919035434723, "learning_rate": 7.834757834757836e-06, "loss": 1.5902, "step": 77 }, { "epoch": 0.2222222222222222, "grad_norm": 0.3849916160106659, "learning_rate": 7.806267806267806e-06, "loss": 1.6127, "step": 78 }, { "epoch": 0.22507122507122507, "grad_norm": 0.6278889179229736, "learning_rate": 7.77777777777778e-06, "loss": 1.6641, "step": 79 }, { "epoch": 0.22792022792022792, "grad_norm": 0.4905427396297455, "learning_rate": 7.749287749287749e-06, "loss": 1.5825, "step": 80 }, { "epoch": 0.23076923076923078, "grad_norm": 0.4097338318824768, "learning_rate": 7.720797720797722e-06, "loss": 1.5964, "step": 81 }, { "epoch": 0.2336182336182336, "grad_norm": 0.3952528238296509, "learning_rate": 7.692307692307694e-06, "loss": 1.6584, "step": 82 }, { "epoch": 0.23646723646723647, "grad_norm": 0.38913223147392273, "learning_rate": 7.663817663817665e-06, "loss": 1.5694, "step": 83 }, { "epoch": 0.23931623931623933, "grad_norm": 0.391777902841568, "learning_rate": 7.635327635327637e-06, "loss": 1.6378, "step": 84 }, { "epoch": 0.24216524216524216, "grad_norm": 0.41954416036605835, "learning_rate": 7.606837606837607e-06, "loss": 1.6073, "step": 85 }, { "epoch": 0.245014245014245, "grad_norm": 0.3974544107913971, "learning_rate": 7.578347578347579e-06, "loss": 1.6005, "step": 86 }, { "epoch": 0.24786324786324787, "grad_norm": 0.43366730213165283, "learning_rate": 7.54985754985755e-06, "loss": 1.5905, "step": 87 }, { "epoch": 0.25071225071225073, "grad_norm": 0.37673377990722656, "learning_rate": 7.521367521367522e-06, "loss": 1.562, "step": 88 }, { "epoch": 0.2535612535612536, "grad_norm": 0.48865458369255066, "learning_rate": 7.492877492877494e-06, "loss": 1.5934, "step": 89 }, { "epoch": 0.2564102564102564, "grad_norm": 0.38269999623298645, "learning_rate": 7.4643874643874645e-06, "loss": 1.6024, "step": 90 }, { "epoch": 0.25925925925925924, "grad_norm": 0.40311211347579956, "learning_rate": 7.435897435897437e-06, "loss": 1.6263, "step": 91 }, { "epoch": 0.2621082621082621, "grad_norm": 0.3799367845058441, "learning_rate": 7.4074074074074075e-06, "loss": 1.599, "step": 92 }, { "epoch": 0.26495726495726496, "grad_norm": 0.39559420943260193, "learning_rate": 7.37891737891738e-06, "loss": 1.6103, "step": 93 }, { "epoch": 0.2678062678062678, "grad_norm": 0.37981730699539185, "learning_rate": 7.350427350427351e-06, "loss": 1.598, "step": 94 }, { "epoch": 0.2706552706552707, "grad_norm": 0.3881866931915283, "learning_rate": 7.321937321937323e-06, "loss": 1.5843, "step": 95 }, { "epoch": 0.27350427350427353, "grad_norm": 0.3740154504776001, "learning_rate": 7.293447293447294e-06, "loss": 1.6069, "step": 96 }, { "epoch": 0.27635327635327633, "grad_norm": 0.3980708718299866, "learning_rate": 7.264957264957266e-06, "loss": 1.5667, "step": 97 }, { "epoch": 0.2792022792022792, "grad_norm": 0.37536391615867615, "learning_rate": 7.236467236467237e-06, "loss": 1.5926, "step": 98 }, { "epoch": 0.28205128205128205, "grad_norm": 0.4172308146953583, "learning_rate": 7.207977207977208e-06, "loss": 1.5371, "step": 99 }, { "epoch": 0.2849002849002849, "grad_norm": 0.39715775847435, "learning_rate": 7.17948717948718e-06, "loss": 1.5931, "step": 100 }, { "epoch": 0.28774928774928776, "grad_norm": 0.4845562279224396, "learning_rate": 7.1509971509971524e-06, "loss": 1.6267, "step": 101 }, { "epoch": 0.2905982905982906, "grad_norm": 0.38772156834602356, "learning_rate": 7.122507122507123e-06, "loss": 1.5949, "step": 102 }, { "epoch": 0.2934472934472934, "grad_norm": 0.3815441429615021, "learning_rate": 7.0940170940170945e-06, "loss": 1.5758, "step": 103 }, { "epoch": 0.2962962962962963, "grad_norm": 0.4964717626571655, "learning_rate": 7.065527065527066e-06, "loss": 1.5653, "step": 104 }, { "epoch": 0.29914529914529914, "grad_norm": 0.378212571144104, "learning_rate": 7.0370370370370375e-06, "loss": 1.536, "step": 105 }, { "epoch": 0.301994301994302, "grad_norm": 0.36918291449546814, "learning_rate": 7.008547008547009e-06, "loss": 1.555, "step": 106 }, { "epoch": 0.30484330484330485, "grad_norm": 0.39171653985977173, "learning_rate": 6.9800569800569804e-06, "loss": 1.6057, "step": 107 }, { "epoch": 0.3076923076923077, "grad_norm": 0.5356259942054749, "learning_rate": 6.951566951566953e-06, "loss": 1.5825, "step": 108 }, { "epoch": 0.31054131054131057, "grad_norm": 0.40925300121307373, "learning_rate": 6.923076923076923e-06, "loss": 1.6084, "step": 109 }, { "epoch": 0.31339031339031337, "grad_norm": 0.3943912386894226, "learning_rate": 6.894586894586896e-06, "loss": 1.5231, "step": 110 }, { "epoch": 0.3162393162393162, "grad_norm": 0.40087035298347473, "learning_rate": 6.866096866096866e-06, "loss": 1.5833, "step": 111 }, { "epoch": 0.3190883190883191, "grad_norm": 0.3822116553783417, "learning_rate": 6.837606837606839e-06, "loss": 1.5477, "step": 112 }, { "epoch": 0.32193732193732194, "grad_norm": 0.39919513463974, "learning_rate": 6.809116809116809e-06, "loss": 1.555, "step": 113 }, { "epoch": 0.3247863247863248, "grad_norm": 0.39128148555755615, "learning_rate": 6.780626780626781e-06, "loss": 1.5886, "step": 114 }, { "epoch": 0.32763532763532766, "grad_norm": 0.3694957196712494, "learning_rate": 6.752136752136753e-06, "loss": 1.4937, "step": 115 }, { "epoch": 0.33048433048433046, "grad_norm": 0.4147852659225464, "learning_rate": 6.723646723646724e-06, "loss": 1.5697, "step": 116 }, { "epoch": 0.3333333333333333, "grad_norm": 0.4091155230998993, "learning_rate": 6.695156695156696e-06, "loss": 1.511, "step": 117 }, { "epoch": 0.33618233618233617, "grad_norm": 0.3905634582042694, "learning_rate": 6.666666666666667e-06, "loss": 1.5462, "step": 118 }, { "epoch": 0.33903133903133903, "grad_norm": 0.4323817491531372, "learning_rate": 6.638176638176639e-06, "loss": 1.5459, "step": 119 }, { "epoch": 0.3418803418803419, "grad_norm": 0.38668230175971985, "learning_rate": 6.60968660968661e-06, "loss": 1.5664, "step": 120 }, { "epoch": 0.34472934472934474, "grad_norm": 0.4649519622325897, "learning_rate": 6.581196581196582e-06, "loss": 1.5827, "step": 121 }, { "epoch": 0.3475783475783476, "grad_norm": 0.4004313051700592, "learning_rate": 6.552706552706553e-06, "loss": 1.4653, "step": 122 }, { "epoch": 0.3504273504273504, "grad_norm": 0.3949541449546814, "learning_rate": 6.524216524216525e-06, "loss": 1.5285, "step": 123 }, { "epoch": 0.35327635327635326, "grad_norm": 0.6077877283096313, "learning_rate": 6.495726495726496e-06, "loss": 1.5648, "step": 124 }, { "epoch": 0.3561253561253561, "grad_norm": 0.5344558358192444, "learning_rate": 6.467236467236467e-06, "loss": 1.5311, "step": 125 }, { "epoch": 0.358974358974359, "grad_norm": 0.38816729187965393, "learning_rate": 6.438746438746439e-06, "loss": 1.5139, "step": 126 }, { "epoch": 0.36182336182336183, "grad_norm": 0.3926841914653778, "learning_rate": 6.410256410256412e-06, "loss": 1.5277, "step": 127 }, { "epoch": 0.3646723646723647, "grad_norm": 0.40280261635780334, "learning_rate": 6.381766381766382e-06, "loss": 1.553, "step": 128 }, { "epoch": 0.36752136752136755, "grad_norm": 0.38559049367904663, "learning_rate": 6.3532763532763546e-06, "loss": 1.5269, "step": 129 }, { "epoch": 0.37037037037037035, "grad_norm": 0.38594579696655273, "learning_rate": 6.324786324786325e-06, "loss": 1.5185, "step": 130 }, { "epoch": 0.3732193732193732, "grad_norm": 0.372689425945282, "learning_rate": 6.296296296296297e-06, "loss": 1.5058, "step": 131 }, { "epoch": 0.37606837606837606, "grad_norm": 0.3884972333908081, "learning_rate": 6.267806267806268e-06, "loss": 1.5255, "step": 132 }, { "epoch": 0.3789173789173789, "grad_norm": 0.40464359521865845, "learning_rate": 6.23931623931624e-06, "loss": 1.5212, "step": 133 }, { "epoch": 0.3817663817663818, "grad_norm": 0.4075316786766052, "learning_rate": 6.210826210826212e-06, "loss": 1.4987, "step": 134 }, { "epoch": 0.38461538461538464, "grad_norm": 0.41846784949302673, "learning_rate": 6.1823361823361825e-06, "loss": 1.5409, "step": 135 }, { "epoch": 0.38746438746438744, "grad_norm": 0.4159785509109497, "learning_rate": 6.153846153846155e-06, "loss": 1.5393, "step": 136 }, { "epoch": 0.3903133903133903, "grad_norm": 0.3839842975139618, "learning_rate": 6.1253561253561255e-06, "loss": 1.5139, "step": 137 }, { "epoch": 0.39316239316239315, "grad_norm": 0.5279687643051147, "learning_rate": 6.096866096866098e-06, "loss": 1.5747, "step": 138 }, { "epoch": 0.396011396011396, "grad_norm": 0.40492990612983704, "learning_rate": 6.0683760683760684e-06, "loss": 1.453, "step": 139 }, { "epoch": 0.39886039886039887, "grad_norm": 0.41720351576805115, "learning_rate": 6.039886039886041e-06, "loss": 1.4864, "step": 140 }, { "epoch": 0.4017094017094017, "grad_norm": 0.3866989016532898, "learning_rate": 6.011396011396012e-06, "loss": 1.4723, "step": 141 }, { "epoch": 0.4045584045584046, "grad_norm": 0.38849347829818726, "learning_rate": 5.982905982905983e-06, "loss": 1.4842, "step": 142 }, { "epoch": 0.4074074074074074, "grad_norm": 0.5428235530853271, "learning_rate": 5.954415954415955e-06, "loss": 1.5338, "step": 143 }, { "epoch": 0.41025641025641024, "grad_norm": 0.3945627808570862, "learning_rate": 5.925925925925926e-06, "loss": 1.528, "step": 144 }, { "epoch": 0.4131054131054131, "grad_norm": 0.3996782898902893, "learning_rate": 5.897435897435898e-06, "loss": 1.5212, "step": 145 }, { "epoch": 0.41595441595441596, "grad_norm": 0.4091893136501312, "learning_rate": 5.868945868945869e-06, "loss": 1.5419, "step": 146 }, { "epoch": 0.4188034188034188, "grad_norm": 0.3839370906352997, "learning_rate": 5.840455840455841e-06, "loss": 1.4778, "step": 147 }, { "epoch": 0.42165242165242167, "grad_norm": 0.3939463496208191, "learning_rate": 5.8119658119658126e-06, "loss": 1.4912, "step": 148 }, { "epoch": 0.42450142450142453, "grad_norm": 0.5488878488540649, "learning_rate": 5.783475783475784e-06, "loss": 1.459, "step": 149 }, { "epoch": 0.42735042735042733, "grad_norm": 0.6062666773796082, "learning_rate": 5.7549857549857555e-06, "loss": 1.4166, "step": 150 }, { "epoch": 0.4301994301994302, "grad_norm": 0.5629584193229675, "learning_rate": 5.726495726495727e-06, "loss": 1.4818, "step": 151 }, { "epoch": 0.43304843304843305, "grad_norm": 0.41644972562789917, "learning_rate": 5.6980056980056985e-06, "loss": 1.4625, "step": 152 }, { "epoch": 0.4358974358974359, "grad_norm": 0.4007890820503235, "learning_rate": 5.669515669515669e-06, "loss": 1.4898, "step": 153 }, { "epoch": 0.43874643874643876, "grad_norm": 0.5906901359558105, "learning_rate": 5.641025641025641e-06, "loss": 1.5235, "step": 154 }, { "epoch": 0.4415954415954416, "grad_norm": 0.5607777237892151, "learning_rate": 5.612535612535614e-06, "loss": 1.5234, "step": 155 }, { "epoch": 0.4444444444444444, "grad_norm": 0.3959032893180847, "learning_rate": 5.584045584045584e-06, "loss": 1.4788, "step": 156 }, { "epoch": 0.4472934472934473, "grad_norm": 0.4064564108848572, "learning_rate": 5.555555555555557e-06, "loss": 1.503, "step": 157 }, { "epoch": 0.45014245014245013, "grad_norm": 0.39798179268836975, "learning_rate": 5.527065527065527e-06, "loss": 1.5001, "step": 158 }, { "epoch": 0.452991452991453, "grad_norm": 0.45741236209869385, "learning_rate": 5.498575498575499e-06, "loss": 1.5012, "step": 159 }, { "epoch": 0.45584045584045585, "grad_norm": 0.45142683386802673, "learning_rate": 5.470085470085471e-06, "loss": 1.5039, "step": 160 }, { "epoch": 0.4586894586894587, "grad_norm": 0.39934027194976807, "learning_rate": 5.441595441595442e-06, "loss": 1.4824, "step": 161 }, { "epoch": 0.46153846153846156, "grad_norm": 0.3966750502586365, "learning_rate": 5.413105413105414e-06, "loss": 1.4791, "step": 162 }, { "epoch": 0.46438746438746437, "grad_norm": 0.4393257200717926, "learning_rate": 5.384615384615385e-06, "loss": 1.449, "step": 163 }, { "epoch": 0.4672364672364672, "grad_norm": 0.42632415890693665, "learning_rate": 5.356125356125357e-06, "loss": 1.5248, "step": 164 }, { "epoch": 0.4700854700854701, "grad_norm": 0.41508087515830994, "learning_rate": 5.327635327635328e-06, "loss": 1.4873, "step": 165 }, { "epoch": 0.47293447293447294, "grad_norm": 0.4311036467552185, "learning_rate": 5.2991452991453e-06, "loss": 1.4981, "step": 166 }, { "epoch": 0.4757834757834758, "grad_norm": 0.39872288703918457, "learning_rate": 5.270655270655271e-06, "loss": 1.4905, "step": 167 }, { "epoch": 0.47863247863247865, "grad_norm": 0.412751168012619, "learning_rate": 5.242165242165243e-06, "loss": 1.4728, "step": 168 }, { "epoch": 0.48148148148148145, "grad_norm": 0.40860670804977417, "learning_rate": 5.213675213675214e-06, "loss": 1.4986, "step": 169 }, { "epoch": 0.4843304843304843, "grad_norm": 0.4355701208114624, "learning_rate": 5.185185185185185e-06, "loss": 1.5109, "step": 170 }, { "epoch": 0.48717948717948717, "grad_norm": 0.43395113945007324, "learning_rate": 5.156695156695157e-06, "loss": 1.4995, "step": 171 }, { "epoch": 0.49002849002849, "grad_norm": 0.43208786845207214, "learning_rate": 5.128205128205128e-06, "loss": 1.4399, "step": 172 }, { "epoch": 0.4928774928774929, "grad_norm": 0.40610820055007935, "learning_rate": 5.0997150997151e-06, "loss": 1.4794, "step": 173 }, { "epoch": 0.49572649572649574, "grad_norm": 0.40242278575897217, "learning_rate": 5.071225071225072e-06, "loss": 1.4634, "step": 174 }, { "epoch": 0.4985754985754986, "grad_norm": 0.39585167169570923, "learning_rate": 5.042735042735043e-06, "loss": 1.4701, "step": 175 }, { "epoch": 0.5014245014245015, "grad_norm": 0.43933385610580444, "learning_rate": 5.014245014245015e-06, "loss": 1.4759, "step": 176 }, { "epoch": 0.5042735042735043, "grad_norm": 0.5048877000808716, "learning_rate": 4.985754985754986e-06, "loss": 1.4405, "step": 177 }, { "epoch": 0.5071225071225072, "grad_norm": 0.45279544591903687, "learning_rate": 4.957264957264958e-06, "loss": 1.5182, "step": 178 }, { "epoch": 0.50997150997151, "grad_norm": 0.40896686911582947, "learning_rate": 4.928774928774929e-06, "loss": 1.4857, "step": 179 }, { "epoch": 0.5128205128205128, "grad_norm": 0.6420154571533203, "learning_rate": 4.9002849002849006e-06, "loss": 1.4331, "step": 180 }, { "epoch": 0.5156695156695157, "grad_norm": 0.45687025785446167, "learning_rate": 4.871794871794872e-06, "loss": 1.4716, "step": 181 }, { "epoch": 0.5185185185185185, "grad_norm": 0.4174126088619232, "learning_rate": 4.8433048433048435e-06, "loss": 1.4636, "step": 182 }, { "epoch": 0.5213675213675214, "grad_norm": 0.3912286162376404, "learning_rate": 4.814814814814815e-06, "loss": 1.4534, "step": 183 }, { "epoch": 0.5242165242165242, "grad_norm": 0.44232121109962463, "learning_rate": 4.786324786324787e-06, "loss": 1.4286, "step": 184 }, { "epoch": 0.5270655270655271, "grad_norm": 0.4259029030799866, "learning_rate": 4.757834757834758e-06, "loss": 1.5174, "step": 185 }, { "epoch": 0.5299145299145299, "grad_norm": 0.39745402336120605, "learning_rate": 4.729344729344729e-06, "loss": 1.4393, "step": 186 }, { "epoch": 0.5327635327635327, "grad_norm": 0.7201390266418457, "learning_rate": 4.700854700854701e-06, "loss": 1.5721, "step": 187 }, { "epoch": 0.5356125356125356, "grad_norm": 0.42101916670799255, "learning_rate": 4.672364672364672e-06, "loss": 1.4847, "step": 188 }, { "epoch": 0.5384615384615384, "grad_norm": 0.4132574498653412, "learning_rate": 4.643874643874644e-06, "loss": 1.4632, "step": 189 }, { "epoch": 0.5413105413105413, "grad_norm": 0.44261249899864197, "learning_rate": 4.615384615384616e-06, "loss": 1.4767, "step": 190 }, { "epoch": 0.5441595441595442, "grad_norm": 0.4636523723602295, "learning_rate": 4.586894586894588e-06, "loss": 1.4868, "step": 191 }, { "epoch": 0.5470085470085471, "grad_norm": 0.4402620792388916, "learning_rate": 4.558404558404559e-06, "loss": 1.5096, "step": 192 }, { "epoch": 0.5498575498575499, "grad_norm": 0.46384042501449585, "learning_rate": 4.5299145299145306e-06, "loss": 1.5022, "step": 193 }, { "epoch": 0.5527065527065527, "grad_norm": 0.4248226583003998, "learning_rate": 4.501424501424502e-06, "loss": 1.4968, "step": 194 }, { "epoch": 0.5555555555555556, "grad_norm": 0.41844654083251953, "learning_rate": 4.4729344729344735e-06, "loss": 1.4441, "step": 195 }, { "epoch": 0.5584045584045584, "grad_norm": 0.4129433035850525, "learning_rate": 4.444444444444444e-06, "loss": 1.4598, "step": 196 }, { "epoch": 0.5612535612535613, "grad_norm": 0.4882029891014099, "learning_rate": 4.4159544159544165e-06, "loss": 1.5211, "step": 197 }, { "epoch": 0.5641025641025641, "grad_norm": 0.4571973979473114, "learning_rate": 4.387464387464388e-06, "loss": 1.4964, "step": 198 }, { "epoch": 0.5669515669515669, "grad_norm": 0.4153326451778412, "learning_rate": 4.358974358974359e-06, "loss": 1.4912, "step": 199 }, { "epoch": 0.5698005698005698, "grad_norm": 0.41810521483421326, "learning_rate": 4.330484330484331e-06, "loss": 1.4881, "step": 200 }, { "epoch": 0.5726495726495726, "grad_norm": 0.43121734261512756, "learning_rate": 4.301994301994302e-06, "loss": 1.4489, "step": 201 }, { "epoch": 0.5754985754985755, "grad_norm": 0.39392393827438354, "learning_rate": 4.273504273504274e-06, "loss": 1.4354, "step": 202 }, { "epoch": 0.5783475783475783, "grad_norm": 0.4206382632255554, "learning_rate": 4.245014245014245e-06, "loss": 1.4294, "step": 203 }, { "epoch": 0.5811965811965812, "grad_norm": 0.7128792405128479, "learning_rate": 4.216524216524217e-06, "loss": 1.4796, "step": 204 }, { "epoch": 0.584045584045584, "grad_norm": 0.42449796199798584, "learning_rate": 4.188034188034188e-06, "loss": 1.44, "step": 205 }, { "epoch": 0.5868945868945868, "grad_norm": 0.40819981694221497, "learning_rate": 4.15954415954416e-06, "loss": 1.4674, "step": 206 }, { "epoch": 0.5897435897435898, "grad_norm": 0.4191708564758301, "learning_rate": 4.131054131054131e-06, "loss": 1.4231, "step": 207 }, { "epoch": 0.5925925925925926, "grad_norm": 0.4241287410259247, "learning_rate": 4.102564102564103e-06, "loss": 1.4841, "step": 208 }, { "epoch": 0.5954415954415955, "grad_norm": 0.4283653795719147, "learning_rate": 4.074074074074074e-06, "loss": 1.4251, "step": 209 }, { "epoch": 0.5982905982905983, "grad_norm": 0.41446876525878906, "learning_rate": 4.0455840455840465e-06, "loss": 1.4496, "step": 210 }, { "epoch": 0.6011396011396012, "grad_norm": 0.4163020849227905, "learning_rate": 4.017094017094018e-06, "loss": 1.4273, "step": 211 }, { "epoch": 0.603988603988604, "grad_norm": 0.42851346731185913, "learning_rate": 3.9886039886039894e-06, "loss": 1.4727, "step": 212 }, { "epoch": 0.6068376068376068, "grad_norm": 0.4239060878753662, "learning_rate": 3.96011396011396e-06, "loss": 1.4318, "step": 213 }, { "epoch": 0.6096866096866097, "grad_norm": 0.40873628854751587, "learning_rate": 3.9316239316239315e-06, "loss": 1.4548, "step": 214 }, { "epoch": 0.6125356125356125, "grad_norm": 0.45280134677886963, "learning_rate": 3.903133903133903e-06, "loss": 1.4932, "step": 215 }, { "epoch": 0.6153846153846154, "grad_norm": 0.6247657537460327, "learning_rate": 3.8746438746438745e-06, "loss": 1.4499, "step": 216 }, { "epoch": 0.6182336182336182, "grad_norm": 0.4122682511806488, "learning_rate": 3.846153846153847e-06, "loss": 1.4218, "step": 217 }, { "epoch": 0.6210826210826211, "grad_norm": 0.40727391839027405, "learning_rate": 3.817663817663818e-06, "loss": 1.4726, "step": 218 }, { "epoch": 0.6239316239316239, "grad_norm": 0.4725242555141449, "learning_rate": 3.7891737891737893e-06, "loss": 1.4214, "step": 219 }, { "epoch": 0.6267806267806267, "grad_norm": 0.45712363719940186, "learning_rate": 3.760683760683761e-06, "loss": 1.4518, "step": 220 }, { "epoch": 0.6296296296296297, "grad_norm": 0.40573611855506897, "learning_rate": 3.7321937321937323e-06, "loss": 1.459, "step": 221 }, { "epoch": 0.6324786324786325, "grad_norm": 0.4086320400238037, "learning_rate": 3.7037037037037037e-06, "loss": 1.4395, "step": 222 }, { "epoch": 0.6353276353276354, "grad_norm": 0.4158555567264557, "learning_rate": 3.6752136752136756e-06, "loss": 1.4436, "step": 223 }, { "epoch": 0.6381766381766382, "grad_norm": 0.5216575264930725, "learning_rate": 3.646723646723647e-06, "loss": 1.4659, "step": 224 }, { "epoch": 0.6410256410256411, "grad_norm": 0.394228994846344, "learning_rate": 3.6182336182336186e-06, "loss": 1.4637, "step": 225 }, { "epoch": 0.6438746438746439, "grad_norm": 0.41643351316452026, "learning_rate": 3.58974358974359e-06, "loss": 1.4298, "step": 226 }, { "epoch": 0.6467236467236467, "grad_norm": 0.407087117433548, "learning_rate": 3.5612535612535615e-06, "loss": 1.4426, "step": 227 }, { "epoch": 0.6495726495726496, "grad_norm": 0.47986599802970886, "learning_rate": 3.532763532763533e-06, "loss": 1.5079, "step": 228 }, { "epoch": 0.6524216524216524, "grad_norm": 0.42481309175491333, "learning_rate": 3.5042735042735045e-06, "loss": 1.4422, "step": 229 }, { "epoch": 0.6552706552706553, "grad_norm": 0.43366938829421997, "learning_rate": 3.4757834757834764e-06, "loss": 1.467, "step": 230 }, { "epoch": 0.6581196581196581, "grad_norm": 0.5313072204589844, "learning_rate": 3.447293447293448e-06, "loss": 1.4382, "step": 231 }, { "epoch": 0.6609686609686609, "grad_norm": 0.40050390362739563, "learning_rate": 3.4188034188034193e-06, "loss": 1.4024, "step": 232 }, { "epoch": 0.6638176638176638, "grad_norm": 0.42196667194366455, "learning_rate": 3.3903133903133904e-06, "loss": 1.4825, "step": 233 }, { "epoch": 0.6666666666666666, "grad_norm": 0.4109940826892853, "learning_rate": 3.361823361823362e-06, "loss": 1.4036, "step": 234 }, { "epoch": 0.6695156695156695, "grad_norm": 0.41641300916671753, "learning_rate": 3.3333333333333333e-06, "loss": 1.4409, "step": 235 }, { "epoch": 0.6723646723646723, "grad_norm": 0.4459202289581299, "learning_rate": 3.304843304843305e-06, "loss": 1.4422, "step": 236 }, { "epoch": 0.6752136752136753, "grad_norm": 0.40903767943382263, "learning_rate": 3.2763532763532767e-06, "loss": 1.4375, "step": 237 }, { "epoch": 0.6780626780626781, "grad_norm": 0.40536248683929443, "learning_rate": 3.247863247863248e-06, "loss": 1.4357, "step": 238 }, { "epoch": 0.6809116809116809, "grad_norm": 0.43088406324386597, "learning_rate": 3.2193732193732196e-06, "loss": 1.4428, "step": 239 }, { "epoch": 0.6837606837606838, "grad_norm": 0.43017005920410156, "learning_rate": 3.190883190883191e-06, "loss": 1.4213, "step": 240 }, { "epoch": 0.6866096866096866, "grad_norm": 0.43592897057533264, "learning_rate": 3.1623931623931626e-06, "loss": 1.5107, "step": 241 }, { "epoch": 0.6894586894586895, "grad_norm": 0.6451869606971741, "learning_rate": 3.133903133903134e-06, "loss": 1.4993, "step": 242 }, { "epoch": 0.6923076923076923, "grad_norm": 0.45624542236328125, "learning_rate": 3.105413105413106e-06, "loss": 1.4297, "step": 243 }, { "epoch": 0.6951566951566952, "grad_norm": 0.4131554067134857, "learning_rate": 3.0769230769230774e-06, "loss": 1.4272, "step": 244 }, { "epoch": 0.698005698005698, "grad_norm": 0.49703848361968994, "learning_rate": 3.048433048433049e-06, "loss": 1.4175, "step": 245 }, { "epoch": 0.7008547008547008, "grad_norm": 0.4367448091506958, "learning_rate": 3.0199430199430204e-06, "loss": 1.4585, "step": 246 }, { "epoch": 0.7037037037037037, "grad_norm": 0.44849011301994324, "learning_rate": 2.9914529914529914e-06, "loss": 1.4596, "step": 247 }, { "epoch": 0.7065527065527065, "grad_norm": 0.42930400371551514, "learning_rate": 2.962962962962963e-06, "loss": 1.4335, "step": 248 }, { "epoch": 0.7094017094017094, "grad_norm": 0.4332965612411499, "learning_rate": 2.9344729344729344e-06, "loss": 1.4509, "step": 249 }, { "epoch": 0.7122507122507122, "grad_norm": 0.44173556566238403, "learning_rate": 2.9059829059829063e-06, "loss": 1.4596, "step": 250 }, { "epoch": 0.7150997150997151, "grad_norm": 0.40930160880088806, "learning_rate": 2.8774928774928778e-06, "loss": 1.4327, "step": 251 }, { "epoch": 0.717948717948718, "grad_norm": 0.4137099087238312, "learning_rate": 2.8490028490028492e-06, "loss": 1.4119, "step": 252 }, { "epoch": 0.7207977207977208, "grad_norm": 0.43292713165283203, "learning_rate": 2.8205128205128207e-06, "loss": 1.4352, "step": 253 }, { "epoch": 0.7236467236467237, "grad_norm": 0.6853729486465454, "learning_rate": 2.792022792022792e-06, "loss": 1.4859, "step": 254 }, { "epoch": 0.7264957264957265, "grad_norm": 0.4223368465900421, "learning_rate": 2.7635327635327636e-06, "loss": 1.4189, "step": 255 }, { "epoch": 0.7293447293447294, "grad_norm": 0.4098432958126068, "learning_rate": 2.7350427350427355e-06, "loss": 1.4474, "step": 256 }, { "epoch": 0.7321937321937322, "grad_norm": 0.42546141147613525, "learning_rate": 2.706552706552707e-06, "loss": 1.4447, "step": 257 }, { "epoch": 0.7350427350427351, "grad_norm": 0.434319406747818, "learning_rate": 2.6780626780626785e-06, "loss": 1.4559, "step": 258 }, { "epoch": 0.7378917378917379, "grad_norm": 0.5959000587463379, "learning_rate": 2.64957264957265e-06, "loss": 1.3711, "step": 259 }, { "epoch": 0.7407407407407407, "grad_norm": 0.6558396220207214, "learning_rate": 2.6210826210826214e-06, "loss": 1.3735, "step": 260 }, { "epoch": 0.7435897435897436, "grad_norm": 0.4049711525440216, "learning_rate": 2.5925925925925925e-06, "loss": 1.4327, "step": 261 }, { "epoch": 0.7464387464387464, "grad_norm": 0.4057099223136902, "learning_rate": 2.564102564102564e-06, "loss": 1.4173, "step": 262 }, { "epoch": 0.7492877492877493, "grad_norm": 0.44100022315979004, "learning_rate": 2.535612535612536e-06, "loss": 1.4568, "step": 263 }, { "epoch": 0.7521367521367521, "grad_norm": 0.4259463846683502, "learning_rate": 2.5071225071225073e-06, "loss": 1.4473, "step": 264 }, { "epoch": 0.7549857549857549, "grad_norm": 0.47139763832092285, "learning_rate": 2.478632478632479e-06, "loss": 1.4467, "step": 265 }, { "epoch": 0.7578347578347578, "grad_norm": 0.4066116511821747, "learning_rate": 2.4501424501424503e-06, "loss": 1.4148, "step": 266 }, { "epoch": 0.7606837606837606, "grad_norm": 0.4442392587661743, "learning_rate": 2.4216524216524218e-06, "loss": 1.4166, "step": 267 }, { "epoch": 0.7635327635327636, "grad_norm": 0.4146524667739868, "learning_rate": 2.3931623931623937e-06, "loss": 1.4214, "step": 268 }, { "epoch": 0.7663817663817664, "grad_norm": 0.4352812170982361, "learning_rate": 2.3646723646723647e-06, "loss": 1.4268, "step": 269 }, { "epoch": 0.7692307692307693, "grad_norm": 0.4416466951370239, "learning_rate": 2.336182336182336e-06, "loss": 1.3947, "step": 270 }, { "epoch": 0.7720797720797721, "grad_norm": 0.4440385699272156, "learning_rate": 2.307692307692308e-06, "loss": 1.4114, "step": 271 }, { "epoch": 0.7749287749287749, "grad_norm": 0.42091333866119385, "learning_rate": 2.2792022792022796e-06, "loss": 1.4343, "step": 272 }, { "epoch": 0.7777777777777778, "grad_norm": 0.39965999126434326, "learning_rate": 2.250712250712251e-06, "loss": 1.4401, "step": 273 }, { "epoch": 0.7806267806267806, "grad_norm": 0.4088633060455322, "learning_rate": 2.222222222222222e-06, "loss": 1.3808, "step": 274 }, { "epoch": 0.7834757834757835, "grad_norm": 0.42541617155075073, "learning_rate": 2.193732193732194e-06, "loss": 1.45, "step": 275 }, { "epoch": 0.7863247863247863, "grad_norm": 0.42558950185775757, "learning_rate": 2.1652421652421654e-06, "loss": 1.4317, "step": 276 }, { "epoch": 0.7891737891737892, "grad_norm": 0.4297507703304291, "learning_rate": 2.136752136752137e-06, "loss": 1.4493, "step": 277 }, { "epoch": 0.792022792022792, "grad_norm": 0.42826247215270996, "learning_rate": 2.1082621082621084e-06, "loss": 1.4665, "step": 278 }, { "epoch": 0.7948717948717948, "grad_norm": 0.4104038178920746, "learning_rate": 2.07977207977208e-06, "loss": 1.3966, "step": 279 }, { "epoch": 0.7977207977207977, "grad_norm": 0.5832846164703369, "learning_rate": 2.0512820512820513e-06, "loss": 1.409, "step": 280 }, { "epoch": 0.8005698005698005, "grad_norm": 0.4132280647754669, "learning_rate": 2.0227920227920232e-06, "loss": 1.421, "step": 281 }, { "epoch": 0.8034188034188035, "grad_norm": 0.5175873637199402, "learning_rate": 1.9943019943019947e-06, "loss": 1.4251, "step": 282 }, { "epoch": 0.8062678062678063, "grad_norm": 0.3983429968357086, "learning_rate": 1.9658119658119658e-06, "loss": 1.4305, "step": 283 }, { "epoch": 0.8091168091168092, "grad_norm": 0.4195236563682556, "learning_rate": 1.9373219373219372e-06, "loss": 1.3955, "step": 284 }, { "epoch": 0.811965811965812, "grad_norm": 0.44437727332115173, "learning_rate": 1.908831908831909e-06, "loss": 1.3945, "step": 285 }, { "epoch": 0.8148148148148148, "grad_norm": 0.4069578945636749, "learning_rate": 1.8803418803418804e-06, "loss": 1.3872, "step": 286 }, { "epoch": 0.8176638176638177, "grad_norm": 0.4366849660873413, "learning_rate": 1.8518518518518519e-06, "loss": 1.4303, "step": 287 }, { "epoch": 0.8205128205128205, "grad_norm": 0.42686140537261963, "learning_rate": 1.8233618233618236e-06, "loss": 1.4297, "step": 288 }, { "epoch": 0.8233618233618234, "grad_norm": 0.4372996687889099, "learning_rate": 1.794871794871795e-06, "loss": 1.4205, "step": 289 }, { "epoch": 0.8262108262108262, "grad_norm": 0.5185275077819824, "learning_rate": 1.7663817663817665e-06, "loss": 1.4072, "step": 290 }, { "epoch": 0.8290598290598291, "grad_norm": 0.4375689625740051, "learning_rate": 1.7378917378917382e-06, "loss": 1.4093, "step": 291 }, { "epoch": 0.8319088319088319, "grad_norm": 0.6223400235176086, "learning_rate": 1.7094017094017097e-06, "loss": 1.4038, "step": 292 }, { "epoch": 0.8347578347578347, "grad_norm": 0.49658337235450745, "learning_rate": 1.680911680911681e-06, "loss": 1.4587, "step": 293 }, { "epoch": 0.8376068376068376, "grad_norm": 0.48749840259552, "learning_rate": 1.6524216524216524e-06, "loss": 1.4573, "step": 294 }, { "epoch": 0.8404558404558404, "grad_norm": 0.4375877380371094, "learning_rate": 1.623931623931624e-06, "loss": 1.4126, "step": 295 }, { "epoch": 0.8433048433048433, "grad_norm": 0.5864587426185608, "learning_rate": 1.5954415954415956e-06, "loss": 1.3915, "step": 296 }, { "epoch": 0.8461538461538461, "grad_norm": 0.4243745803833008, "learning_rate": 1.566951566951567e-06, "loss": 1.4475, "step": 297 }, { "epoch": 0.8490028490028491, "grad_norm": 0.5398270487785339, "learning_rate": 1.5384615384615387e-06, "loss": 1.3658, "step": 298 }, { "epoch": 0.8518518518518519, "grad_norm": 0.4248296916484833, "learning_rate": 1.5099715099715102e-06, "loss": 1.3898, "step": 299 }, { "epoch": 0.8547008547008547, "grad_norm": 0.4054194986820221, "learning_rate": 1.4814814814814815e-06, "loss": 1.3806, "step": 300 }, { "epoch": 0.8575498575498576, "grad_norm": 0.4230331778526306, "learning_rate": 1.4529914529914531e-06, "loss": 1.431, "step": 301 }, { "epoch": 0.8603988603988604, "grad_norm": 0.42785853147506714, "learning_rate": 1.4245014245014246e-06, "loss": 1.3905, "step": 302 }, { "epoch": 0.8632478632478633, "grad_norm": 0.6043952703475952, "learning_rate": 1.396011396011396e-06, "loss": 1.444, "step": 303 }, { "epoch": 0.8660968660968661, "grad_norm": 0.41546547412872314, "learning_rate": 1.3675213675213678e-06, "loss": 1.3876, "step": 304 }, { "epoch": 0.8689458689458689, "grad_norm": 0.5535686612129211, "learning_rate": 1.3390313390313392e-06, "loss": 1.3663, "step": 305 }, { "epoch": 0.8717948717948718, "grad_norm": 0.43172240257263184, "learning_rate": 1.3105413105413107e-06, "loss": 1.4281, "step": 306 }, { "epoch": 0.8746438746438746, "grad_norm": 0.4234292209148407, "learning_rate": 1.282051282051282e-06, "loss": 1.4105, "step": 307 }, { "epoch": 0.8774928774928775, "grad_norm": 0.4184323847293854, "learning_rate": 1.2535612535612537e-06, "loss": 1.3755, "step": 308 }, { "epoch": 0.8803418803418803, "grad_norm": 0.6069676876068115, "learning_rate": 1.2250712250712251e-06, "loss": 1.3666, "step": 309 }, { "epoch": 0.8831908831908832, "grad_norm": 0.4531959891319275, "learning_rate": 1.1965811965811968e-06, "loss": 1.4109, "step": 310 }, { "epoch": 0.886039886039886, "grad_norm": 0.49059048295021057, "learning_rate": 1.168091168091168e-06, "loss": 1.4259, "step": 311 }, { "epoch": 0.8888888888888888, "grad_norm": 0.4053284823894501, "learning_rate": 1.1396011396011398e-06, "loss": 1.4173, "step": 312 }, { "epoch": 0.8917378917378918, "grad_norm": 0.4258776307106018, "learning_rate": 1.111111111111111e-06, "loss": 1.4079, "step": 313 }, { "epoch": 0.8945868945868946, "grad_norm": 0.4315298795700073, "learning_rate": 1.0826210826210827e-06, "loss": 1.3791, "step": 314 }, { "epoch": 0.8974358974358975, "grad_norm": 0.48497509956359863, "learning_rate": 1.0541310541310542e-06, "loss": 1.4389, "step": 315 }, { "epoch": 0.9002849002849003, "grad_norm": 0.4596964716911316, "learning_rate": 1.0256410256410257e-06, "loss": 1.4253, "step": 316 }, { "epoch": 0.9031339031339032, "grad_norm": 0.43682560324668884, "learning_rate": 9.971509971509974e-07, "loss": 1.4358, "step": 317 }, { "epoch": 0.905982905982906, "grad_norm": 0.5284684896469116, "learning_rate": 9.686609686609686e-07, "loss": 1.3974, "step": 318 }, { "epoch": 0.9088319088319088, "grad_norm": 0.444614440202713, "learning_rate": 9.401709401709402e-07, "loss": 1.4258, "step": 319 }, { "epoch": 0.9116809116809117, "grad_norm": 0.41446149349212646, "learning_rate": 9.116809116809118e-07, "loss": 1.4093, "step": 320 }, { "epoch": 0.9145299145299145, "grad_norm": 0.505181074142456, "learning_rate": 8.831908831908833e-07, "loss": 1.4355, "step": 321 }, { "epoch": 0.9173789173789174, "grad_norm": 0.41858991980552673, "learning_rate": 8.547008547008548e-07, "loss": 1.4259, "step": 322 }, { "epoch": 0.9202279202279202, "grad_norm": 0.6958276033401489, "learning_rate": 8.262108262108262e-07, "loss": 1.4456, "step": 323 }, { "epoch": 0.9230769230769231, "grad_norm": 0.4824206829071045, "learning_rate": 7.977207977207978e-07, "loss": 1.4341, "step": 324 }, { "epoch": 0.9259259259259259, "grad_norm": 0.4208286702632904, "learning_rate": 7.692307692307694e-07, "loss": 1.4401, "step": 325 }, { "epoch": 0.9287749287749287, "grad_norm": 0.43090713024139404, "learning_rate": 7.407407407407407e-07, "loss": 1.4295, "step": 326 }, { "epoch": 0.9316239316239316, "grad_norm": 0.4124811589717865, "learning_rate": 7.122507122507123e-07, "loss": 1.4234, "step": 327 }, { "epoch": 0.9344729344729344, "grad_norm": 0.4865758419036865, "learning_rate": 6.837606837606839e-07, "loss": 1.4787, "step": 328 }, { "epoch": 0.9373219373219374, "grad_norm": 0.4624764323234558, "learning_rate": 6.552706552706554e-07, "loss": 1.3913, "step": 329 }, { "epoch": 0.9401709401709402, "grad_norm": 0.4168078899383545, "learning_rate": 6.267806267806268e-07, "loss": 1.3954, "step": 330 }, { "epoch": 0.9430199430199431, "grad_norm": 0.43121403455734253, "learning_rate": 5.982905982905984e-07, "loss": 1.4046, "step": 331 }, { "epoch": 0.9458689458689459, "grad_norm": 0.43017080426216125, "learning_rate": 5.698005698005699e-07, "loss": 1.4471, "step": 332 }, { "epoch": 0.9487179487179487, "grad_norm": 0.41371017694473267, "learning_rate": 5.413105413105414e-07, "loss": 1.3891, "step": 333 }, { "epoch": 0.9515669515669516, "grad_norm": 0.42624595761299133, "learning_rate": 5.128205128205128e-07, "loss": 1.4431, "step": 334 }, { "epoch": 0.9544159544159544, "grad_norm": 0.4311563968658447, "learning_rate": 4.843304843304843e-07, "loss": 1.3985, "step": 335 }, { "epoch": 0.9572649572649573, "grad_norm": 0.42693498730659485, "learning_rate": 4.558404558404559e-07, "loss": 1.3818, "step": 336 }, { "epoch": 0.9601139601139601, "grad_norm": 0.7170986533164978, "learning_rate": 4.273504273504274e-07, "loss": 1.4704, "step": 337 }, { "epoch": 0.9629629629629629, "grad_norm": 0.42342740297317505, "learning_rate": 3.988603988603989e-07, "loss": 1.4172, "step": 338 }, { "epoch": 0.9658119658119658, "grad_norm": 0.5637214183807373, "learning_rate": 3.7037037037037036e-07, "loss": 1.3729, "step": 339 }, { "epoch": 0.9686609686609686, "grad_norm": 0.42340558767318726, "learning_rate": 3.4188034188034194e-07, "loss": 1.3958, "step": 340 }, { "epoch": 0.9715099715099715, "grad_norm": 0.4184475541114807, "learning_rate": 3.133903133903134e-07, "loss": 1.4015, "step": 341 }, { "epoch": 0.9743589743589743, "grad_norm": 0.42320722341537476, "learning_rate": 2.8490028490028494e-07, "loss": 1.396, "step": 342 }, { "epoch": 0.9772079772079773, "grad_norm": 0.4045957624912262, "learning_rate": 2.564102564102564e-07, "loss": 1.4237, "step": 343 }, { "epoch": 0.98005698005698, "grad_norm": 0.4371383488178253, "learning_rate": 2.2792022792022794e-07, "loss": 1.4127, "step": 344 }, { "epoch": 0.9829059829059829, "grad_norm": 0.5121440291404724, "learning_rate": 1.9943019943019944e-07, "loss": 1.4109, "step": 345 }, { "epoch": 0.9857549857549858, "grad_norm": 0.42042669653892517, "learning_rate": 1.7094017094017097e-07, "loss": 1.352, "step": 346 }, { "epoch": 0.9886039886039886, "grad_norm": 0.7115257382392883, "learning_rate": 1.4245014245014247e-07, "loss": 1.404, "step": 347 }, { "epoch": 0.9914529914529915, "grad_norm": 0.5735996961593628, "learning_rate": 1.1396011396011397e-07, "loss": 1.4461, "step": 348 }, { "epoch": 0.9943019943019943, "grad_norm": 0.4333067834377289, "learning_rate": 8.547008547008549e-08, "loss": 1.403, "step": 349 }, { "epoch": 0.9971509971509972, "grad_norm": 0.4068621098995209, "learning_rate": 5.6980056980056986e-08, "loss": 1.4357, "step": 350 }, { "epoch": 1.0, "grad_norm": 0.6922910809516907, "learning_rate": 2.8490028490028493e-08, "loss": 1.4253, "step": 351 } ], "logging_steps": 1.0, "max_steps": 351, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6380890003275776e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }