{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 203, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0049261083743842365, "grad_norm": 28.522218704223633, "learning_rate": 1e-05, "loss": 1.8038, "step": 1 }, { "epoch": 0.009852216748768473, "grad_norm": 10.439436912536621, "learning_rate": 9.999401258983426e-06, "loss": 1.7024, "step": 2 }, { "epoch": 0.014778325123152709, "grad_norm": 10.16576099395752, "learning_rate": 9.997605179330018e-06, "loss": 1.6585, "step": 3 }, { "epoch": 0.019704433497536946, "grad_norm": 8.24977970123291, "learning_rate": 9.994612191194407e-06, "loss": 1.5447, "step": 4 }, { "epoch": 0.024630541871921183, "grad_norm": 4.327300071716309, "learning_rate": 9.990423011386489e-06, "loss": 1.4008, "step": 5 }, { "epoch": 0.029556650246305417, "grad_norm": 3.9305260181427, "learning_rate": 9.98503864319978e-06, "loss": 1.4287, "step": 6 }, { "epoch": 0.034482758620689655, "grad_norm": 2.2586233615875244, "learning_rate": 9.978460376171113e-06, "loss": 1.4288, "step": 7 }, { "epoch": 0.03940886699507389, "grad_norm": 2.152981996536255, "learning_rate": 9.970689785771798e-06, "loss": 1.2331, "step": 8 }, { "epoch": 0.04433497536945813, "grad_norm": 2.3123421669006348, "learning_rate": 9.961728733030318e-06, "loss": 1.5576, "step": 9 }, { "epoch": 0.04926108374384237, "grad_norm": 2.1223628520965576, "learning_rate": 9.951579364086603e-06, "loss": 1.3364, "step": 10 }, { "epoch": 0.054187192118226604, "grad_norm": 2.337981939315796, "learning_rate": 9.940244109678043e-06, "loss": 1.3432, "step": 11 }, { "epoch": 0.059113300492610835, "grad_norm": 2.3937160968780518, "learning_rate": 9.927725684557339e-06, "loss": 1.3346, "step": 12 }, { "epoch": 0.06403940886699508, "grad_norm": 1.533258080482483, "learning_rate": 9.914027086842323e-06, "loss": 1.2448, "step": 13 }, { "epoch": 0.06896551724137931, "grad_norm": 1.9601627588272095, "learning_rate": 9.899151597297923e-06, "loss": 1.2093, "step": 14 }, { "epoch": 0.07389162561576355, "grad_norm": 1.4371508359909058, "learning_rate": 9.883102778550434e-06, "loss": 1.0955, "step": 15 }, { "epoch": 0.07881773399014778, "grad_norm": 1.5759714841842651, "learning_rate": 9.865884474234275e-06, "loss": 1.2615, "step": 16 }, { "epoch": 0.08374384236453201, "grad_norm": 1.4584101438522339, "learning_rate": 9.847500808071458e-06, "loss": 1.2764, "step": 17 }, { "epoch": 0.08866995073891626, "grad_norm": 1.432774305343628, "learning_rate": 9.82795618288397e-06, "loss": 1.11, "step": 18 }, { "epoch": 0.09359605911330049, "grad_norm": 1.7812994718551636, "learning_rate": 9.807255279539313e-06, "loss": 1.2687, "step": 19 }, { "epoch": 0.09852216748768473, "grad_norm": 1.3773082494735718, "learning_rate": 9.78540305582945e-06, "loss": 1.1375, "step": 20 }, { "epoch": 0.10344827586206896, "grad_norm": 1.4377540349960327, "learning_rate": 9.762404745283439e-06, "loss": 1.1887, "step": 21 }, { "epoch": 0.10837438423645321, "grad_norm": 1.2567236423492432, "learning_rate": 9.738265855914014e-06, "loss": 1.1226, "step": 22 }, { "epoch": 0.11330049261083744, "grad_norm": 1.288097620010376, "learning_rate": 9.712992168898436e-06, "loss": 1.1442, "step": 23 }, { "epoch": 0.11822660098522167, "grad_norm": 1.3083038330078125, "learning_rate": 9.686589737193929e-06, "loss": 1.1809, "step": 24 }, { "epoch": 0.12315270935960591, "grad_norm": 1.0722836256027222, "learning_rate": 9.659064884088017e-06, "loss": 1.1327, "step": 25 }, { "epoch": 0.12807881773399016, "grad_norm": 1.1409716606140137, "learning_rate": 9.630424201684105e-06, "loss": 1.0866, "step": 26 }, { "epoch": 0.1330049261083744, "grad_norm": 1.1258468627929688, "learning_rate": 9.600674549322716e-06, "loss": 1.0847, "step": 27 }, { "epoch": 0.13793103448275862, "grad_norm": 1.0608943700790405, "learning_rate": 9.569823051938689e-06, "loss": 0.9715, "step": 28 }, { "epoch": 0.14285714285714285, "grad_norm": 1.1090885400772095, "learning_rate": 9.537877098354787e-06, "loss": 1.0492, "step": 29 }, { "epoch": 0.1477832512315271, "grad_norm": 1.2303950786590576, "learning_rate": 9.504844339512096e-06, "loss": 0.995, "step": 30 }, { "epoch": 0.15270935960591134, "grad_norm": 1.2325893640518188, "learning_rate": 9.470732686637665e-06, "loss": 1.1353, "step": 31 }, { "epoch": 0.15763546798029557, "grad_norm": 1.0923973321914673, "learning_rate": 9.435550309349776e-06, "loss": 1.0256, "step": 32 }, { "epoch": 0.1625615763546798, "grad_norm": 1.8741207122802734, "learning_rate": 9.399305633701372e-06, "loss": 1.117, "step": 33 }, { "epoch": 0.16748768472906403, "grad_norm": 1.292672038078308, "learning_rate": 9.36200734016203e-06, "loss": 1.0424, "step": 34 }, { "epoch": 0.1724137931034483, "grad_norm": 1.2905791997909546, "learning_rate": 9.32366436153902e-06, "loss": 1.1793, "step": 35 }, { "epoch": 0.17733990147783252, "grad_norm": 1.1905455589294434, "learning_rate": 9.284285880837947e-06, "loss": 1.0032, "step": 36 }, { "epoch": 0.18226600985221675, "grad_norm": 1.1533136367797852, "learning_rate": 9.243881329063436e-06, "loss": 1.0406, "step": 37 }, { "epoch": 0.18719211822660098, "grad_norm": 1.2299302816390991, "learning_rate": 9.202460382960449e-06, "loss": 1.1085, "step": 38 }, { "epoch": 0.1921182266009852, "grad_norm": 1.0995800495147705, "learning_rate": 9.160032962696734e-06, "loss": 1.0465, "step": 39 }, { "epoch": 0.19704433497536947, "grad_norm": 1.2899202108383179, "learning_rate": 9.116609229486992e-06, "loss": 1.0072, "step": 40 }, { "epoch": 0.2019704433497537, "grad_norm": 1.068886399269104, "learning_rate": 9.072199583159285e-06, "loss": 1.0853, "step": 41 }, { "epoch": 0.20689655172413793, "grad_norm": 1.0160249471664429, "learning_rate": 9.026814659664331e-06, "loss": 0.9201, "step": 42 }, { "epoch": 0.21182266009852216, "grad_norm": 0.980324387550354, "learning_rate": 8.98046532852822e-06, "loss": 0.9792, "step": 43 }, { "epoch": 0.21674876847290642, "grad_norm": 1.0656648874282837, "learning_rate": 8.93316269024921e-06, "loss": 0.9549, "step": 44 }, { "epoch": 0.22167487684729065, "grad_norm": 1.124436855316162, "learning_rate": 8.88491807363919e-06, "loss": 1.0474, "step": 45 }, { "epoch": 0.22660098522167488, "grad_norm": 1.1231716871261597, "learning_rate": 8.835743033110482e-06, "loss": 0.9981, "step": 46 }, { "epoch": 0.2315270935960591, "grad_norm": 1.0960031747817993, "learning_rate": 8.78564934590859e-06, "loss": 1.0547, "step": 47 }, { "epoch": 0.23645320197044334, "grad_norm": 1.056442141532898, "learning_rate": 8.734649009291586e-06, "loss": 1.0868, "step": 48 }, { "epoch": 0.2413793103448276, "grad_norm": 1.0149261951446533, "learning_rate": 8.68275423765683e-06, "loss": 0.9538, "step": 49 }, { "epoch": 0.24630541871921183, "grad_norm": 0.9313431978225708, "learning_rate": 8.629977459615655e-06, "loss": 0.9597, "step": 50 }, { "epoch": 0.2512315270935961, "grad_norm": 1.086411714553833, "learning_rate": 8.576331315016753e-06, "loss": 1.0181, "step": 51 }, { "epoch": 0.2561576354679803, "grad_norm": 1.1177152395248413, "learning_rate": 8.521828651918983e-06, "loss": 1.0278, "step": 52 }, { "epoch": 0.26108374384236455, "grad_norm": 0.9545988440513611, "learning_rate": 8.46648252351431e-06, "loss": 0.9892, "step": 53 }, { "epoch": 0.2660098522167488, "grad_norm": 1.05325186252594, "learning_rate": 8.41030618500161e-06, "loss": 1.0133, "step": 54 }, { "epoch": 0.270935960591133, "grad_norm": 1.0253876447677612, "learning_rate": 8.353313090412093e-06, "loss": 0.9538, "step": 55 }, { "epoch": 0.27586206896551724, "grad_norm": 1.0110273361206055, "learning_rate": 8.295516889387115e-06, "loss": 0.8805, "step": 56 }, { "epoch": 0.28078817733990147, "grad_norm": 1.0400066375732422, "learning_rate": 8.23693142390914e-06, "loss": 0.9632, "step": 57 }, { "epoch": 0.2857142857142857, "grad_norm": 1.128901481628418, "learning_rate": 8.177570724986627e-06, "loss": 1.015, "step": 58 }, { "epoch": 0.29064039408866993, "grad_norm": 1.1031105518341064, "learning_rate": 8.117449009293668e-06, "loss": 0.9957, "step": 59 }, { "epoch": 0.2955665024630542, "grad_norm": 4.238386154174805, "learning_rate": 8.05658067576513e-06, "loss": 0.9085, "step": 60 }, { "epoch": 0.30049261083743845, "grad_norm": 1.160597324371338, "learning_rate": 7.99498030214817e-06, "loss": 0.9809, "step": 61 }, { "epoch": 0.3054187192118227, "grad_norm": 1.0774437189102173, "learning_rate": 7.932662641510915e-06, "loss": 0.99, "step": 62 }, { "epoch": 0.3103448275862069, "grad_norm": 1.0282933712005615, "learning_rate": 7.869642618709162e-06, "loss": 0.9275, "step": 63 }, { "epoch": 0.31527093596059114, "grad_norm": 1.0454133749008179, "learning_rate": 7.805935326811913e-06, "loss": 0.9071, "step": 64 }, { "epoch": 0.32019704433497537, "grad_norm": 1.1418848037719727, "learning_rate": 7.741556023486655e-06, "loss": 0.9734, "step": 65 }, { "epoch": 0.3251231527093596, "grad_norm": 1.0286744832992554, "learning_rate": 7.676520127345198e-06, "loss": 0.9934, "step": 66 }, { "epoch": 0.33004926108374383, "grad_norm": 1.2144535779953003, "learning_rate": 7.610843214250964e-06, "loss": 0.9829, "step": 67 }, { "epoch": 0.33497536945812806, "grad_norm": 1.4030691385269165, "learning_rate": 7.5445410135886455e-06, "loss": 0.9717, "step": 68 }, { "epoch": 0.3399014778325123, "grad_norm": 1.0528010129928589, "learning_rate": 7.477629404497048e-06, "loss": 0.9649, "step": 69 }, { "epoch": 0.3448275862068966, "grad_norm": 1.0271952152252197, "learning_rate": 7.4101244120661105e-06, "loss": 0.9185, "step": 70 }, { "epoch": 0.3497536945812808, "grad_norm": 0.9849188327789307, "learning_rate": 7.342042203498952e-06, "loss": 0.9192, "step": 71 }, { "epoch": 0.35467980295566504, "grad_norm": 1.0050177574157715, "learning_rate": 7.273399084239878e-06, "loss": 0.9326, "step": 72 }, { "epoch": 0.35960591133004927, "grad_norm": 0.9628230929374695, "learning_rate": 7.204211494069292e-06, "loss": 0.884, "step": 73 }, { "epoch": 0.3645320197044335, "grad_norm": 1.1869782209396362, "learning_rate": 7.134496003166423e-06, "loss": 0.966, "step": 74 }, { "epoch": 0.3694581280788177, "grad_norm": 1.0189898014068604, "learning_rate": 7.06426930814083e-06, "loss": 0.8847, "step": 75 }, { "epoch": 0.37438423645320196, "grad_norm": 1.5452977418899536, "learning_rate": 6.993548228033618e-06, "loss": 0.9902, "step": 76 }, { "epoch": 0.3793103448275862, "grad_norm": 1.061618685722351, "learning_rate": 6.922349700289348e-06, "loss": 0.9273, "step": 77 }, { "epoch": 0.3842364532019704, "grad_norm": 0.9350699186325073, "learning_rate": 6.850690776699574e-06, "loss": 0.8633, "step": 78 }, { "epoch": 0.3891625615763547, "grad_norm": 0.944102942943573, "learning_rate": 6.7785886193189936e-06, "loss": 0.9348, "step": 79 }, { "epoch": 0.39408866995073893, "grad_norm": 0.9876391291618347, "learning_rate": 6.7060604963552125e-06, "loss": 0.9354, "step": 80 }, { "epoch": 0.39901477832512317, "grad_norm": 1.1221191883087158, "learning_rate": 6.633123778033061e-06, "loss": 0.9122, "step": 81 }, { "epoch": 0.4039408866995074, "grad_norm": 1.171373724937439, "learning_rate": 6.559795932434489e-06, "loss": 0.9184, "step": 82 }, { "epoch": 0.4088669950738916, "grad_norm": 1.0021214485168457, "learning_rate": 6.486094521315022e-06, "loss": 0.904, "step": 83 }, { "epoch": 0.41379310344827586, "grad_norm": 1.0860635042190552, "learning_rate": 6.412037195897786e-06, "loss": 0.9216, "step": 84 }, { "epoch": 0.4187192118226601, "grad_norm": 1.1491731405258179, "learning_rate": 6.337641692646106e-06, "loss": 0.9381, "step": 85 }, { "epoch": 0.4236453201970443, "grad_norm": 1.0246098041534424, "learning_rate": 6.262925829015675e-06, "loss": 0.8873, "step": 86 }, { "epoch": 0.42857142857142855, "grad_norm": 0.9979232549667358, "learning_rate": 6.187907499187357e-06, "loss": 0.955, "step": 87 }, { "epoch": 0.43349753694581283, "grad_norm": 1.022977590560913, "learning_rate": 6.112604669781572e-06, "loss": 0.8763, "step": 88 }, { "epoch": 0.43842364532019706, "grad_norm": 1.0037063360214233, "learning_rate": 6.037035375555376e-06, "loss": 0.9651, "step": 89 }, { "epoch": 0.4433497536945813, "grad_norm": 1.0663460493087769, "learning_rate": 5.961217715083185e-06, "loss": 0.969, "step": 90 }, { "epoch": 0.4482758620689655, "grad_norm": 0.9997808933258057, "learning_rate": 5.885169846422242e-06, "loss": 1.0117, "step": 91 }, { "epoch": 0.45320197044334976, "grad_norm": 0.9787831902503967, "learning_rate": 5.808909982763825e-06, "loss": 0.7974, "step": 92 }, { "epoch": 0.458128078817734, "grad_norm": 1.020983338356018, "learning_rate": 5.732456388071247e-06, "loss": 0.9561, "step": 93 }, { "epoch": 0.4630541871921182, "grad_norm": 0.9058898091316223, "learning_rate": 5.655827372705712e-06, "loss": 0.9409, "step": 94 }, { "epoch": 0.46798029556650245, "grad_norm": 1.0344359874725342, "learning_rate": 5.579041289041045e-06, "loss": 0.9722, "step": 95 }, { "epoch": 0.4729064039408867, "grad_norm": 0.9576674103736877, "learning_rate": 5.502116527068363e-06, "loss": 0.8873, "step": 96 }, { "epoch": 0.47783251231527096, "grad_norm": 1.2546874284744263, "learning_rate": 5.425071509991737e-06, "loss": 0.912, "step": 97 }, { "epoch": 0.4827586206896552, "grad_norm": 0.9510334730148315, "learning_rate": 5.347924689815906e-06, "loss": 0.8627, "step": 98 }, { "epoch": 0.4876847290640394, "grad_norm": 0.9422460198402405, "learning_rate": 5.270694542927089e-06, "loss": 0.9726, "step": 99 }, { "epoch": 0.49261083743842365, "grad_norm": 0.9733043909072876, "learning_rate": 5.193399565667945e-06, "loss": 0.8824, "step": 100 }, { "epoch": 0.4975369458128079, "grad_norm": 0.9182532429695129, "learning_rate": 5.116058269907779e-06, "loss": 0.8612, "step": 101 }, { "epoch": 0.5024630541871922, "grad_norm": 0.8935402035713196, "learning_rate": 5.038689178609011e-06, "loss": 0.9394, "step": 102 }, { "epoch": 0.5073891625615764, "grad_norm": 1.4738961458206177, "learning_rate": 4.96131082139099e-06, "loss": 0.9609, "step": 103 }, { "epoch": 0.5123152709359606, "grad_norm": 1.0967806577682495, "learning_rate": 4.883941730092222e-06, "loss": 0.9497, "step": 104 }, { "epoch": 0.5172413793103449, "grad_norm": 0.9221424460411072, "learning_rate": 4.806600434332056e-06, "loss": 0.891, "step": 105 }, { "epoch": 0.5221674876847291, "grad_norm": 1.0894826650619507, "learning_rate": 4.729305457072913e-06, "loss": 0.8682, "step": 106 }, { "epoch": 0.5270935960591133, "grad_norm": 0.9089194536209106, "learning_rate": 4.6520753101840945e-06, "loss": 0.8213, "step": 107 }, { "epoch": 0.5320197044334976, "grad_norm": 0.8551648855209351, "learning_rate": 4.574928490008264e-06, "loss": 0.8802, "step": 108 }, { "epoch": 0.5369458128078818, "grad_norm": 0.9886828064918518, "learning_rate": 4.497883472931639e-06, "loss": 0.9347, "step": 109 }, { "epoch": 0.541871921182266, "grad_norm": 0.9351578950881958, "learning_rate": 4.4209587109589565e-06, "loss": 0.8173, "step": 110 }, { "epoch": 0.5467980295566502, "grad_norm": 0.9019532203674316, "learning_rate": 4.3441726272942895e-06, "loss": 0.8473, "step": 111 }, { "epoch": 0.5517241379310345, "grad_norm": 0.9672942757606506, "learning_rate": 4.267543611928755e-06, "loss": 0.918, "step": 112 }, { "epoch": 0.5566502463054187, "grad_norm": 1.3270796537399292, "learning_rate": 4.191090017236177e-06, "loss": 0.9567, "step": 113 }, { "epoch": 0.5615763546798029, "grad_norm": 0.980305552482605, "learning_rate": 4.114830153577759e-06, "loss": 0.863, "step": 114 }, { "epoch": 0.5665024630541872, "grad_norm": 0.8682278394699097, "learning_rate": 4.0387822849168165e-06, "loss": 0.8437, "step": 115 }, { "epoch": 0.5714285714285714, "grad_norm": 0.9460963010787964, "learning_rate": 3.962964624444625e-06, "loss": 0.899, "step": 116 }, { "epoch": 0.5763546798029556, "grad_norm": 0.8857601881027222, "learning_rate": 3.887395330218429e-06, "loss": 0.849, "step": 117 }, { "epoch": 0.5812807881773399, "grad_norm": 1.0073286294937134, "learning_rate": 3.8120925008126457e-06, "loss": 0.9561, "step": 118 }, { "epoch": 0.5862068965517241, "grad_norm": 0.976075291633606, "learning_rate": 3.7370741709843263e-06, "loss": 0.8938, "step": 119 }, { "epoch": 0.5911330049261084, "grad_norm": 0.9713646173477173, "learning_rate": 3.662358307353897e-06, "loss": 0.9119, "step": 120 }, { "epoch": 0.5960591133004927, "grad_norm": 0.8663797974586487, "learning_rate": 3.587962804102214e-06, "loss": 0.8631, "step": 121 }, { "epoch": 0.6009852216748769, "grad_norm": 0.8859656453132629, "learning_rate": 3.5139054786849787e-06, "loss": 0.8044, "step": 122 }, { "epoch": 0.6059113300492611, "grad_norm": 1.091760277748108, "learning_rate": 3.440204067565511e-06, "loss": 0.9143, "step": 123 }, { "epoch": 0.6108374384236454, "grad_norm": 0.982275128364563, "learning_rate": 3.3668762219669393e-06, "loss": 0.918, "step": 124 }, { "epoch": 0.6157635467980296, "grad_norm": 0.8803215622901917, "learning_rate": 3.293939503644788e-06, "loss": 0.8426, "step": 125 }, { "epoch": 0.6206896551724138, "grad_norm": 0.922527015209198, "learning_rate": 3.2214113806810077e-06, "loss": 0.8571, "step": 126 }, { "epoch": 0.625615763546798, "grad_norm": 0.9506503343582153, "learning_rate": 3.149309223300428e-06, "loss": 0.8659, "step": 127 }, { "epoch": 0.6305418719211823, "grad_norm": 1.0316869020462036, "learning_rate": 3.0776502997106526e-06, "loss": 0.9088, "step": 128 }, { "epoch": 0.6354679802955665, "grad_norm": 1.9990965127944946, "learning_rate": 3.0064517719663833e-06, "loss": 0.8672, "step": 129 }, { "epoch": 0.6403940886699507, "grad_norm": 0.928225040435791, "learning_rate": 2.935730691859172e-06, "loss": 0.8305, "step": 130 }, { "epoch": 0.645320197044335, "grad_norm": 0.9361408948898315, "learning_rate": 2.8655039968335774e-06, "loss": 0.8462, "step": 131 }, { "epoch": 0.6502463054187192, "grad_norm": 0.9435849189758301, "learning_rate": 2.7957885059307097e-06, "loss": 0.8756, "step": 132 }, { "epoch": 0.6551724137931034, "grad_norm": 0.9171866178512573, "learning_rate": 2.7266009157601226e-06, "loss": 0.917, "step": 133 }, { "epoch": 0.6600985221674877, "grad_norm": 0.911807656288147, "learning_rate": 2.65795779650105e-06, "loss": 0.8588, "step": 134 }, { "epoch": 0.6650246305418719, "grad_norm": 1.8045060634613037, "learning_rate": 2.589875587933892e-06, "loss": 0.9057, "step": 135 }, { "epoch": 0.6699507389162561, "grad_norm": 0.8906491994857788, "learning_rate": 2.522370595502954e-06, "loss": 0.8708, "step": 136 }, { "epoch": 0.6748768472906403, "grad_norm": 0.9065340757369995, "learning_rate": 2.4554589864113566e-06, "loss": 0.8558, "step": 137 }, { "epoch": 0.6798029556650246, "grad_norm": 1.116025686264038, "learning_rate": 2.3891567857490373e-06, "loss": 0.9355, "step": 138 }, { "epoch": 0.6847290640394089, "grad_norm": 0.8539987802505493, "learning_rate": 2.323479872654805e-06, "loss": 0.7964, "step": 139 }, { "epoch": 0.6896551724137931, "grad_norm": 0.8484991192817688, "learning_rate": 2.2584439765133453e-06, "loss": 0.808, "step": 140 }, { "epoch": 0.6945812807881774, "grad_norm": 0.9017306566238403, "learning_rate": 2.1940646731880887e-06, "loss": 0.9113, "step": 141 }, { "epoch": 0.6995073891625616, "grad_norm": 1.0858136415481567, "learning_rate": 2.1303573812908383e-06, "loss": 0.8572, "step": 142 }, { "epoch": 0.7044334975369458, "grad_norm": 0.8687289953231812, "learning_rate": 2.0673373584890847e-06, "loss": 0.8145, "step": 143 }, { "epoch": 0.7093596059113301, "grad_norm": 0.9045321345329285, "learning_rate": 2.0050196978518323e-06, "loss": 0.8543, "step": 144 }, { "epoch": 0.7142857142857143, "grad_norm": 1.0116416215896606, "learning_rate": 1.943419324234871e-06, "loss": 0.8539, "step": 145 }, { "epoch": 0.7192118226600985, "grad_norm": 0.9924123883247375, "learning_rate": 1.8825509907063328e-06, "loss": 0.95, "step": 146 }, { "epoch": 0.7241379310344828, "grad_norm": 0.8728001713752747, "learning_rate": 1.8224292750133743e-06, "loss": 0.9293, "step": 147 }, { "epoch": 0.729064039408867, "grad_norm": 0.8748157620429993, "learning_rate": 1.7630685760908623e-06, "loss": 0.844, "step": 148 }, { "epoch": 0.7339901477832512, "grad_norm": 0.857449471950531, "learning_rate": 1.7044831106128867e-06, "loss": 0.8433, "step": 149 }, { "epoch": 0.7389162561576355, "grad_norm": 0.9839447140693665, "learning_rate": 1.6466869095879079e-06, "loss": 0.8528, "step": 150 }, { "epoch": 0.7438423645320197, "grad_norm": 1.1705787181854248, "learning_rate": 1.589693814998391e-06, "loss": 0.9376, "step": 151 }, { "epoch": 0.7487684729064039, "grad_norm": 1.311543345451355, "learning_rate": 1.533517476485691e-06, "loss": 0.7922, "step": 152 }, { "epoch": 0.7536945812807881, "grad_norm": 0.8858788013458252, "learning_rate": 1.4781713480810184e-06, "loss": 0.8161, "step": 153 }, { "epoch": 0.7586206896551724, "grad_norm": 0.8841625452041626, "learning_rate": 1.4236686849832497e-06, "loss": 0.8746, "step": 154 }, { "epoch": 0.7635467980295566, "grad_norm": 1.0294075012207031, "learning_rate": 1.370022540384347e-06, "loss": 0.9812, "step": 155 }, { "epoch": 0.7684729064039408, "grad_norm": 1.0972410440444946, "learning_rate": 1.3172457623431706e-06, "loss": 0.966, "step": 156 }, { "epoch": 0.7733990147783252, "grad_norm": 0.884397566318512, "learning_rate": 1.2653509907084171e-06, "loss": 0.8526, "step": 157 }, { "epoch": 0.7783251231527094, "grad_norm": 0.9068350791931152, "learning_rate": 1.214350654091413e-06, "loss": 0.9192, "step": 158 }, { "epoch": 0.7832512315270936, "grad_norm": 1.0193994045257568, "learning_rate": 1.1642569668895171e-06, "loss": 0.8804, "step": 159 }, { "epoch": 0.7881773399014779, "grad_norm": 0.8925516605377197, "learning_rate": 1.1150819263608098e-06, "loss": 0.8384, "step": 160 }, { "epoch": 0.7931034482758621, "grad_norm": 1.0054816007614136, "learning_rate": 1.0668373097507922e-06, "loss": 0.8544, "step": 161 }, { "epoch": 0.7980295566502463, "grad_norm": 0.8788971304893494, "learning_rate": 1.0195346714717813e-06, "loss": 0.8462, "step": 162 }, { "epoch": 0.8029556650246306, "grad_norm": 0.8969894051551819, "learning_rate": 9.731853403356705e-07, "loss": 0.8614, "step": 163 }, { "epoch": 0.8078817733990148, "grad_norm": 0.9033743739128113, "learning_rate": 9.278004168407151e-07, "loss": 0.7701, "step": 164 }, { "epoch": 0.812807881773399, "grad_norm": 0.8596148490905762, "learning_rate": 8.833907705130091e-07, "loss": 0.8451, "step": 165 }, { "epoch": 0.8177339901477833, "grad_norm": 0.9375607967376709, "learning_rate": 8.399670373032665e-07, "loss": 0.8357, "step": 166 }, { "epoch": 0.8226600985221675, "grad_norm": 1.2474132776260376, "learning_rate": 7.975396170395522e-07, "loss": 0.8348, "step": 167 }, { "epoch": 0.8275862068965517, "grad_norm": 1.0321135520935059, "learning_rate": 7.561186709365653e-07, "loss": 1.0182, "step": 168 }, { "epoch": 0.8325123152709359, "grad_norm": 0.8872163891792297, "learning_rate": 7.157141191620548e-07, "loss": 0.8193, "step": 169 }, { "epoch": 0.8374384236453202, "grad_norm": 1.0191763639450073, "learning_rate": 6.763356384609809e-07, "loss": 0.867, "step": 170 }, { "epoch": 0.8423645320197044, "grad_norm": 1.7469654083251953, "learning_rate": 6.379926598379727e-07, "loss": 0.7807, "step": 171 }, { "epoch": 0.8472906403940886, "grad_norm": 0.8473700284957886, "learning_rate": 6.006943662986275e-07, "loss": 0.7896, "step": 172 }, { "epoch": 0.8522167487684729, "grad_norm": 0.8807177543640137, "learning_rate": 5.644496906502233e-07, "loss": 0.8352, "step": 173 }, { "epoch": 0.8571428571428571, "grad_norm": 1.1127580404281616, "learning_rate": 5.292673133623372e-07, "loss": 0.931, "step": 174 }, { "epoch": 0.8620689655172413, "grad_norm": 1.0402883291244507, "learning_rate": 4.951556604879049e-07, "loss": 0.954, "step": 175 }, { "epoch": 0.8669950738916257, "grad_norm": 0.8743595480918884, "learning_rate": 4.6212290164521554e-07, "loss": 0.8078, "step": 176 }, { "epoch": 0.8719211822660099, "grad_norm": 0.8114182353019714, "learning_rate": 4.3017694806131163e-07, "loss": 0.8326, "step": 177 }, { "epoch": 0.8768472906403941, "grad_norm": 0.907577395439148, "learning_rate": 3.9932545067728366e-07, "loss": 0.9081, "step": 178 }, { "epoch": 0.8817733990147784, "grad_norm": 0.8789636492729187, "learning_rate": 3.695757983158954e-07, "loss": 0.8478, "step": 179 }, { "epoch": 0.8866995073891626, "grad_norm": 0.8163318634033203, "learning_rate": 3.409351159119845e-07, "loss": 0.7992, "step": 180 }, { "epoch": 0.8916256157635468, "grad_norm": 0.9108045697212219, "learning_rate": 3.134102628060698e-07, "loss": 0.9684, "step": 181 }, { "epoch": 0.896551724137931, "grad_norm": 0.9332152009010315, "learning_rate": 2.8700783110156507e-07, "loss": 0.8402, "step": 182 }, { "epoch": 0.9014778325123153, "grad_norm": 0.9538592100143433, "learning_rate": 2.617341440859883e-07, "loss": 0.885, "step": 183 }, { "epoch": 0.9064039408866995, "grad_norm": 0.9269714951515198, "learning_rate": 2.3759525471656163e-07, "loss": 0.8598, "step": 184 }, { "epoch": 0.9113300492610837, "grad_norm": 1.0273958444595337, "learning_rate": 2.1459694417055033e-07, "loss": 0.9211, "step": 185 }, { "epoch": 0.916256157635468, "grad_norm": 0.8281537890434265, "learning_rate": 1.9274472046068805e-07, "loss": 0.7494, "step": 186 }, { "epoch": 0.9211822660098522, "grad_norm": 0.91705721616745, "learning_rate": 1.7204381711603046e-07, "loss": 0.8442, "step": 187 }, { "epoch": 0.9261083743842364, "grad_norm": 0.9961190223693848, "learning_rate": 1.524991919285429e-07, "loss": 1.0154, "step": 188 }, { "epoch": 0.9310344827586207, "grad_norm": 0.8181946277618408, "learning_rate": 1.3411552576572562e-07, "loss": 0.8561, "step": 189 }, { "epoch": 0.9359605911330049, "grad_norm": 0.822562575340271, "learning_rate": 1.1689722144956672e-07, "loss": 0.7987, "step": 190 }, { "epoch": 0.9408866995073891, "grad_norm": 0.9585839509963989, "learning_rate": 1.008484027020773e-07, "loss": 0.8168, "step": 191 }, { "epoch": 0.9458128078817734, "grad_norm": 0.8027588129043579, "learning_rate": 8.597291315767808e-08, "loss": 0.779, "step": 192 }, { "epoch": 0.9507389162561576, "grad_norm": 0.965819239616394, "learning_rate": 7.227431544266194e-08, "loss": 0.9602, "step": 193 }, { "epoch": 0.9556650246305419, "grad_norm": 0.8224076628684998, "learning_rate": 5.97558903219575e-08, "loss": 0.8123, "step": 194 }, { "epoch": 0.9605911330049262, "grad_norm": 0.9190309643745422, "learning_rate": 4.842063591339763e-08, "loss": 0.9018, "step": 195 }, { "epoch": 0.9655172413793104, "grad_norm": 0.8280571699142456, "learning_rate": 3.82712669696822e-08, "loss": 0.8204, "step": 196 }, { "epoch": 0.9704433497536946, "grad_norm": 0.8307924270629883, "learning_rate": 2.9310214228202016e-08, "loss": 0.8532, "step": 197 }, { "epoch": 0.9753694581280788, "grad_norm": 0.9194669127464294, "learning_rate": 2.153962382888841e-08, "loss": 0.8382, "step": 198 }, { "epoch": 0.9802955665024631, "grad_norm": 0.804408848285675, "learning_rate": 1.496135680021993e-08, "loss": 0.8196, "step": 199 }, { "epoch": 0.9852216748768473, "grad_norm": 0.8451563715934753, "learning_rate": 9.576988613511084e-09, "loss": 0.8492, "step": 200 }, { "epoch": 0.9901477832512315, "grad_norm": 0.8822647929191589, "learning_rate": 5.387808805594752e-09, "loss": 0.8916, "step": 201 }, { "epoch": 0.9950738916256158, "grad_norm": 0.8675324320793152, "learning_rate": 2.3948206699819787e-09, "loss": 0.8024, "step": 202 }, { "epoch": 1.0, "grad_norm": 0.8639441132545471, "learning_rate": 5.987410165758656e-10, "loss": 0.8228, "step": 203 } ], "logging_steps": 1.0, "max_steps": 203, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4104222270160896e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }