{ "best_metric": 2.126425266265869, "best_model_checkpoint": "/work/Ccp-OldNewsBERT_2024/modelling/checkpoint-95500", "epoch": 15.0, "eval_steps": 500, "global_step": 98640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07603406326034064, "grad_norm": 1.1528505086898804, "learning_rate": 1.25e-05, "loss": 8.5532, "step": 500 }, { "epoch": 0.07603406326034064, "eval_loss": 7.3156208992004395, "eval_runtime": 392.108, "eval_samples_per_second": 1073.301, "eval_steps_per_second": 4.193, "step": 500 }, { "epoch": 0.15206812652068127, "grad_norm": 1.3152525424957275, "learning_rate": 2.5e-05, "loss": 7.0493, "step": 1000 }, { "epoch": 0.15206812652068127, "eval_loss": 6.841813087463379, "eval_runtime": 392.0485, "eval_samples_per_second": 1073.464, "eval_steps_per_second": 4.193, "step": 1000 }, { "epoch": 0.2281021897810219, "grad_norm": 1.7870614528656006, "learning_rate": 3.7500000000000003e-05, "loss": 6.7805, "step": 1500 }, { "epoch": 0.2281021897810219, "eval_loss": 6.62256383895874, "eval_runtime": 391.9186, "eval_samples_per_second": 1073.82, "eval_steps_per_second": 4.195, "step": 1500 }, { "epoch": 0.30413625304136255, "grad_norm": 1.1754438877105713, "learning_rate": 5e-05, "loss": 6.5763, "step": 2000 }, { "epoch": 0.30413625304136255, "eval_loss": 6.442608833312988, "eval_runtime": 391.8632, "eval_samples_per_second": 1073.972, "eval_steps_per_second": 4.195, "step": 2000 }, { "epoch": 0.38017031630170317, "grad_norm": 1.4492470026016235, "learning_rate": 4.999669762518974e-05, "loss": 6.4084, "step": 2500 }, { "epoch": 0.38017031630170317, "eval_loss": 6.2790422439575195, "eval_runtime": 395.0617, "eval_samples_per_second": 1065.277, "eval_steps_per_second": 4.161, "step": 2500 }, { "epoch": 0.4562043795620438, "grad_norm": 1.4527273178100586, "learning_rate": 4.9986791373213283e-05, "loss": 6.2422, "step": 3000 }, { "epoch": 0.4562043795620438, "eval_loss": 6.124966621398926, "eval_runtime": 394.995, "eval_samples_per_second": 1065.457, "eval_steps_per_second": 4.162, "step": 3000 }, { "epoch": 0.5322384428223844, "grad_norm": 1.8543823957443237, "learning_rate": 4.997028386120321e-05, "loss": 6.0635, "step": 3500 }, { "epoch": 0.5322384428223844, "eval_loss": 5.868374347686768, "eval_runtime": 394.8994, "eval_samples_per_second": 1065.715, "eval_steps_per_second": 4.163, "step": 3500 }, { "epoch": 0.6082725060827251, "grad_norm": 1.973868489265442, "learning_rate": 4.994717945027886e-05, "loss": 5.7633, "step": 4000 }, { "epoch": 0.6082725060827251, "eval_loss": 5.493896961212158, "eval_runtime": 394.9221, "eval_samples_per_second": 1065.653, "eval_steps_per_second": 4.163, "step": 4000 }, { "epoch": 0.6843065693430657, "grad_norm": 1.8778235912322998, "learning_rate": 4.99174842443942e-05, "loss": 5.429, "step": 4500 }, { "epoch": 0.6843065693430657, "eval_loss": 5.116617679595947, "eval_runtime": 394.8716, "eval_samples_per_second": 1065.79, "eval_steps_per_second": 4.163, "step": 4500 }, { "epoch": 0.7603406326034063, "grad_norm": 2.0149049758911133, "learning_rate": 4.9881206088725227e-05, "loss": 5.0808, "step": 5000 }, { "epoch": 0.7603406326034063, "eval_loss": 4.774472713470459, "eval_runtime": 395.1882, "eval_samples_per_second": 1064.936, "eval_steps_per_second": 4.16, "step": 5000 }, { "epoch": 0.8363746958637469, "grad_norm": 1.7959963083267212, "learning_rate": 4.983835456759734e-05, "loss": 4.7725, "step": 5500 }, { "epoch": 0.8363746958637469, "eval_loss": 4.4951066970825195, "eval_runtime": 395.052, "eval_samples_per_second": 1065.303, "eval_steps_per_second": 4.161, "step": 5500 }, { "epoch": 0.9124087591240876, "grad_norm": 1.6965287923812866, "learning_rate": 4.978894100195325e-05, "loss": 4.5344, "step": 6000 }, { "epoch": 0.9124087591240876, "eval_loss": 4.28698205947876, "eval_runtime": 395.1764, "eval_samples_per_second": 1064.967, "eval_steps_per_second": 4.16, "step": 6000 }, { "epoch": 0.9884428223844283, "grad_norm": 1.6758971214294434, "learning_rate": 4.973297844636212e-05, "loss": 4.3347, "step": 6500 }, { "epoch": 0.9884428223844283, "eval_loss": 4.129937648773193, "eval_runtime": 395.104, "eval_samples_per_second": 1065.163, "eval_steps_per_second": 4.161, "step": 6500 }, { "epoch": 1.0644768856447688, "grad_norm": 1.7391337156295776, "learning_rate": 4.9670481685570645e-05, "loss": 4.1883, "step": 7000 }, { "epoch": 1.0644768856447688, "eval_loss": 4.002706050872803, "eval_runtime": 395.1014, "eval_samples_per_second": 1065.17, "eval_steps_per_second": 4.161, "step": 7000 }, { "epoch": 1.1405109489051095, "grad_norm": 1.592909812927246, "learning_rate": 4.960146723059713e-05, "loss": 4.0579, "step": 7500 }, { "epoch": 1.1405109489051095, "eval_loss": 3.8906095027923584, "eval_runtime": 395.0202, "eval_samples_per_second": 1065.389, "eval_steps_per_second": 4.162, "step": 7500 }, { "epoch": 1.2165450121654502, "grad_norm": 1.7625865936279297, "learning_rate": 4.952595331436939e-05, "loss": 3.9484, "step": 8000 }, { "epoch": 1.2165450121654502, "eval_loss": 3.793649673461914, "eval_runtime": 395.0939, "eval_samples_per_second": 1065.19, "eval_steps_per_second": 4.161, "step": 8000 }, { "epoch": 1.2925790754257909, "grad_norm": 1.5408483743667603, "learning_rate": 4.9443959886907786e-05, "loss": 3.8541, "step": 8500 }, { "epoch": 1.2925790754257909, "eval_loss": 3.707909107208252, "eval_runtime": 395.0107, "eval_samples_per_second": 1065.414, "eval_steps_per_second": 4.162, "step": 8500 }, { "epoch": 1.3686131386861313, "grad_norm": 1.5377788543701172, "learning_rate": 4.935550861005469e-05, "loss": 3.7751, "step": 9000 }, { "epoch": 1.3686131386861313, "eval_loss": 3.6381478309631348, "eval_runtime": 395.0196, "eval_samples_per_second": 1065.39, "eval_steps_per_second": 4.162, "step": 9000 }, { "epoch": 1.444647201946472, "grad_norm": 1.5185712575912476, "learning_rate": 4.926062285175158e-05, "loss": 3.7, "step": 9500 }, { "epoch": 1.444647201946472, "eval_loss": 3.56645131111145, "eval_runtime": 395.0168, "eval_samples_per_second": 1065.398, "eval_steps_per_second": 4.162, "step": 9500 }, { "epoch": 1.5206812652068127, "grad_norm": 1.4640849828720093, "learning_rate": 4.9159536649297986e-05, "loss": 3.6397, "step": 10000 }, { "epoch": 1.5206812652068127, "eval_loss": 3.5038576126098633, "eval_runtime": 395.2609, "eval_samples_per_second": 1064.74, "eval_steps_per_second": 4.159, "step": 10000 }, { "epoch": 1.5967153284671531, "grad_norm": 1.7205146551132202, "learning_rate": 4.9051871562474056e-05, "loss": 3.5783, "step": 10500 }, { "epoch": 1.5967153284671531, "eval_loss": 3.4472110271453857, "eval_runtime": 395.1986, "eval_samples_per_second": 1064.908, "eval_steps_per_second": 4.16, "step": 10500 }, { "epoch": 1.672749391727494, "grad_norm": 1.605870008468628, "learning_rate": 4.8937852212067106e-05, "loss": 3.5196, "step": 11000 }, { "epoch": 1.672749391727494, "eval_loss": 3.3966190814971924, "eval_runtime": 395.1285, "eval_samples_per_second": 1065.097, "eval_steps_per_second": 4.161, "step": 11000 }, { "epoch": 1.7487834549878345, "grad_norm": 1.6770403385162354, "learning_rate": 4.8817508720847596e-05, "loss": 3.4701, "step": 11500 }, { "epoch": 1.7487834549878345, "eval_loss": 3.34128999710083, "eval_runtime": 395.1091, "eval_samples_per_second": 1065.149, "eval_steps_per_second": 4.161, "step": 11500 }, { "epoch": 1.8248175182481752, "grad_norm": 1.5218740701675415, "learning_rate": 4.869087288236064e-05, "loss": 3.4226, "step": 12000 }, { "epoch": 1.8248175182481752, "eval_loss": 3.301135540008545, "eval_runtime": 395.0668, "eval_samples_per_second": 1065.263, "eval_steps_per_second": 4.161, "step": 12000 }, { "epoch": 1.9008515815085159, "grad_norm": 1.528290867805481, "learning_rate": 4.855797815252648e-05, "loss": 3.3704, "step": 12500 }, { "epoch": 1.9008515815085159, "eval_loss": 3.255563735961914, "eval_runtime": 395.0617, "eval_samples_per_second": 1065.277, "eval_steps_per_second": 4.161, "step": 12500 }, { "epoch": 1.9768856447688563, "grad_norm": 1.4962824583053589, "learning_rate": 4.8418859640801796e-05, "loss": 3.3326, "step": 13000 }, { "epoch": 1.9768856447688563, "eval_loss": 3.2163586616516113, "eval_runtime": 395.1594, "eval_samples_per_second": 1065.013, "eval_steps_per_second": 4.16, "step": 13000 }, { "epoch": 2.052919708029197, "grad_norm": 1.5214394330978394, "learning_rate": 4.8273554100904066e-05, "loss": 3.2872, "step": 13500 }, { "epoch": 2.052919708029197, "eval_loss": 3.178077220916748, "eval_runtime": 395.023, "eval_samples_per_second": 1065.381, "eval_steps_per_second": 4.162, "step": 13500 }, { "epoch": 2.1289537712895377, "grad_norm": 1.6362810134887695, "learning_rate": 4.8122408939478185e-05, "loss": 3.2453, "step": 14000 }, { "epoch": 2.1289537712895377, "eval_loss": 3.1436197757720947, "eval_runtime": 395.163, "eval_samples_per_second": 1065.004, "eval_steps_per_second": 4.16, "step": 14000 }, { "epoch": 2.204987834549878, "grad_norm": 1.6314831972122192, "learning_rate": 4.79651794790509e-05, "loss": 3.2149, "step": 14500 }, { "epoch": 2.204987834549878, "eval_loss": 3.1076748371124268, "eval_runtime": 395.3195, "eval_samples_per_second": 1064.582, "eval_steps_per_second": 4.159, "step": 14500 }, { "epoch": 2.281021897810219, "grad_norm": 1.5647250413894653, "learning_rate": 4.7801573854264494e-05, "loss": 3.1836, "step": 15000 }, { "epoch": 2.281021897810219, "eval_loss": 3.081753969192505, "eval_runtime": 395.1551, "eval_samples_per_second": 1065.025, "eval_steps_per_second": 4.16, "step": 15000 }, { "epoch": 2.3570559610705595, "grad_norm": 1.559869408607483, "learning_rate": 4.763194428202762e-05, "loss": 3.1459, "step": 15500 }, { "epoch": 2.3570559610705595, "eval_loss": 3.044140100479126, "eval_runtime": 395.2791, "eval_samples_per_second": 1064.691, "eval_steps_per_second": 4.159, "step": 15500 }, { "epoch": 2.4330900243309004, "grad_norm": 1.669546365737915, "learning_rate": 4.745633557677441e-05, "loss": 3.1298, "step": 16000 }, { "epoch": 2.4330900243309004, "eval_loss": 3.015268325805664, "eval_runtime": 395.2158, "eval_samples_per_second": 1064.861, "eval_steps_per_second": 4.16, "step": 16000 }, { "epoch": 2.509124087591241, "grad_norm": 1.5877552032470703, "learning_rate": 4.727479413256602e-05, "loss": 3.0882, "step": 16500 }, { "epoch": 2.509124087591241, "eval_loss": 2.9866795539855957, "eval_runtime": 395.1307, "eval_samples_per_second": 1065.091, "eval_steps_per_second": 4.161, "step": 16500 }, { "epoch": 2.5851581508515817, "grad_norm": 1.6820305585861206, "learning_rate": 4.708736791083384e-05, "loss": 3.0738, "step": 17000 }, { "epoch": 2.5851581508515817, "eval_loss": 2.957209587097168, "eval_runtime": 395.2085, "eval_samples_per_second": 1064.881, "eval_steps_per_second": 4.16, "step": 17000 }, { "epoch": 2.661192214111922, "grad_norm": 1.4878249168395996, "learning_rate": 4.6894106427708574e-05, "loss": 3.0409, "step": 17500 }, { "epoch": 2.661192214111922, "eval_loss": 2.931816339492798, "eval_runtime": 395.1436, "eval_samples_per_second": 1065.056, "eval_steps_per_second": 4.161, "step": 17500 }, { "epoch": 2.7372262773722627, "grad_norm": 1.5256247520446777, "learning_rate": 4.669546457024816e-05, "loss": 3.0155, "step": 18000 }, { "epoch": 2.7372262773722627, "eval_loss": 2.9121601581573486, "eval_runtime": 395.308, "eval_samples_per_second": 1064.613, "eval_steps_per_second": 4.159, "step": 18000 }, { "epoch": 2.8132603406326036, "grad_norm": 1.6648399829864502, "learning_rate": 4.649069867545623e-05, "loss": 2.9909, "step": 18500 }, { "epoch": 2.8132603406326036, "eval_loss": 2.890857219696045, "eval_runtime": 395.436, "eval_samples_per_second": 1064.268, "eval_steps_per_second": 4.157, "step": 18500 }, { "epoch": 2.889294403892944, "grad_norm": 1.6078656911849976, "learning_rate": 4.628025515330744e-05, "loss": 2.9754, "step": 19000 }, { "epoch": 2.889294403892944, "eval_loss": 2.865665912628174, "eval_runtime": 395.2576, "eval_samples_per_second": 1064.749, "eval_steps_per_second": 4.159, "step": 19000 }, { "epoch": 2.9653284671532845, "grad_norm": 1.595712661743164, "learning_rate": 4.60641896008727e-05, "loss": 2.9512, "step": 19500 }, { "epoch": 2.9653284671532845, "eval_loss": 2.8427441120147705, "eval_runtime": 395.1474, "eval_samples_per_second": 1065.046, "eval_steps_per_second": 4.16, "step": 19500 }, { "epoch": 3.0413625304136254, "grad_norm": 1.5582592487335205, "learning_rate": 4.584255910050703e-05, "loss": 2.9132, "step": 20000 }, { "epoch": 3.0413625304136254, "eval_loss": 2.821183681488037, "eval_runtime": 395.3458, "eval_samples_per_second": 1064.511, "eval_steps_per_second": 4.158, "step": 20000 }, { "epoch": 3.117396593673966, "grad_norm": 1.6548606157302856, "learning_rate": 4.561588193429872e-05, "loss": 2.9021, "step": 20500 }, { "epoch": 3.117396593673966, "eval_loss": 2.802894115447998, "eval_runtime": 395.4185, "eval_samples_per_second": 1064.315, "eval_steps_per_second": 4.158, "step": 20500 }, { "epoch": 3.1934306569343067, "grad_norm": 1.6921550035476685, "learning_rate": 4.538330948241111e-05, "loss": 2.8889, "step": 21000 }, { "epoch": 3.1934306569343067, "eval_loss": 2.7827913761138916, "eval_runtime": 395.1602, "eval_samples_per_second": 1065.011, "eval_steps_per_second": 4.16, "step": 21000 }, { "epoch": 3.269464720194647, "grad_norm": 1.7307897806167603, "learning_rate": 4.514535196430073e-05, "loss": 2.8642, "step": 21500 }, { "epoch": 3.269464720194647, "eval_loss": 2.767017126083374, "eval_runtime": 395.2268, "eval_samples_per_second": 1064.832, "eval_steps_per_second": 4.16, "step": 21500 }, { "epoch": 3.345498783454988, "grad_norm": 1.7314034700393677, "learning_rate": 4.490207224596068e-05, "loss": 2.8517, "step": 22000 }, { "epoch": 3.345498783454988, "eval_loss": 2.747631311416626, "eval_runtime": 395.1865, "eval_samples_per_second": 1064.94, "eval_steps_per_second": 4.16, "step": 22000 }, { "epoch": 3.4215328467153285, "grad_norm": 1.7844088077545166, "learning_rate": 4.465353459945605e-05, "loss": 2.8341, "step": 22500 }, { "epoch": 3.4215328467153285, "eval_loss": 2.7319579124450684, "eval_runtime": 395.3244, "eval_samples_per_second": 1064.569, "eval_steps_per_second": 4.159, "step": 22500 }, { "epoch": 3.497566909975669, "grad_norm": 1.5570697784423828, "learning_rate": 4.43998046859439e-05, "loss": 2.8102, "step": 23000 }, { "epoch": 3.497566909975669, "eval_loss": 2.7134299278259277, "eval_runtime": 395.3813, "eval_samples_per_second": 1064.416, "eval_steps_per_second": 4.158, "step": 23000 }, { "epoch": 3.57360097323601, "grad_norm": 1.5903196334838867, "learning_rate": 4.414094953832625e-05, "loss": 2.7942, "step": 23500 }, { "epoch": 3.57360097323601, "eval_loss": 2.696880340576172, "eval_runtime": 395.3996, "eval_samples_per_second": 1064.366, "eval_steps_per_second": 4.158, "step": 23500 }, { "epoch": 3.6496350364963503, "grad_norm": 1.7155580520629883, "learning_rate": 4.387703754354059e-05, "loss": 2.7893, "step": 24000 }, { "epoch": 3.6496350364963503, "eval_loss": 2.6850531101226807, "eval_runtime": 395.3598, "eval_samples_per_second": 1064.473, "eval_steps_per_second": 4.158, "step": 24000 }, { "epoch": 3.725669099756691, "grad_norm": 1.5948296785354614, "learning_rate": 4.3608681152880126e-05, "loss": 2.7681, "step": 24500 }, { "epoch": 3.725669099756691, "eval_loss": 2.66740345954895, "eval_runtime": 395.2365, "eval_samples_per_second": 1064.805, "eval_steps_per_second": 4.16, "step": 24500 }, { "epoch": 3.8017031630170317, "grad_norm": 1.5723962783813477, "learning_rate": 4.333487571042728e-05, "loss": 2.7577, "step": 25000 }, { "epoch": 3.8017031630170317, "eval_loss": 2.654303789138794, "eval_runtime": 395.3645, "eval_samples_per_second": 1064.461, "eval_steps_per_second": 4.158, "step": 25000 }, { "epoch": 3.877737226277372, "grad_norm": 1.6151896715164185, "learning_rate": 4.3056226377438776e-05, "loss": 2.7427, "step": 25500 }, { "epoch": 3.877737226277372, "eval_loss": 2.643014669418335, "eval_runtime": 395.2931, "eval_samples_per_second": 1064.653, "eval_steps_per_second": 4.159, "step": 25500 }, { "epoch": 3.9537712895377126, "grad_norm": 1.670333743095398, "learning_rate": 4.27728067702777e-05, "loss": 2.7302, "step": 26000 }, { "epoch": 3.9537712895377126, "eval_loss": 2.6284077167510986, "eval_runtime": 395.1357, "eval_samples_per_second": 1065.077, "eval_steps_per_second": 4.161, "step": 26000 }, { "epoch": 4.0298053527980535, "grad_norm": 1.551099419593811, "learning_rate": 4.248469176556575e-05, "loss": 2.7106, "step": 26500 }, { "epoch": 4.0298053527980535, "eval_loss": 2.616875171661377, "eval_runtime": 395.4614, "eval_samples_per_second": 1064.2, "eval_steps_per_second": 4.157, "step": 26500 }, { "epoch": 4.105839416058394, "grad_norm": 1.6209259033203125, "learning_rate": 4.219313751705213e-05, "loss": 2.7007, "step": 27000 }, { "epoch": 4.105839416058394, "eval_loss": 2.6068313121795654, "eval_runtime": 395.5688, "eval_samples_per_second": 1063.911, "eval_steps_per_second": 4.156, "step": 27000 }, { "epoch": 4.181873479318734, "grad_norm": 1.616698980331421, "learning_rate": 4.189587930102075e-05, "loss": 2.6858, "step": 27500 }, { "epoch": 4.181873479318734, "eval_loss": 2.5947837829589844, "eval_runtime": 395.5316, "eval_samples_per_second": 1064.011, "eval_steps_per_second": 4.156, "step": 27500 }, { "epoch": 4.257907542579075, "grad_norm": 1.6252193450927734, "learning_rate": 4.1594157362893294e-05, "loss": 2.6748, "step": 28000 }, { "epoch": 4.257907542579075, "eval_loss": 2.5821821689605713, "eval_runtime": 395.6317, "eval_samples_per_second": 1063.742, "eval_steps_per_second": 4.155, "step": 28000 }, { "epoch": 4.333941605839416, "grad_norm": 1.5178853273391724, "learning_rate": 4.1288051414584004e-05, "loss": 2.672, "step": 28500 }, { "epoch": 4.333941605839416, "eval_loss": 2.566763162612915, "eval_runtime": 395.3024, "eval_samples_per_second": 1064.628, "eval_steps_per_second": 4.159, "step": 28500 }, { "epoch": 4.409975669099756, "grad_norm": 1.6428803205490112, "learning_rate": 4.097764232621873e-05, "loss": 2.6498, "step": 29000 }, { "epoch": 4.409975669099756, "eval_loss": 2.560192823410034, "eval_runtime": 395.2916, "eval_samples_per_second": 1064.657, "eval_steps_per_second": 4.159, "step": 29000 }, { "epoch": 4.486009732360097, "grad_norm": 1.546608805656433, "learning_rate": 4.066301210476981e-05, "loss": 2.6422, "step": 29500 }, { "epoch": 4.486009732360097, "eval_loss": 2.5504369735717773, "eval_runtime": 395.4025, "eval_samples_per_second": 1064.358, "eval_steps_per_second": 4.158, "step": 29500 }, { "epoch": 4.562043795620438, "grad_norm": 1.6463203430175781, "learning_rate": 4.034424387239068e-05, "loss": 2.6334, "step": 30000 }, { "epoch": 4.562043795620438, "eval_loss": 2.540264844894409, "eval_runtime": 395.609, "eval_samples_per_second": 1063.803, "eval_steps_per_second": 4.156, "step": 30000 }, { "epoch": 4.638077858880779, "grad_norm": 1.69281005859375, "learning_rate": 4.002142184445579e-05, "loss": 2.6246, "step": 30500 }, { "epoch": 4.638077858880779, "eval_loss": 2.529710292816162, "eval_runtime": 395.534, "eval_samples_per_second": 1064.005, "eval_steps_per_second": 4.156, "step": 30500 }, { "epoch": 4.714111922141119, "grad_norm": 1.4954875707626343, "learning_rate": 3.969594626065171e-05, "loss": 2.6194, "step": 31000 }, { "epoch": 4.714111922141119, "eval_loss": 2.5173487663269043, "eval_runtime": 395.5366, "eval_samples_per_second": 1063.998, "eval_steps_per_second": 4.156, "step": 31000 }, { "epoch": 4.79014598540146, "grad_norm": 1.586890459060669, "learning_rate": 3.936528890443755e-05, "loss": 2.6044, "step": 31500 }, { "epoch": 4.79014598540146, "eval_loss": 2.509347438812256, "eval_runtime": 395.6037, "eval_samples_per_second": 1063.817, "eval_steps_per_second": 4.156, "step": 31500 }, { "epoch": 4.866180048661801, "grad_norm": 1.4862339496612549, "learning_rate": 3.903083638276577e-05, "loss": 2.585, "step": 32000 }, { "epoch": 4.866180048661801, "eval_loss": 2.498917579650879, "eval_runtime": 395.4783, "eval_samples_per_second": 1064.154, "eval_steps_per_second": 4.157, "step": 32000 }, { "epoch": 4.942214111922141, "grad_norm": 1.6119396686553955, "learning_rate": 3.869267705464299e-05, "loss": 2.5825, "step": 32500 }, { "epoch": 4.942214111922141, "eval_loss": 2.4927380084991455, "eval_runtime": 395.5817, "eval_samples_per_second": 1063.876, "eval_steps_per_second": 4.156, "step": 32500 }, { "epoch": 5.018248175182482, "grad_norm": 1.5895634889602661, "learning_rate": 3.835090025837699e-05, "loss": 2.5708, "step": 33000 }, { "epoch": 5.018248175182482, "eval_loss": 2.4862186908721924, "eval_runtime": 395.6219, "eval_samples_per_second": 1063.768, "eval_steps_per_second": 4.155, "step": 33000 }, { "epoch": 5.094282238442823, "grad_norm": 1.6652857065200806, "learning_rate": 3.800559628797438e-05, "loss": 2.5612, "step": 33500 }, { "epoch": 5.094282238442823, "eval_loss": 2.475658416748047, "eval_runtime": 394.9698, "eval_samples_per_second": 1065.525, "eval_steps_per_second": 4.162, "step": 33500 }, { "epoch": 5.170316301703163, "grad_norm": 1.6712974309921265, "learning_rate": 3.765685636928585e-05, "loss": 2.5508, "step": 34000 }, { "epoch": 5.170316301703163, "eval_loss": 2.4684622287750244, "eval_runtime": 394.7029, "eval_samples_per_second": 1066.245, "eval_steps_per_second": 4.165, "step": 34000 }, { "epoch": 5.2463503649635035, "grad_norm": 1.7370678186416626, "learning_rate": 3.7305480078818275e-05, "loss": 2.5517, "step": 34500 }, { "epoch": 5.2463503649635035, "eval_loss": 2.4651219844818115, "eval_runtime": 395.1235, "eval_samples_per_second": 1065.11, "eval_steps_per_second": 4.161, "step": 34500 }, { "epoch": 5.322384428223844, "grad_norm": 1.6240907907485962, "learning_rate": 3.6950151955931227e-05, "loss": 2.536, "step": 35000 }, { "epoch": 5.322384428223844, "eval_loss": 2.4535937309265137, "eval_runtime": 394.9571, "eval_samples_per_second": 1065.559, "eval_steps_per_second": 4.162, "step": 35000 }, { "epoch": 5.398418491484185, "grad_norm": 1.8107973337173462, "learning_rate": 3.659166672258033e-05, "loss": 2.5362, "step": 35500 }, { "epoch": 5.398418491484185, "eval_loss": 2.4444773197174072, "eval_runtime": 395.3, "eval_samples_per_second": 1064.635, "eval_steps_per_second": 4.159, "step": 35500 }, { "epoch": 5.474452554744525, "grad_norm": 1.550801396369934, "learning_rate": 3.623011908697394e-05, "loss": 2.5267, "step": 36000 }, { "epoch": 5.474452554744525, "eval_loss": 2.4367120265960693, "eval_runtime": 395.2579, "eval_samples_per_second": 1064.748, "eval_steps_per_second": 4.159, "step": 36000 }, { "epoch": 5.550486618004866, "grad_norm": 1.4852931499481201, "learning_rate": 3.5866336492488555e-05, "loss": 2.5165, "step": 36500 }, { "epoch": 5.550486618004866, "eval_loss": 2.431751251220703, "eval_runtime": 395.2301, "eval_samples_per_second": 1064.823, "eval_steps_per_second": 4.16, "step": 36500 }, { "epoch": 5.626520681265207, "grad_norm": 1.603376865386963, "learning_rate": 3.5498957032536564e-05, "loss": 2.5194, "step": 37000 }, { "epoch": 5.626520681265207, "eval_loss": 2.4255075454711914, "eval_runtime": 395.4389, "eval_samples_per_second": 1064.26, "eval_steps_per_second": 4.157, "step": 37000 }, { "epoch": 5.702554744525547, "grad_norm": 1.6579174995422363, "learning_rate": 3.512880385328552e-05, "loss": 2.5063, "step": 37500 }, { "epoch": 5.702554744525547, "eval_loss": 2.4162917137145996, "eval_runtime": 395.2478, "eval_samples_per_second": 1064.775, "eval_steps_per_second": 4.159, "step": 37500 }, { "epoch": 5.778588807785888, "grad_norm": 1.6467429399490356, "learning_rate": 3.475597474549821e-05, "loss": 2.4969, "step": 38000 }, { "epoch": 5.778588807785888, "eval_loss": 2.4108052253723145, "eval_runtime": 395.1001, "eval_samples_per_second": 1065.173, "eval_steps_per_second": 4.161, "step": 38000 }, { "epoch": 5.854622871046229, "grad_norm": 1.6167348623275757, "learning_rate": 3.438056820689096e-05, "loss": 2.492, "step": 38500 }, { "epoch": 5.854622871046229, "eval_loss": 2.402526617050171, "eval_runtime": 395.2077, "eval_samples_per_second": 1064.883, "eval_steps_per_second": 4.16, "step": 38500 }, { "epoch": 5.930656934306569, "grad_norm": 1.7401496171951294, "learning_rate": 3.400344159273908e-05, "loss": 2.4729, "step": 39000 }, { "epoch": 5.930656934306569, "eval_loss": 2.3961234092712402, "eval_runtime": 395.2683, "eval_samples_per_second": 1064.72, "eval_steps_per_second": 4.159, "step": 39000 }, { "epoch": 6.00669099756691, "grad_norm": 1.7321972846984863, "learning_rate": 3.3623183039946427e-05, "loss": 2.4753, "step": 39500 }, { "epoch": 6.00669099756691, "eval_loss": 2.390777826309204, "eval_runtime": 395.3927, "eval_samples_per_second": 1064.385, "eval_steps_per_second": 4.158, "step": 39500 }, { "epoch": 6.082725060827251, "grad_norm": 1.6455748081207275, "learning_rate": 3.3240646328557325e-05, "loss": 2.4653, "step": 40000 }, { "epoch": 6.082725060827251, "eval_loss": 2.385394334793091, "eval_runtime": 395.2314, "eval_samples_per_second": 1064.819, "eval_steps_per_second": 4.16, "step": 40000 }, { "epoch": 6.158759124087592, "grad_norm": 1.6246484518051147, "learning_rate": 3.2855932520939756e-05, "loss": 2.4552, "step": 40500 }, { "epoch": 6.158759124087592, "eval_loss": 2.3780696392059326, "eval_runtime": 395.2284, "eval_samples_per_second": 1064.827, "eval_steps_per_second": 4.16, "step": 40500 }, { "epoch": 6.234793187347932, "grad_norm": 1.6907716989517212, "learning_rate": 3.246914325462873e-05, "loss": 2.4577, "step": 41000 }, { "epoch": 6.234793187347932, "eval_loss": 2.3710057735443115, "eval_runtime": 395.2817, "eval_samples_per_second": 1064.684, "eval_steps_per_second": 4.159, "step": 41000 }, { "epoch": 6.3108272506082725, "grad_norm": 1.733163595199585, "learning_rate": 3.208038071547463e-05, "loss": 2.4512, "step": 41500 }, { "epoch": 6.3108272506082725, "eval_loss": 2.364978313446045, "eval_runtime": 395.0989, "eval_samples_per_second": 1065.176, "eval_steps_per_second": 4.161, "step": 41500 }, { "epoch": 6.386861313868613, "grad_norm": 1.604212999343872, "learning_rate": 3.1690530675165916e-05, "loss": 2.4419, "step": 42000 }, { "epoch": 6.386861313868613, "eval_loss": 2.3593010902404785, "eval_runtime": 394.8589, "eval_samples_per_second": 1065.824, "eval_steps_per_second": 4.164, "step": 42000 }, { "epoch": 6.4628953771289535, "grad_norm": 1.799272060394287, "learning_rate": 3.1298133637437146e-05, "loss": 2.443, "step": 42500 }, { "epoch": 6.4628953771289535, "eval_loss": 2.3553106784820557, "eval_runtime": 395.5826, "eval_samples_per_second": 1063.874, "eval_steps_per_second": 4.156, "step": 42500 }, { "epoch": 6.538929440389294, "grad_norm": 1.5894908905029297, "learning_rate": 3.0904072695878296e-05, "loss": 2.4291, "step": 43000 }, { "epoch": 6.538929440389294, "eval_loss": 2.350308656692505, "eval_runtime": 395.6156, "eval_samples_per_second": 1063.785, "eval_steps_per_second": 4.156, "step": 43000 }, { "epoch": 6.614963503649635, "grad_norm": 1.6308026313781738, "learning_rate": 3.050845195744353e-05, "loss": 2.4212, "step": 43500 }, { "epoch": 6.614963503649635, "eval_loss": 2.3425817489624023, "eval_runtime": 395.5628, "eval_samples_per_second": 1063.927, "eval_steps_per_second": 4.156, "step": 43500 }, { "epoch": 6.690997566909976, "grad_norm": 1.5576202869415283, "learning_rate": 3.011137594116975e-05, "loss": 2.4217, "step": 44000 }, { "epoch": 6.690997566909976, "eval_loss": 2.3366506099700928, "eval_runtime": 395.6852, "eval_samples_per_second": 1063.598, "eval_steps_per_second": 4.155, "step": 44000 }, { "epoch": 6.767031630170316, "grad_norm": 1.698960542678833, "learning_rate": 2.9713747681111948e-05, "loss": 2.4191, "step": 44500 }, { "epoch": 6.767031630170316, "eval_loss": 2.3311471939086914, "eval_runtime": 395.6553, "eval_samples_per_second": 1063.678, "eval_steps_per_second": 4.155, "step": 44500 }, { "epoch": 6.843065693430657, "grad_norm": 1.700810194015503, "learning_rate": 2.931407856139074e-05, "loss": 2.4101, "step": 45000 }, { "epoch": 6.843065693430657, "eval_loss": 2.326604127883911, "eval_runtime": 395.4811, "eval_samples_per_second": 1064.147, "eval_steps_per_second": 4.157, "step": 45000 }, { "epoch": 6.919099756690997, "grad_norm": 1.675718069076538, "learning_rate": 2.8913269705319878e-05, "loss": 2.4092, "step": 45500 }, { "epoch": 6.919099756690997, "eval_loss": 2.3215043544769287, "eval_runtime": 395.6152, "eval_samples_per_second": 1063.786, "eval_steps_per_second": 4.156, "step": 45500 }, { "epoch": 6.995133819951338, "grad_norm": 1.7430431842803955, "learning_rate": 2.851142700258497e-05, "loss": 2.4028, "step": 46000 }, { "epoch": 6.995133819951338, "eval_loss": 2.3190836906433105, "eval_runtime": 395.7789, "eval_samples_per_second": 1063.346, "eval_steps_per_second": 4.154, "step": 46000 }, { "epoch": 7.071167883211679, "grad_norm": 1.7376880645751953, "learning_rate": 2.8108656616003542e-05, "loss": 2.393, "step": 46500 }, { "epoch": 7.071167883211679, "eval_loss": 2.314730167388916, "eval_runtime": 395.8715, "eval_samples_per_second": 1063.097, "eval_steps_per_second": 4.153, "step": 46500 }, { "epoch": 7.14720194647202, "grad_norm": 1.647200584411621, "learning_rate": 2.7705064953477926e-05, "loss": 2.3864, "step": 47000 }, { "epoch": 7.14720194647202, "eval_loss": 2.3095407485961914, "eval_runtime": 392.0209, "eval_samples_per_second": 1073.54, "eval_steps_per_second": 4.194, "step": 47000 }, { "epoch": 7.22323600973236, "grad_norm": 1.5628902912139893, "learning_rate": 2.7300758639883305e-05, "loss": 2.3853, "step": 47500 }, { "epoch": 7.22323600973236, "eval_loss": 2.3034095764160156, "eval_runtime": 392.407, "eval_samples_per_second": 1072.483, "eval_steps_per_second": 4.19, "step": 47500 }, { "epoch": 7.299270072992701, "grad_norm": 1.6254950761795044, "learning_rate": 2.6896654852743762e-05, "loss": 2.3778, "step": 48000 }, { "epoch": 7.299270072992701, "eval_loss": 2.3009138107299805, "eval_runtime": 392.2743, "eval_samples_per_second": 1072.846, "eval_steps_per_second": 4.191, "step": 48000 }, { "epoch": 7.375304136253042, "grad_norm": 1.7831765413284302, "learning_rate": 2.6491240733505536e-05, "loss": 2.3902, "step": 48500 }, { "epoch": 7.375304136253042, "eval_loss": 2.2940807342529297, "eval_runtime": 392.0933, "eval_samples_per_second": 1073.342, "eval_steps_per_second": 4.193, "step": 48500 }, { "epoch": 7.451338199513382, "grad_norm": 1.7135417461395264, "learning_rate": 2.608543264340055e-05, "loss": 2.3734, "step": 49000 }, { "epoch": 7.451338199513382, "eval_loss": 2.2903780937194824, "eval_runtime": 392.3395, "eval_samples_per_second": 1072.668, "eval_steps_per_second": 4.19, "step": 49000 }, { "epoch": 7.5273722627737225, "grad_norm": 1.7215466499328613, "learning_rate": 2.5679337792861973e-05, "loss": 2.3644, "step": 49500 }, { "epoch": 7.5273722627737225, "eval_loss": 2.2882533073425293, "eval_runtime": 391.7386, "eval_samples_per_second": 1074.313, "eval_steps_per_second": 4.197, "step": 49500 }, { "epoch": 7.603406326034063, "grad_norm": 1.5934220552444458, "learning_rate": 2.527306346808222e-05, "loss": 2.3644, "step": 50000 }, { "epoch": 7.603406326034063, "eval_loss": 2.278449296951294, "eval_runtime": 392.029, "eval_samples_per_second": 1073.517, "eval_steps_per_second": 4.194, "step": 50000 }, { "epoch": 7.679440389294404, "grad_norm": 1.734836459159851, "learning_rate": 2.4866717002668977e-05, "loss": 2.3643, "step": 50500 }, { "epoch": 7.679440389294404, "eval_loss": 2.2776286602020264, "eval_runtime": 391.9926, "eval_samples_per_second": 1073.617, "eval_steps_per_second": 4.194, "step": 50500 }, { "epoch": 7.755474452554744, "grad_norm": 1.6759928464889526, "learning_rate": 2.4461218265301844e-05, "loss": 2.3549, "step": 51000 }, { "epoch": 7.755474452554744, "eval_loss": 2.275527000427246, "eval_runtime": 392.0053, "eval_samples_per_second": 1073.582, "eval_steps_per_second": 4.194, "step": 51000 }, { "epoch": 7.831508515815085, "grad_norm": 1.6229385137557983, "learning_rate": 2.4055049175099393e-05, "loss": 2.3475, "step": 51500 }, { "epoch": 7.831508515815085, "eval_loss": 2.269463539123535, "eval_runtime": 392.7325, "eval_samples_per_second": 1071.594, "eval_steps_per_second": 4.186, "step": 51500 }, { "epoch": 7.907542579075426, "grad_norm": 1.5919690132141113, "learning_rate": 2.3649129731441017e-05, "loss": 2.3556, "step": 52000 }, { "epoch": 7.907542579075426, "eval_loss": 2.2632956504821777, "eval_runtime": 392.8483, "eval_samples_per_second": 1071.279, "eval_steps_per_second": 4.185, "step": 52000 }, { "epoch": 7.983576642335766, "grad_norm": 1.6283611059188843, "learning_rate": 2.32435671741784e-05, "loss": 2.3441, "step": 52500 }, { "epoch": 7.983576642335766, "eval_loss": 2.2631113529205322, "eval_runtime": 393.1076, "eval_samples_per_second": 1070.572, "eval_steps_per_second": 4.182, "step": 52500 }, { "epoch": 8.059610705596107, "grad_norm": 1.6927645206451416, "learning_rate": 2.2838468648877376e-05, "loss": 2.3396, "step": 53000 }, { "epoch": 8.059610705596107, "eval_loss": 2.2605204582214355, "eval_runtime": 393.0545, "eval_samples_per_second": 1070.717, "eval_steps_per_second": 4.183, "step": 53000 }, { "epoch": 8.135644768856448, "grad_norm": 1.6524484157562256, "learning_rate": 2.2433941178511185e-05, "loss": 2.3281, "step": 53500 }, { "epoch": 8.135644768856448, "eval_loss": 2.255591869354248, "eval_runtime": 393.065, "eval_samples_per_second": 1070.688, "eval_steps_per_second": 4.183, "step": 53500 }, { "epoch": 8.211678832116789, "grad_norm": 1.8136180639266968, "learning_rate": 2.2030091635186097e-05, "loss": 2.3251, "step": 54000 }, { "epoch": 8.211678832116789, "eval_loss": 2.2528815269470215, "eval_runtime": 393.1403, "eval_samples_per_second": 1070.483, "eval_steps_per_second": 4.182, "step": 54000 }, { "epoch": 8.28771289537713, "grad_norm": 1.7461555004119873, "learning_rate": 2.1627831987887616e-05, "loss": 2.3252, "step": 54500 }, { "epoch": 8.28771289537713, "eval_loss": 2.247727155685425, "eval_runtime": 394.607, "eval_samples_per_second": 1066.504, "eval_steps_per_second": 4.166, "step": 54500 }, { "epoch": 8.363746958637469, "grad_norm": 1.6148008108139038, "learning_rate": 2.1225656282037674e-05, "loss": 2.3231, "step": 55000 }, { "epoch": 8.363746958637469, "eval_loss": 2.245650291442871, "eval_runtime": 393.1496, "eval_samples_per_second": 1070.458, "eval_steps_per_second": 4.182, "step": 55000 }, { "epoch": 8.43978102189781, "grad_norm": 1.5390928983688354, "learning_rate": 2.082447771999728e-05, "loss": 2.3218, "step": 55500 }, { "epoch": 8.43978102189781, "eval_loss": 2.240283489227295, "eval_runtime": 393.128, "eval_samples_per_second": 1070.517, "eval_steps_per_second": 4.182, "step": 55500 }, { "epoch": 8.51581508515815, "grad_norm": 1.7353328466415405, "learning_rate": 2.0424402289124667e-05, "loss": 2.3113, "step": 56000 }, { "epoch": 8.51581508515815, "eval_loss": 2.236283540725708, "eval_runtime": 392.9933, "eval_samples_per_second": 1070.883, "eval_steps_per_second": 4.183, "step": 56000 }, { "epoch": 8.591849148418492, "grad_norm": 1.6553759574890137, "learning_rate": 2.0025535685341834e-05, "loss": 2.3137, "step": 56500 }, { "epoch": 8.591849148418492, "eval_loss": 2.2341954708099365, "eval_runtime": 394.1952, "eval_samples_per_second": 1067.618, "eval_steps_per_second": 4.171, "step": 56500 }, { "epoch": 8.667883211678832, "grad_norm": 1.6300148963928223, "learning_rate": 1.9627983285210795e-05, "loss": 2.3153, "step": 57000 }, { "epoch": 8.667883211678832, "eval_loss": 2.2316806316375732, "eval_runtime": 394.4429, "eval_samples_per_second": 1066.948, "eval_steps_per_second": 4.168, "step": 57000 }, { "epoch": 8.743917274939173, "grad_norm": 1.7760825157165527, "learning_rate": 1.9231850118094083e-05, "loss": 2.3086, "step": 57500 }, { "epoch": 8.743917274939173, "eval_loss": 2.2260444164276123, "eval_runtime": 394.1825, "eval_samples_per_second": 1067.653, "eval_steps_per_second": 4.171, "step": 57500 }, { "epoch": 8.819951338199512, "grad_norm": 1.6700938940048218, "learning_rate": 1.883724083840713e-05, "loss": 2.3051, "step": 58000 }, { "epoch": 8.819951338199512, "eval_loss": 2.2262229919433594, "eval_runtime": 394.2521, "eval_samples_per_second": 1067.464, "eval_steps_per_second": 4.17, "step": 58000 }, { "epoch": 8.895985401459853, "grad_norm": 1.6361171007156372, "learning_rate": 1.8445043966286124e-05, "loss": 2.2996, "step": 58500 }, { "epoch": 8.895985401459853, "eval_loss": 2.2197461128234863, "eval_runtime": 394.2947, "eval_samples_per_second": 1067.349, "eval_steps_per_second": 4.169, "step": 58500 }, { "epoch": 8.972019464720194, "grad_norm": 1.5987651348114014, "learning_rate": 1.805379121954309e-05, "loss": 2.295, "step": 59000 }, { "epoch": 8.972019464720194, "eval_loss": 2.218661069869995, "eval_runtime": 394.4471, "eval_samples_per_second": 1066.936, "eval_steps_per_second": 4.168, "step": 59000 }, { "epoch": 9.048053527980535, "grad_norm": 1.6805070638656616, "learning_rate": 1.7664373591592323e-05, "loss": 2.2898, "step": 59500 }, { "epoch": 9.048053527980535, "eval_loss": 2.2158923149108887, "eval_runtime": 394.3964, "eval_samples_per_second": 1067.074, "eval_steps_per_second": 4.168, "step": 59500 }, { "epoch": 9.124087591240876, "grad_norm": 1.559171199798584, "learning_rate": 1.727689396267106e-05, "loss": 2.294, "step": 60000 }, { "epoch": 9.124087591240876, "eval_loss": 2.213304281234741, "eval_runtime": 394.3761, "eval_samples_per_second": 1067.129, "eval_steps_per_second": 4.169, "step": 60000 }, { "epoch": 9.200121654501217, "grad_norm": 1.7154414653778076, "learning_rate": 1.689145470101657e-05, "loss": 2.2905, "step": 60500 }, { "epoch": 9.200121654501217, "eval_loss": 2.211729049682617, "eval_runtime": 394.4483, "eval_samples_per_second": 1066.933, "eval_steps_per_second": 4.168, "step": 60500 }, { "epoch": 9.276155717761558, "grad_norm": 1.7217854261398315, "learning_rate": 1.6508922024636513e-05, "loss": 2.2776, "step": 61000 }, { "epoch": 9.276155717761558, "eval_loss": 2.2076163291931152, "eval_runtime": 394.2479, "eval_samples_per_second": 1067.476, "eval_steps_per_second": 4.17, "step": 61000 }, { "epoch": 9.352189781021897, "grad_norm": 1.6988067626953125, "learning_rate": 1.6127863831556155e-05, "loss": 2.2888, "step": 61500 }, { "epoch": 9.352189781021897, "eval_loss": 2.2073538303375244, "eval_runtime": 394.4185, "eval_samples_per_second": 1067.014, "eval_steps_per_second": 4.168, "step": 61500 }, { "epoch": 9.428223844282238, "grad_norm": 1.6594995260238647, "learning_rate": 1.5749149567995482e-05, "loss": 2.2737, "step": 62000 }, { "epoch": 9.428223844282238, "eval_loss": 2.2045233249664307, "eval_runtime": 394.3688, "eval_samples_per_second": 1067.148, "eval_steps_per_second": 4.169, "step": 62000 }, { "epoch": 9.504257907542579, "grad_norm": 1.782347321510315, "learning_rate": 1.537287928647002e-05, "loss": 2.2715, "step": 62500 }, { "epoch": 9.504257907542579, "eval_loss": 2.1984219551086426, "eval_runtime": 394.2219, "eval_samples_per_second": 1067.546, "eval_steps_per_second": 4.17, "step": 62500 }, { "epoch": 9.58029197080292, "grad_norm": 1.7212417125701904, "learning_rate": 1.4999897243562522e-05, "loss": 2.2736, "step": 63000 }, { "epoch": 9.58029197080292, "eval_loss": 2.200115919113159, "eval_runtime": 394.4095, "eval_samples_per_second": 1067.038, "eval_steps_per_second": 4.168, "step": 63000 }, { "epoch": 9.65632603406326, "grad_norm": 1.636083722114563, "learning_rate": 1.4628807092364161e-05, "loss": 2.2714, "step": 63500 }, { "epoch": 9.65632603406326, "eval_loss": 2.196516752243042, "eval_runtime": 394.3398, "eval_samples_per_second": 1067.227, "eval_steps_per_second": 4.169, "step": 63500 }, { "epoch": 9.732360097323602, "grad_norm": 1.669154405593872, "learning_rate": 1.4260456906462644e-05, "loss": 2.2581, "step": 64000 }, { "epoch": 9.732360097323602, "eval_loss": 2.1947672367095947, "eval_runtime": 394.2775, "eval_samples_per_second": 1067.396, "eval_steps_per_second": 4.17, "step": 64000 }, { "epoch": 9.808394160583942, "grad_norm": 1.5820955038070679, "learning_rate": 1.3894944000287996e-05, "loss": 2.2673, "step": 64500 }, { "epoch": 9.808394160583942, "eval_loss": 2.1930572986602783, "eval_runtime": 394.3185, "eval_samples_per_second": 1067.284, "eval_steps_per_second": 4.169, "step": 64500 }, { "epoch": 9.884428223844282, "grad_norm": 1.878128170967102, "learning_rate": 1.3532364938689365e-05, "loss": 2.2532, "step": 65000 }, { "epoch": 9.884428223844282, "eval_loss": 2.186814069747925, "eval_runtime": 394.1633, "eval_samples_per_second": 1067.705, "eval_steps_per_second": 4.171, "step": 65000 }, { "epoch": 9.960462287104622, "grad_norm": 1.6541669368743896, "learning_rate": 1.3172815511423497e-05, "loss": 2.2599, "step": 65500 }, { "epoch": 9.960462287104622, "eval_loss": 2.186183452606201, "eval_runtime": 394.274, "eval_samples_per_second": 1067.405, "eval_steps_per_second": 4.17, "step": 65500 }, { "epoch": 10.036496350364963, "grad_norm": 1.6656322479248047, "learning_rate": 1.2817100376353228e-05, "loss": 2.2626, "step": 66000 }, { "epoch": 10.036496350364963, "eval_loss": 2.1833560466766357, "eval_runtime": 394.4838, "eval_samples_per_second": 1066.837, "eval_steps_per_second": 4.167, "step": 66000 }, { "epoch": 10.112530413625304, "grad_norm": 1.64789617061615, "learning_rate": 1.246388782934231e-05, "loss": 2.2476, "step": 66500 }, { "epoch": 10.112530413625304, "eval_loss": 2.1836633682250977, "eval_runtime": 394.475, "eval_samples_per_second": 1066.861, "eval_steps_per_second": 4.168, "step": 66500 }, { "epoch": 10.188564476885645, "grad_norm": 1.626693844795227, "learning_rate": 1.2113987197615472e-05, "loss": 2.2597, "step": 67000 }, { "epoch": 10.188564476885645, "eval_loss": 2.177664041519165, "eval_runtime": 394.4402, "eval_samples_per_second": 1066.955, "eval_steps_per_second": 4.168, "step": 67000 }, { "epoch": 10.264598540145986, "grad_norm": 1.660078525543213, "learning_rate": 1.1767490921415291e-05, "loss": 2.2525, "step": 67500 }, { "epoch": 10.264598540145986, "eval_loss": 2.177150011062622, "eval_runtime": 394.2691, "eval_samples_per_second": 1067.418, "eval_steps_per_second": 4.17, "step": 67500 }, { "epoch": 10.340632603406325, "grad_norm": 1.6624382734298706, "learning_rate": 1.1424490541587752e-05, "loss": 2.2477, "step": 68000 }, { "epoch": 10.340632603406325, "eval_loss": 2.175464630126953, "eval_runtime": 394.3358, "eval_samples_per_second": 1067.238, "eval_steps_per_second": 4.169, "step": 68000 }, { "epoch": 10.416666666666666, "grad_norm": 1.7029284238815308, "learning_rate": 1.1085076675397963e-05, "loss": 2.2442, "step": 68500 }, { "epoch": 10.416666666666666, "eval_loss": 2.172318935394287, "eval_runtime": 394.363, "eval_samples_per_second": 1067.164, "eval_steps_per_second": 4.169, "step": 68500 }, { "epoch": 10.492700729927007, "grad_norm": 1.7094260454177856, "learning_rate": 1.0750006740005564e-05, "loss": 2.2461, "step": 69000 }, { "epoch": 10.492700729927007, "eval_loss": 2.1725075244903564, "eval_runtime": 394.3359, "eval_samples_per_second": 1067.237, "eval_steps_per_second": 4.169, "step": 69000 }, { "epoch": 10.568734793187348, "grad_norm": 1.7138928174972534, "learning_rate": 1.04180263214852e-05, "loss": 2.2428, "step": 69500 }, { "epoch": 10.568734793187348, "eval_loss": 2.1679632663726807, "eval_runtime": 394.3498, "eval_samples_per_second": 1067.2, "eval_steps_per_second": 4.169, "step": 69500 }, { "epoch": 10.644768856447689, "grad_norm": 1.7748503684997559, "learning_rate": 1.0089898314369628e-05, "loss": 2.2409, "step": 70000 }, { "epoch": 10.644768856447689, "eval_loss": 2.167714834213257, "eval_runtime": 394.3096, "eval_samples_per_second": 1067.308, "eval_steps_per_second": 4.169, "step": 70000 }, { "epoch": 10.72080291970803, "grad_norm": 1.8225022554397583, "learning_rate": 9.765709406792067e-06, "loss": 2.2421, "step": 70500 }, { "epoch": 10.72080291970803, "eval_loss": 2.1677842140197754, "eval_runtime": 394.4354, "eval_samples_per_second": 1066.968, "eval_steps_per_second": 4.168, "step": 70500 }, { "epoch": 10.79683698296837, "grad_norm": 1.682428002357483, "learning_rate": 9.445545246215093e-06, "loss": 2.2405, "step": 71000 }, { "epoch": 10.79683698296837, "eval_loss": 2.162020206451416, "eval_runtime": 394.4337, "eval_samples_per_second": 1066.973, "eval_steps_per_second": 4.168, "step": 71000 }, { "epoch": 10.87287104622871, "grad_norm": 1.8187251091003418, "learning_rate": 9.130118369667984e-06, "loss": 2.2338, "step": 71500 }, { "epoch": 10.87287104622871, "eval_loss": 2.161623001098633, "eval_runtime": 394.3265, "eval_samples_per_second": 1067.263, "eval_steps_per_second": 4.169, "step": 71500 }, { "epoch": 10.94890510948905, "grad_norm": 1.586653470993042, "learning_rate": 8.818247901683923e-06, "loss": 2.2291, "step": 72000 }, { "epoch": 10.94890510948905, "eval_loss": 2.1573026180267334, "eval_runtime": 394.3904, "eval_samples_per_second": 1067.09, "eval_steps_per_second": 4.168, "step": 72000 }, { "epoch": 11.024939172749392, "grad_norm": 1.6375211477279663, "learning_rate": 8.510652490541102e-06, "loss": 2.2337, "step": 72500 }, { "epoch": 11.024939172749392, "eval_loss": 2.158447027206421, "eval_runtime": 394.8845, "eval_samples_per_second": 1065.755, "eval_steps_per_second": 4.163, "step": 72500 }, { "epoch": 11.100973236009732, "grad_norm": 1.9024183750152588, "learning_rate": 8.207413399866525e-06, "loss": 2.2243, "step": 73000 }, { "epoch": 11.100973236009732, "eval_loss": 2.1577627658843994, "eval_runtime": 394.3929, "eval_samples_per_second": 1067.083, "eval_steps_per_second": 4.168, "step": 73000 }, { "epoch": 11.177007299270073, "grad_norm": 1.6612706184387207, "learning_rate": 7.908610742390934e-06, "loss": 2.2206, "step": 73500 }, { "epoch": 11.177007299270073, "eval_loss": 2.156655788421631, "eval_runtime": 394.4918, "eval_samples_per_second": 1066.816, "eval_steps_per_second": 4.167, "step": 73500 }, { "epoch": 11.253041362530414, "grad_norm": 1.6041182279586792, "learning_rate": 7.614323458783904e-06, "loss": 2.2316, "step": 74000 }, { "epoch": 11.253041362530414, "eval_loss": 2.154806137084961, "eval_runtime": 394.5111, "eval_samples_per_second": 1066.763, "eval_steps_per_second": 4.167, "step": 74000 }, { "epoch": 11.329075425790755, "grad_norm": 1.7304446697235107, "learning_rate": 7.324629296798397e-06, "loss": 2.2252, "step": 74500 }, { "epoch": 11.329075425790755, "eval_loss": 2.1519484519958496, "eval_runtime": 394.2907, "eval_samples_per_second": 1067.36, "eval_steps_per_second": 4.17, "step": 74500 }, { "epoch": 11.405109489051094, "grad_norm": 1.6792948246002197, "learning_rate": 7.039604790730683e-06, "loss": 2.2257, "step": 75000 }, { "epoch": 11.405109489051094, "eval_loss": 2.1538424491882324, "eval_runtime": 394.5221, "eval_samples_per_second": 1066.734, "eval_steps_per_second": 4.167, "step": 75000 }, { "epoch": 11.481143552311435, "grad_norm": 1.5765753984451294, "learning_rate": 6.7598810154057336e-06, "loss": 2.2252, "step": 75500 }, { "epoch": 11.481143552311435, "eval_loss": 2.1519691944122314, "eval_runtime": 394.4824, "eval_samples_per_second": 1066.841, "eval_steps_per_second": 4.167, "step": 75500 }, { "epoch": 11.557177615571776, "grad_norm": 1.644453525543213, "learning_rate": 6.484410758400267e-06, "loss": 2.2228, "step": 76000 }, { "epoch": 11.557177615571776, "eval_loss": 2.1509506702423096, "eval_runtime": 394.5661, "eval_samples_per_second": 1066.615, "eval_steps_per_second": 4.167, "step": 76000 }, { "epoch": 11.633211678832117, "grad_norm": 1.7033356428146362, "learning_rate": 6.213832134635486e-06, "loss": 2.2217, "step": 76500 }, { "epoch": 11.633211678832117, "eval_loss": 2.1477901935577393, "eval_runtime": 394.5248, "eval_samples_per_second": 1066.726, "eval_steps_per_second": 4.167, "step": 76500 }, { "epoch": 11.709245742092458, "grad_norm": 1.6563267707824707, "learning_rate": 5.948216628273909e-06, "loss": 2.2135, "step": 77000 }, { "epoch": 11.709245742092458, "eval_loss": 2.1486401557922363, "eval_runtime": 394.3353, "eval_samples_per_second": 1067.239, "eval_steps_per_second": 4.169, "step": 77000 }, { "epoch": 11.785279805352799, "grad_norm": 1.6282879114151, "learning_rate": 5.687634412272127e-06, "loss": 2.2254, "step": 77500 }, { "epoch": 11.785279805352799, "eval_loss": 2.1465682983398438, "eval_runtime": 394.4898, "eval_samples_per_second": 1066.821, "eval_steps_per_second": 4.167, "step": 77500 }, { "epoch": 11.861313868613138, "grad_norm": 1.7813278436660767, "learning_rate": 5.432154329841835e-06, "loss": 2.2166, "step": 78000 }, { "epoch": 11.861313868613138, "eval_loss": 2.14347505569458, "eval_runtime": 394.4933, "eval_samples_per_second": 1066.812, "eval_steps_per_second": 4.167, "step": 78000 }, { "epoch": 11.937347931873479, "grad_norm": 1.723649024963379, "learning_rate": 5.181843876262127e-06, "loss": 2.2181, "step": 78500 }, { "epoch": 11.937347931873479, "eval_loss": 2.1440093517303467, "eval_runtime": 394.3682, "eval_samples_per_second": 1067.15, "eval_steps_per_second": 4.169, "step": 78500 }, { "epoch": 12.01338199513382, "grad_norm": 1.7719519138336182, "learning_rate": 4.936769181047937e-06, "loss": 2.2092, "step": 79000 }, { "epoch": 12.01338199513382, "eval_loss": 2.141754388809204, "eval_runtime": 394.1783, "eval_samples_per_second": 1067.664, "eval_steps_per_second": 4.171, "step": 79000 }, { "epoch": 12.08941605839416, "grad_norm": 1.696637749671936, "learning_rate": 4.697469206617919e-06, "loss": 2.2007, "step": 79500 }, { "epoch": 12.08941605839416, "eval_loss": 2.1432430744171143, "eval_runtime": 394.2858, "eval_samples_per_second": 1067.373, "eval_steps_per_second": 4.17, "step": 79500 }, { "epoch": 12.165450121654501, "grad_norm": 1.6854994297027588, "learning_rate": 4.463511524513736e-06, "loss": 2.2084, "step": 80000 }, { "epoch": 12.165450121654501, "eval_loss": 2.141733407974243, "eval_runtime": 394.4029, "eval_samples_per_second": 1067.056, "eval_steps_per_second": 4.168, "step": 80000 }, { "epoch": 12.241484184914842, "grad_norm": 1.6496477127075195, "learning_rate": 4.2345051393941574e-06, "loss": 2.2089, "step": 80500 }, { "epoch": 12.241484184914842, "eval_loss": 2.139671802520752, "eval_runtime": 394.4989, "eval_samples_per_second": 1066.796, "eval_steps_per_second": 4.167, "step": 80500 }, { "epoch": 12.317518248175183, "grad_norm": 1.6591581106185913, "learning_rate": 4.010984790046615e-06, "loss": 2.2058, "step": 81000 }, { "epoch": 12.317518248175183, "eval_loss": 2.1399948596954346, "eval_runtime": 394.4647, "eval_samples_per_second": 1066.889, "eval_steps_per_second": 4.168, "step": 81000 }, { "epoch": 12.393552311435522, "grad_norm": 1.7192113399505615, "learning_rate": 3.7930095283087966e-06, "loss": 2.2059, "step": 81500 }, { "epoch": 12.393552311435522, "eval_loss": 2.1405417919158936, "eval_runtime": 394.3798, "eval_samples_per_second": 1067.118, "eval_steps_per_second": 4.169, "step": 81500 }, { "epoch": 12.469586374695863, "grad_norm": 1.6483603715896606, "learning_rate": 3.5806369410618047e-06, "loss": 2.2144, "step": 82000 }, { "epoch": 12.469586374695863, "eval_loss": 2.1386895179748535, "eval_runtime": 394.4506, "eval_samples_per_second": 1066.927, "eval_steps_per_second": 4.168, "step": 82000 }, { "epoch": 12.545620437956204, "grad_norm": 1.6323285102844238, "learning_rate": 3.3739231350162437e-06, "loss": 2.2076, "step": 82500 }, { "epoch": 12.545620437956204, "eval_loss": 2.1366796493530273, "eval_runtime": 394.6499, "eval_samples_per_second": 1066.388, "eval_steps_per_second": 4.166, "step": 82500 }, { "epoch": 12.621654501216545, "grad_norm": 1.7512730360031128, "learning_rate": 3.173318985201379e-06, "loss": 2.21, "step": 83000 }, { "epoch": 12.621654501216545, "eval_loss": 2.1367809772491455, "eval_runtime": 394.4888, "eval_samples_per_second": 1066.824, "eval_steps_per_second": 4.167, "step": 83000 }, { "epoch": 12.697688564476886, "grad_norm": 1.7279080152511597, "learning_rate": 2.9780734823130846e-06, "loss": 2.2014, "step": 83500 }, { "epoch": 12.697688564476886, "eval_loss": 2.136183500289917, "eval_runtime": 394.5466, "eval_samples_per_second": 1066.667, "eval_steps_per_second": 4.167, "step": 83500 }, { "epoch": 12.773722627737227, "grad_norm": 1.7061643600463867, "learning_rate": 2.7886459518572467e-06, "loss": 2.2073, "step": 84000 }, { "epoch": 12.773722627737227, "eval_loss": 2.136634111404419, "eval_runtime": 394.4488, "eval_samples_per_second": 1066.932, "eval_steps_per_second": 4.168, "step": 84000 }, { "epoch": 12.849756690997566, "grad_norm": 1.6525273323059082, "learning_rate": 2.6050864386902433e-06, "loss": 2.2062, "step": 84500 }, { "epoch": 12.849756690997566, "eval_loss": 2.135418653488159, "eval_runtime": 394.6522, "eval_samples_per_second": 1066.382, "eval_steps_per_second": 4.166, "step": 84500 }, { "epoch": 12.925790754257907, "grad_norm": 1.753316879272461, "learning_rate": 2.4274434373970757e-06, "loss": 2.1969, "step": 85000 }, { "epoch": 12.925790754257907, "eval_loss": 2.130448579788208, "eval_runtime": 394.5649, "eval_samples_per_second": 1066.618, "eval_steps_per_second": 4.167, "step": 85000 }, { "epoch": 13.001824817518248, "grad_norm": 1.5890535116195679, "learning_rate": 2.256101256668691e-06, "loss": 2.2078, "step": 85500 }, { "epoch": 13.001824817518248, "eval_loss": 2.1335136890411377, "eval_runtime": 394.3918, "eval_samples_per_second": 1067.086, "eval_steps_per_second": 4.168, "step": 85500 }, { "epoch": 13.077858880778589, "grad_norm": 1.7298823595046997, "learning_rate": 2.0904184363357256e-06, "loss": 2.203, "step": 86000 }, { "epoch": 13.077858880778589, "eval_loss": 2.132927894592285, "eval_runtime": 394.4131, "eval_samples_per_second": 1067.029, "eval_steps_per_second": 4.168, "step": 86000 }, { "epoch": 13.15389294403893, "grad_norm": 1.7888143062591553, "learning_rate": 1.930788098008321e-06, "loss": 2.1993, "step": 86500 }, { "epoch": 13.15389294403893, "eval_loss": 2.1313769817352295, "eval_runtime": 394.3014, "eval_samples_per_second": 1067.331, "eval_steps_per_second": 4.169, "step": 86500 }, { "epoch": 13.22992700729927, "grad_norm": 1.7427315711975098, "learning_rate": 1.7772524144231473e-06, "loss": 2.2032, "step": 87000 }, { "epoch": 13.22992700729927, "eval_loss": 2.135279893875122, "eval_runtime": 394.1525, "eval_samples_per_second": 1067.734, "eval_steps_per_second": 4.171, "step": 87000 }, { "epoch": 13.305961070559611, "grad_norm": 1.700643539428711, "learning_rate": 1.6298519481701192e-06, "loss": 2.2, "step": 87500 }, { "epoch": 13.305961070559611, "eval_loss": 2.130155086517334, "eval_runtime": 393.7376, "eval_samples_per_second": 1068.859, "eval_steps_per_second": 4.175, "step": 87500 }, { "epoch": 13.38199513381995, "grad_norm": 1.6336027383804321, "learning_rate": 1.4889019067080928e-06, "loss": 2.1964, "step": 88000 }, { "epoch": 13.38199513381995, "eval_loss": 2.129770517349243, "eval_runtime": 394.1127, "eval_samples_per_second": 1067.842, "eval_steps_per_second": 4.171, "step": 88000 }, { "epoch": 13.458029197080291, "grad_norm": 1.698116421699524, "learning_rate": 1.3538746100630939e-06, "loss": 2.1957, "step": 88500 }, { "epoch": 13.458029197080291, "eval_loss": 2.1296403408050537, "eval_runtime": 394.7051, "eval_samples_per_second": 1066.239, "eval_steps_per_second": 4.165, "step": 88500 }, { "epoch": 13.534063260340632, "grad_norm": 1.7204720973968506, "learning_rate": 1.2250943829259454e-06, "loss": 2.1985, "step": 89000 }, { "epoch": 13.534063260340632, "eval_loss": 2.131389856338501, "eval_runtime": 394.7347, "eval_samples_per_second": 1066.159, "eval_steps_per_second": 4.165, "step": 89000 }, { "epoch": 13.610097323600973, "grad_norm": 1.7444037199020386, "learning_rate": 1.102595247742902e-06, "loss": 2.1967, "step": 89500 }, { "epoch": 13.610097323600973, "eval_loss": 2.13096284866333, "eval_runtime": 394.695, "eval_samples_per_second": 1066.266, "eval_steps_per_second": 4.165, "step": 89500 }, { "epoch": 13.686131386861314, "grad_norm": 1.7652897834777832, "learning_rate": 9.864095675586272e-07, "loss": 2.1979, "step": 90000 }, { "epoch": 13.686131386861314, "eval_loss": 2.1287431716918945, "eval_runtime": 394.6791, "eval_samples_per_second": 1066.309, "eval_steps_per_second": 4.165, "step": 90000 }, { "epoch": 13.762165450121655, "grad_norm": 1.6986685991287231, "learning_rate": 8.765680374662105e-07, "loss": 2.2055, "step": 90500 }, { "epoch": 13.762165450121655, "eval_loss": 2.128450870513916, "eval_runtime": 394.7254, "eval_samples_per_second": 1066.184, "eval_steps_per_second": 4.165, "step": 90500 }, { "epoch": 13.838199513381996, "grad_norm": 1.7826683521270752, "learning_rate": 7.730996764978071e-07, "loss": 2.1933, "step": 91000 }, { "epoch": 13.838199513381996, "eval_loss": 2.128603935241699, "eval_runtime": 394.6725, "eval_samples_per_second": 1066.327, "eval_steps_per_second": 4.165, "step": 91000 }, { "epoch": 13.914233576642335, "grad_norm": 1.7597603797912598, "learning_rate": 6.76031819958145e-07, "loss": 2.1945, "step": 91500 }, { "epoch": 13.914233576642335, "eval_loss": 2.1281092166900635, "eval_runtime": 394.6346, "eval_samples_per_second": 1066.43, "eval_steps_per_second": 4.166, "step": 91500 }, { "epoch": 13.990267639902676, "grad_norm": 1.5649290084838867, "learning_rate": 5.855649661219098e-07, "loss": 2.2016, "step": 92000 }, { "epoch": 13.990267639902676, "eval_loss": 2.129279613494873, "eval_runtime": 394.55, "eval_samples_per_second": 1066.658, "eval_steps_per_second": 4.167, "step": 92000 }, { "epoch": 14.066301703163017, "grad_norm": 1.6939290761947632, "learning_rate": 5.013604308242548e-07, "loss": 2.195, "step": 92500 }, { "epoch": 14.066301703163017, "eval_loss": 2.1266942024230957, "eval_runtime": 394.5988, "eval_samples_per_second": 1066.526, "eval_steps_per_second": 4.166, "step": 92500 }, { "epoch": 14.142335766423358, "grad_norm": 1.6481035947799683, "learning_rate": 4.236281907425227e-07, "loss": 2.1939, "step": 93000 }, { "epoch": 14.142335766423358, "eval_loss": 2.1291019916534424, "eval_runtime": 393.3337, "eval_samples_per_second": 1069.957, "eval_steps_per_second": 4.18, "step": 93000 }, { "epoch": 14.218369829683699, "grad_norm": 1.7540963888168335, "learning_rate": 3.523887819560451e-07, "loss": 2.1939, "step": 93500 }, { "epoch": 14.218369829683699, "eval_loss": 2.130265474319458, "eval_runtime": 393.7198, "eval_samples_per_second": 1068.907, "eval_steps_per_second": 4.176, "step": 93500 }, { "epoch": 14.29440389294404, "grad_norm": 1.7240368127822876, "learning_rate": 2.876610252031453e-07, "loss": 2.1907, "step": 94000 }, { "epoch": 14.29440389294404, "eval_loss": 2.126887321472168, "eval_runtime": 393.7098, "eval_samples_per_second": 1068.934, "eval_steps_per_second": 4.176, "step": 94000 }, { "epoch": 14.37043795620438, "grad_norm": 1.6906523704528809, "learning_rate": 2.2946202090889657e-07, "loss": 2.1999, "step": 94500 }, { "epoch": 14.37043795620438, "eval_loss": 2.126722812652588, "eval_runtime": 393.685, "eval_samples_per_second": 1069.002, "eval_steps_per_second": 4.176, "step": 94500 }, { "epoch": 14.44647201946472, "grad_norm": 1.7347662448883057, "learning_rate": 1.7790391402128793e-07, "loss": 2.1989, "step": 95000 }, { "epoch": 14.44647201946472, "eval_loss": 2.1272239685058594, "eval_runtime": 393.5995, "eval_samples_per_second": 1069.234, "eval_steps_per_second": 4.177, "step": 95000 }, { "epoch": 14.52250608272506, "grad_norm": 1.64090096950531, "learning_rate": 1.327936845155059e-07, "loss": 2.1963, "step": 95500 }, { "epoch": 14.52250608272506, "eval_loss": 2.126425266265869, "eval_runtime": 394.5292, "eval_samples_per_second": 1066.715, "eval_steps_per_second": 4.167, "step": 95500 }, { "epoch": 14.598540145985401, "grad_norm": 1.6597987413406372, "learning_rate": 9.425312186875923e-08, "loss": 2.1987, "step": 96000 }, { "epoch": 14.598540145985401, "eval_loss": 2.1285743713378906, "eval_runtime": 394.6211, "eval_samples_per_second": 1066.466, "eval_steps_per_second": 4.166, "step": 96000 }, { "epoch": 14.674574209245742, "grad_norm": 1.6827759742736816, "learning_rate": 6.2292408111711e-08, "loss": 2.2012, "step": 96500 }, { "epoch": 14.674574209245742, "eval_loss": 2.1267669200897217, "eval_runtime": 394.6661, "eval_samples_per_second": 1066.344, "eval_steps_per_second": 4.166, "step": 96500 }, { "epoch": 14.750608272506083, "grad_norm": 1.9470024108886719, "learning_rate": 3.691998694484722e-08, "loss": 2.2013, "step": 97000 }, { "epoch": 14.750608272506083, "eval_loss": 2.128140449523926, "eval_runtime": 394.6676, "eval_samples_per_second": 1066.34, "eval_steps_per_second": 4.166, "step": 97000 }, { "epoch": 14.826642335766424, "grad_norm": 1.6369675397872925, "learning_rate": 1.817353096532637e-08, "loss": 2.1923, "step": 97500 }, { "epoch": 14.826642335766424, "eval_loss": 2.128028392791748, "eval_runtime": 394.6764, "eval_samples_per_second": 1066.317, "eval_steps_per_second": 4.165, "step": 97500 }, { "epoch": 14.902676399026763, "grad_norm": 1.7755557298660278, "learning_rate": 5.982858360498167e-09, "loss": 2.1966, "step": 98000 }, { "epoch": 14.902676399026763, "eval_loss": 2.1286511421203613, "eval_runtime": 393.4618, "eval_samples_per_second": 1069.608, "eval_steps_per_second": 4.178, "step": 98000 }, { "epoch": 14.978710462287104, "grad_norm": 1.7456624507904053, "learning_rate": 3.953547649482303e-10, "loss": 2.1987, "step": 98500 }, { "epoch": 14.978710462287104, "eval_loss": 2.127889394760132, "eval_runtime": 393.3437, "eval_samples_per_second": 1069.93, "eval_steps_per_second": 4.18, "step": 98500 } ], "logging_steps": 500, "max_steps": 98640, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.646405662995644e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }