{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.816798539257456, "eval_steps": 500, "global_step": 56000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012172854534388313, "grad_norm": 21.32210922241211, "learning_rate": 4.85e-06, "loss": 3.6457, "step": 100 }, { "epoch": 0.024345709068776627, "grad_norm": 17.6686954498291, "learning_rate": 9.85e-06, "loss": 3.3243, "step": 200 }, { "epoch": 0.036518563603164945, "grad_norm": 17.45920181274414, "learning_rate": 1.4850000000000002e-05, "loss": 3.2128, "step": 300 }, { "epoch": 0.048691418137553254, "grad_norm": 18.293773651123047, "learning_rate": 1.985e-05, "loss": 3.124, "step": 400 }, { "epoch": 0.06086427267194157, "grad_norm": 15.793401718139648, "learning_rate": 1.995650224215247e-05, "loss": 3.1209, "step": 500 }, { "epoch": 0.06086427267194157, "eval_loss": 3.249819278717041, "eval_runtime": 6.941, "eval_samples_per_second": 144.072, "eval_steps_per_second": 36.018, "step": 500 }, { "epoch": 0.07303712720632989, "grad_norm": 13.932258605957031, "learning_rate": 1.9911659192825115e-05, "loss": 3.094, "step": 600 }, { "epoch": 0.0852099817407182, "grad_norm": 11.793479919433594, "learning_rate": 1.986681614349776e-05, "loss": 3.0426, "step": 700 }, { "epoch": 0.09738283627510651, "grad_norm": 11.373984336853027, "learning_rate": 1.9821973094170406e-05, "loss": 3.0645, "step": 800 }, { "epoch": 0.10955569080949483, "grad_norm": 10.407483100891113, "learning_rate": 1.9777130044843052e-05, "loss": 3.0681, "step": 900 }, { "epoch": 0.12172854534388314, "grad_norm": 9.600470542907715, "learning_rate": 1.9732286995515698e-05, "loss": 3.0599, "step": 1000 }, { "epoch": 0.12172854534388314, "eval_loss": 3.157822847366333, "eval_runtime": 6.8366, "eval_samples_per_second": 146.272, "eval_steps_per_second": 36.568, "step": 1000 }, { "epoch": 0.13390139987827146, "grad_norm": 10.010004043579102, "learning_rate": 1.9687443946188343e-05, "loss": 3.0379, "step": 1100 }, { "epoch": 0.14607425441265978, "grad_norm": 9.130040168762207, "learning_rate": 1.964260089686099e-05, "loss": 2.9859, "step": 1200 }, { "epoch": 0.15824710894704808, "grad_norm": 8.330909729003906, "learning_rate": 1.9597757847533635e-05, "loss": 3.0213, "step": 1300 }, { "epoch": 0.1704199634814364, "grad_norm": 7.502275466918945, "learning_rate": 1.955291479820628e-05, "loss": 3.0415, "step": 1400 }, { "epoch": 0.18259281801582472, "grad_norm": 7.305887222290039, "learning_rate": 1.9508071748878926e-05, "loss": 2.966, "step": 1500 }, { "epoch": 0.18259281801582472, "eval_loss": 3.091937303543091, "eval_runtime": 6.9209, "eval_samples_per_second": 144.491, "eval_steps_per_second": 36.123, "step": 1500 }, { "epoch": 0.19476567255021301, "grad_norm": 8.190788269042969, "learning_rate": 1.9463228699551572e-05, "loss": 2.9814, "step": 1600 }, { "epoch": 0.20693852708460134, "grad_norm": 7.867215633392334, "learning_rate": 1.9418385650224218e-05, "loss": 2.9614, "step": 1700 }, { "epoch": 0.21911138161898966, "grad_norm": 7.410882472991943, "learning_rate": 1.9373542600896864e-05, "loss": 2.9515, "step": 1800 }, { "epoch": 0.23128423615337795, "grad_norm": 6.388878345489502, "learning_rate": 1.9328699551569506e-05, "loss": 2.915, "step": 1900 }, { "epoch": 0.24345709068776628, "grad_norm": 6.401773452758789, "learning_rate": 1.928385650224215e-05, "loss": 2.942, "step": 2000 }, { "epoch": 0.24345709068776628, "eval_loss": 3.0528335571289062, "eval_runtime": 6.9438, "eval_samples_per_second": 144.014, "eval_steps_per_second": 36.004, "step": 2000 }, { "epoch": 0.2556299452221546, "grad_norm": 6.346031665802002, "learning_rate": 1.9239013452914797e-05, "loss": 2.952, "step": 2100 }, { "epoch": 0.2678027997565429, "grad_norm": 7.141861438751221, "learning_rate": 1.9194170403587446e-05, "loss": 2.9309, "step": 2200 }, { "epoch": 0.27997565429093124, "grad_norm": 7.175647735595703, "learning_rate": 1.9149327354260092e-05, "loss": 2.9315, "step": 2300 }, { "epoch": 0.29214850882531956, "grad_norm": 5.47502326965332, "learning_rate": 1.9104484304932738e-05, "loss": 2.944, "step": 2400 }, { "epoch": 0.30432136335970783, "grad_norm": 6.102653980255127, "learning_rate": 1.9059641255605384e-05, "loss": 2.8639, "step": 2500 }, { "epoch": 0.30432136335970783, "eval_loss": 3.0088276863098145, "eval_runtime": 6.9657, "eval_samples_per_second": 143.56, "eval_steps_per_second": 35.89, "step": 2500 }, { "epoch": 0.31649421789409615, "grad_norm": 6.21509313583374, "learning_rate": 1.901479820627803e-05, "loss": 2.8462, "step": 2600 }, { "epoch": 0.3286670724284845, "grad_norm": 7.218765735626221, "learning_rate": 1.8969955156950675e-05, "loss": 2.8849, "step": 2700 }, { "epoch": 0.3408399269628728, "grad_norm": 6.037746429443359, "learning_rate": 1.892511210762332e-05, "loss": 2.894, "step": 2800 }, { "epoch": 0.3530127814972611, "grad_norm": 5.483625411987305, "learning_rate": 1.8880269058295967e-05, "loss": 2.8988, "step": 2900 }, { "epoch": 0.36518563603164944, "grad_norm": 4.460190296173096, "learning_rate": 1.8835426008968612e-05, "loss": 2.8909, "step": 3000 }, { "epoch": 0.36518563603164944, "eval_loss": 2.9809019565582275, "eval_runtime": 6.9067, "eval_samples_per_second": 144.787, "eval_steps_per_second": 36.197, "step": 3000 }, { "epoch": 0.37735849056603776, "grad_norm": 5.2231125831604, "learning_rate": 1.8790582959641258e-05, "loss": 2.894, "step": 3100 }, { "epoch": 0.38953134510042603, "grad_norm": 5.9949517250061035, "learning_rate": 1.8745739910313904e-05, "loss": 2.8816, "step": 3200 }, { "epoch": 0.40170419963481435, "grad_norm": 5.3864054679870605, "learning_rate": 1.870089686098655e-05, "loss": 2.863, "step": 3300 }, { "epoch": 0.4138770541692027, "grad_norm": 6.138455390930176, "learning_rate": 1.8656053811659195e-05, "loss": 2.8577, "step": 3400 }, { "epoch": 0.426049908703591, "grad_norm": 5.222280025482178, "learning_rate": 1.861121076233184e-05, "loss": 2.901, "step": 3500 }, { "epoch": 0.426049908703591, "eval_loss": 2.944925308227539, "eval_runtime": 6.9152, "eval_samples_per_second": 144.608, "eval_steps_per_second": 36.152, "step": 3500 }, { "epoch": 0.4382227632379793, "grad_norm": 4.749873638153076, "learning_rate": 1.8566367713004487e-05, "loss": 2.8628, "step": 3600 }, { "epoch": 0.45039561777236764, "grad_norm": 4.7014336585998535, "learning_rate": 1.852152466367713e-05, "loss": 2.8418, "step": 3700 }, { "epoch": 0.4625684723067559, "grad_norm": 5.343926429748535, "learning_rate": 1.8476681614349775e-05, "loss": 2.9097, "step": 3800 }, { "epoch": 0.47474132684114423, "grad_norm": 5.276562690734863, "learning_rate": 1.8431838565022424e-05, "loss": 2.8659, "step": 3900 }, { "epoch": 0.48691418137553255, "grad_norm": 5.228163242340088, "learning_rate": 1.838699551569507e-05, "loss": 2.8497, "step": 4000 }, { "epoch": 0.48691418137553255, "eval_loss": 2.9137816429138184, "eval_runtime": 6.8289, "eval_samples_per_second": 146.437, "eval_steps_per_second": 36.609, "step": 4000 }, { "epoch": 0.4990870359099209, "grad_norm": 5.291093826293945, "learning_rate": 1.8342600896860988e-05, "loss": 2.8562, "step": 4100 }, { "epoch": 0.5112598904443092, "grad_norm": 5.388160705566406, "learning_rate": 1.8297757847533634e-05, "loss": 2.87, "step": 4200 }, { "epoch": 0.5234327449786975, "grad_norm": 5.260839939117432, "learning_rate": 1.825291479820628e-05, "loss": 2.8755, "step": 4300 }, { "epoch": 0.5356055995130858, "grad_norm": 5.170462131500244, "learning_rate": 1.8208071748878925e-05, "loss": 2.8342, "step": 4400 }, { "epoch": 0.5477784540474742, "grad_norm": 4.9179582595825195, "learning_rate": 1.816322869955157e-05, "loss": 2.8494, "step": 4500 }, { "epoch": 0.5477784540474742, "eval_loss": 2.886016607284546, "eval_runtime": 6.8492, "eval_samples_per_second": 146.002, "eval_steps_per_second": 36.5, "step": 4500 }, { "epoch": 0.5599513085818625, "grad_norm": 5.140480041503906, "learning_rate": 1.8118385650224217e-05, "loss": 2.8659, "step": 4600 }, { "epoch": 0.5721241631162508, "grad_norm": 5.088667869567871, "learning_rate": 1.8073542600896862e-05, "loss": 2.8228, "step": 4700 }, { "epoch": 0.5842970176506391, "grad_norm": 4.764868259429932, "learning_rate": 1.8028699551569508e-05, "loss": 2.8455, "step": 4800 }, { "epoch": 0.5964698721850273, "grad_norm": 4.458358287811279, "learning_rate": 1.7983856502242154e-05, "loss": 2.8196, "step": 4900 }, { "epoch": 0.6086427267194157, "grad_norm": 5.425631999969482, "learning_rate": 1.79390134529148e-05, "loss": 2.8247, "step": 5000 }, { "epoch": 0.6086427267194157, "eval_loss": 2.85610294342041, "eval_runtime": 6.9206, "eval_samples_per_second": 144.495, "eval_steps_per_second": 36.124, "step": 5000 }, { "epoch": 0.620815581253804, "grad_norm": 4.651830196380615, "learning_rate": 1.7894170403587445e-05, "loss": 2.8296, "step": 5100 }, { "epoch": 0.6329884357881923, "grad_norm": 5.064242839813232, "learning_rate": 1.784932735426009e-05, "loss": 2.8446, "step": 5200 }, { "epoch": 0.6451612903225806, "grad_norm": 5.3180413246154785, "learning_rate": 1.7804484304932737e-05, "loss": 2.7944, "step": 5300 }, { "epoch": 0.657334144856969, "grad_norm": 4.934672832489014, "learning_rate": 1.7759641255605383e-05, "loss": 2.7975, "step": 5400 }, { "epoch": 0.6695069993913573, "grad_norm": 5.154861927032471, "learning_rate": 1.7714798206278028e-05, "loss": 2.8144, "step": 5500 }, { "epoch": 0.6695069993913573, "eval_loss": 2.831345319747925, "eval_runtime": 6.9102, "eval_samples_per_second": 144.714, "eval_steps_per_second": 36.179, "step": 5500 }, { "epoch": 0.6816798539257456, "grad_norm": 5.322381973266602, "learning_rate": 1.7669955156950674e-05, "loss": 2.8196, "step": 5600 }, { "epoch": 0.6938527084601339, "grad_norm": 4.949143886566162, "learning_rate": 1.762511210762332e-05, "loss": 2.8154, "step": 5700 }, { "epoch": 0.7060255629945222, "grad_norm": 4.853809356689453, "learning_rate": 1.7580269058295965e-05, "loss": 2.8085, "step": 5800 }, { "epoch": 0.7181984175289106, "grad_norm": 4.941267490386963, "learning_rate": 1.753542600896861e-05, "loss": 2.7982, "step": 5900 }, { "epoch": 0.7303712720632989, "grad_norm": 4.971885681152344, "learning_rate": 1.7490582959641257e-05, "loss": 2.8049, "step": 6000 }, { "epoch": 0.7303712720632989, "eval_loss": 2.8138246536254883, "eval_runtime": 6.8576, "eval_samples_per_second": 145.824, "eval_steps_per_second": 36.456, "step": 6000 }, { "epoch": 0.7425441265976872, "grad_norm": 4.718198776245117, "learning_rate": 1.7445739910313903e-05, "loss": 2.7546, "step": 6100 }, { "epoch": 0.7547169811320755, "grad_norm": 5.367305278778076, "learning_rate": 1.740089686098655e-05, "loss": 2.7714, "step": 6200 }, { "epoch": 0.7668898356664637, "grad_norm": 4.827259063720703, "learning_rate": 1.7356053811659194e-05, "loss": 2.8043, "step": 6300 }, { "epoch": 0.7790626902008521, "grad_norm": 5.011576175689697, "learning_rate": 1.731121076233184e-05, "loss": 2.7859, "step": 6400 }, { "epoch": 0.7912355447352404, "grad_norm": 5.363623142242432, "learning_rate": 1.7266816143497758e-05, "loss": 2.8161, "step": 6500 }, { "epoch": 0.7912355447352404, "eval_loss": 2.791551113128662, "eval_runtime": 6.8881, "eval_samples_per_second": 145.177, "eval_steps_per_second": 36.294, "step": 6500 }, { "epoch": 0.8034083992696287, "grad_norm": 4.721231937408447, "learning_rate": 1.7221973094170404e-05, "loss": 2.7857, "step": 6600 }, { "epoch": 0.815581253804017, "grad_norm": 4.657351016998291, "learning_rate": 1.717713004484305e-05, "loss": 2.7734, "step": 6700 }, { "epoch": 0.8277541083384053, "grad_norm": 4.4942145347595215, "learning_rate": 1.7132286995515695e-05, "loss": 2.7885, "step": 6800 }, { "epoch": 0.8399269628727937, "grad_norm": 5.061729431152344, "learning_rate": 1.708744394618834e-05, "loss": 2.7841, "step": 6900 }, { "epoch": 0.852099817407182, "grad_norm": 4.816007137298584, "learning_rate": 1.7042600896860987e-05, "loss": 2.741, "step": 7000 }, { "epoch": 0.852099817407182, "eval_loss": 2.7756857872009277, "eval_runtime": 6.8679, "eval_samples_per_second": 145.605, "eval_steps_per_second": 36.401, "step": 7000 }, { "epoch": 0.8642726719415703, "grad_norm": 5.255375385284424, "learning_rate": 1.6997757847533633e-05, "loss": 2.7561, "step": 7100 }, { "epoch": 0.8764455264759586, "grad_norm": 4.844815254211426, "learning_rate": 1.695291479820628e-05, "loss": 2.7558, "step": 7200 }, { "epoch": 0.888618381010347, "grad_norm": 4.8912224769592285, "learning_rate": 1.6908071748878924e-05, "loss": 2.7512, "step": 7300 }, { "epoch": 0.9007912355447353, "grad_norm": 4.5775017738342285, "learning_rate": 1.686322869955157e-05, "loss": 2.745, "step": 7400 }, { "epoch": 0.9129640900791236, "grad_norm": 4.753942012786865, "learning_rate": 1.6818385650224216e-05, "loss": 2.7173, "step": 7500 }, { "epoch": 0.9129640900791236, "eval_loss": 2.7591283321380615, "eval_runtime": 6.877, "eval_samples_per_second": 145.412, "eval_steps_per_second": 36.353, "step": 7500 }, { "epoch": 0.9251369446135118, "grad_norm": 5.192244052886963, "learning_rate": 1.677354260089686e-05, "loss": 2.7373, "step": 7600 }, { "epoch": 0.9373097991479001, "grad_norm": 4.5390801429748535, "learning_rate": 1.6728699551569507e-05, "loss": 2.7654, "step": 7700 }, { "epoch": 0.9494826536822885, "grad_norm": 5.091897487640381, "learning_rate": 1.6683856502242153e-05, "loss": 2.7615, "step": 7800 }, { "epoch": 0.9616555082166768, "grad_norm": 4.253417015075684, "learning_rate": 1.6639013452914802e-05, "loss": 2.7521, "step": 7900 }, { "epoch": 0.9738283627510651, "grad_norm": 4.891059875488281, "learning_rate": 1.6594170403587448e-05, "loss": 2.7665, "step": 8000 }, { "epoch": 0.9738283627510651, "eval_loss": 2.7409751415252686, "eval_runtime": 6.8856, "eval_samples_per_second": 145.23, "eval_steps_per_second": 36.308, "step": 8000 }, { "epoch": 0.9860012172854534, "grad_norm": 4.412657260894775, "learning_rate": 1.6549327354260093e-05, "loss": 2.7471, "step": 8100 }, { "epoch": 0.9981740718198417, "grad_norm": 5.708240509033203, "learning_rate": 1.650448430493274e-05, "loss": 2.7545, "step": 8200 }, { "epoch": 1.01034692635423, "grad_norm": 4.956757068634033, "learning_rate": 1.645964125560538e-05, "loss": 2.6015, "step": 8300 }, { "epoch": 1.0225197808886184, "grad_norm": 5.220682621002197, "learning_rate": 1.6414798206278027e-05, "loss": 2.6077, "step": 8400 }, { "epoch": 1.0346926354230066, "grad_norm": 5.160597801208496, "learning_rate": 1.6369955156950673e-05, "loss": 2.5857, "step": 8500 }, { "epoch": 1.0346926354230066, "eval_loss": 2.7148427963256836, "eval_runtime": 6.8912, "eval_samples_per_second": 145.113, "eval_steps_per_second": 36.278, "step": 8500 }, { "epoch": 1.046865489957395, "grad_norm": 5.304019927978516, "learning_rate": 1.6325560538116595e-05, "loss": 2.5738, "step": 8600 }, { "epoch": 1.0590383444917832, "grad_norm": 5.3433637619018555, "learning_rate": 1.628071748878924e-05, "loss": 2.5499, "step": 8700 }, { "epoch": 1.0712111990261717, "grad_norm": 4.527110576629639, "learning_rate": 1.6235874439461886e-05, "loss": 2.6156, "step": 8800 }, { "epoch": 1.08338405356056, "grad_norm": 5.513104438781738, "learning_rate": 1.6191031390134532e-05, "loss": 2.6217, "step": 8900 }, { "epoch": 1.0955569080949483, "grad_norm": 5.579029083251953, "learning_rate": 1.6146188340807178e-05, "loss": 2.5829, "step": 9000 }, { "epoch": 1.0955569080949483, "eval_loss": 2.6865806579589844, "eval_runtime": 6.8369, "eval_samples_per_second": 146.265, "eval_steps_per_second": 36.566, "step": 9000 }, { "epoch": 1.1077297626293365, "grad_norm": 4.849677562713623, "learning_rate": 1.6101345291479823e-05, "loss": 2.6253, "step": 9100 }, { "epoch": 1.119902617163725, "grad_norm": 5.025945663452148, "learning_rate": 1.605650224215247e-05, "loss": 2.5725, "step": 9200 }, { "epoch": 1.1320754716981132, "grad_norm": 5.991898059844971, "learning_rate": 1.601165919282511e-05, "loss": 2.5994, "step": 9300 }, { "epoch": 1.1442483262325016, "grad_norm": 4.980128765106201, "learning_rate": 1.5966816143497757e-05, "loss": 2.5645, "step": 9400 }, { "epoch": 1.1564211807668898, "grad_norm": 4.839084625244141, "learning_rate": 1.5921973094170403e-05, "loss": 2.5861, "step": 9500 }, { "epoch": 1.1564211807668898, "eval_loss": 2.6708385944366455, "eval_runtime": 6.8615, "eval_samples_per_second": 145.74, "eval_steps_per_second": 36.435, "step": 9500 }, { "epoch": 1.168594035301278, "grad_norm": 5.058382511138916, "learning_rate": 1.587713004484305e-05, "loss": 2.5524, "step": 9600 }, { "epoch": 1.1807668898356665, "grad_norm": 4.867978572845459, "learning_rate": 1.5832286995515694e-05, "loss": 2.582, "step": 9700 }, { "epoch": 1.192939744370055, "grad_norm": 5.896303653717041, "learning_rate": 1.578744394618834e-05, "loss": 2.5899, "step": 9800 }, { "epoch": 1.205112598904443, "grad_norm": 4.735970497131348, "learning_rate": 1.574260089686099e-05, "loss": 2.5878, "step": 9900 }, { "epoch": 1.2172854534388313, "grad_norm": 4.8292670249938965, "learning_rate": 1.5697757847533635e-05, "loss": 2.6047, "step": 10000 }, { "epoch": 1.2172854534388313, "eval_loss": 2.65461802482605, "eval_runtime": 6.8819, "eval_samples_per_second": 145.309, "eval_steps_per_second": 36.327, "step": 10000 }, { "epoch": 1.2294583079732198, "grad_norm": 5.350712299346924, "learning_rate": 1.565291479820628e-05, "loss": 2.5777, "step": 10100 }, { "epoch": 1.241631162507608, "grad_norm": 5.471200466156006, "learning_rate": 1.5608071748878926e-05, "loss": 2.5908, "step": 10200 }, { "epoch": 1.2538040170419964, "grad_norm": 5.038080215454102, "learning_rate": 1.5563228699551572e-05, "loss": 2.5951, "step": 10300 }, { "epoch": 1.2659768715763846, "grad_norm": 4.982104778289795, "learning_rate": 1.5518385650224218e-05, "loss": 2.5461, "step": 10400 }, { "epoch": 1.278149726110773, "grad_norm": 4.736184120178223, "learning_rate": 1.5473542600896864e-05, "loss": 2.5874, "step": 10500 }, { "epoch": 1.278149726110773, "eval_loss": 2.6384053230285645, "eval_runtime": 6.8888, "eval_samples_per_second": 145.164, "eval_steps_per_second": 36.291, "step": 10500 }, { "epoch": 1.2903225806451613, "grad_norm": 5.710967540740967, "learning_rate": 1.5429147982062782e-05, "loss": 2.5818, "step": 10600 }, { "epoch": 1.3024954351795497, "grad_norm": 5.1653947830200195, "learning_rate": 1.5384304932735428e-05, "loss": 2.5916, "step": 10700 }, { "epoch": 1.314668289713938, "grad_norm": 5.706851959228516, "learning_rate": 1.5339461883408074e-05, "loss": 2.563, "step": 10800 }, { "epoch": 1.326841144248326, "grad_norm": 5.320187568664551, "learning_rate": 1.529461883408072e-05, "loss": 2.5657, "step": 10900 }, { "epoch": 1.3390139987827145, "grad_norm": 5.1567463874816895, "learning_rate": 1.5249775784753365e-05, "loss": 2.5362, "step": 11000 }, { "epoch": 1.3390139987827145, "eval_loss": 2.6256721019744873, "eval_runtime": 6.8781, "eval_samples_per_second": 145.389, "eval_steps_per_second": 36.347, "step": 11000 }, { "epoch": 1.351186853317103, "grad_norm": 5.355208396911621, "learning_rate": 1.520493273542601e-05, "loss": 2.5748, "step": 11100 }, { "epoch": 1.3633597078514912, "grad_norm": 4.878857612609863, "learning_rate": 1.5160089686098656e-05, "loss": 2.5768, "step": 11200 }, { "epoch": 1.3755325623858794, "grad_norm": 5.551296234130859, "learning_rate": 1.51152466367713e-05, "loss": 2.5616, "step": 11300 }, { "epoch": 1.3877054169202678, "grad_norm": 4.894459247589111, "learning_rate": 1.5070403587443946e-05, "loss": 2.5366, "step": 11400 }, { "epoch": 1.399878271454656, "grad_norm": 5.237545967102051, "learning_rate": 1.5025560538116592e-05, "loss": 2.5516, "step": 11500 }, { "epoch": 1.399878271454656, "eval_loss": 2.6034560203552246, "eval_runtime": 6.9038, "eval_samples_per_second": 144.848, "eval_steps_per_second": 36.212, "step": 11500 }, { "epoch": 1.4120511259890445, "grad_norm": 4.714597702026367, "learning_rate": 1.4980717488789238e-05, "loss": 2.5384, "step": 11600 }, { "epoch": 1.4242239805234327, "grad_norm": 4.776740550994873, "learning_rate": 1.4935874439461883e-05, "loss": 2.5733, "step": 11700 }, { "epoch": 1.4363968350578211, "grad_norm": 5.181590557098389, "learning_rate": 1.4891031390134529e-05, "loss": 2.5698, "step": 11800 }, { "epoch": 1.4485696895922093, "grad_norm": 4.948436737060547, "learning_rate": 1.4846188340807177e-05, "loss": 2.5288, "step": 11900 }, { "epoch": 1.4607425441265978, "grad_norm": 5.549213409423828, "learning_rate": 1.4801345291479822e-05, "loss": 2.5291, "step": 12000 }, { "epoch": 1.4607425441265978, "eval_loss": 2.5940563678741455, "eval_runtime": 6.8627, "eval_samples_per_second": 145.716, "eval_steps_per_second": 36.429, "step": 12000 }, { "epoch": 1.472915398660986, "grad_norm": 6.372870445251465, "learning_rate": 1.4756502242152468e-05, "loss": 2.5457, "step": 12100 }, { "epoch": 1.4850882531953742, "grad_norm": 5.433255195617676, "learning_rate": 1.4711659192825114e-05, "loss": 2.5521, "step": 12200 }, { "epoch": 1.4972611077297626, "grad_norm": 5.604691028594971, "learning_rate": 1.466681614349776e-05, "loss": 2.5585, "step": 12300 }, { "epoch": 1.509433962264151, "grad_norm": 5.348121643066406, "learning_rate": 1.4621973094170405e-05, "loss": 2.527, "step": 12400 }, { "epoch": 1.5216068167985393, "grad_norm": 4.68524694442749, "learning_rate": 1.4577130044843051e-05, "loss": 2.5351, "step": 12500 }, { "epoch": 1.5216068167985393, "eval_loss": 2.5787315368652344, "eval_runtime": 6.8521, "eval_samples_per_second": 145.94, "eval_steps_per_second": 36.485, "step": 12500 }, { "epoch": 1.5337796713329275, "grad_norm": 5.369399070739746, "learning_rate": 1.4532286995515697e-05, "loss": 2.5457, "step": 12600 }, { "epoch": 1.545952525867316, "grad_norm": 5.384763717651367, "learning_rate": 1.4487892376681615e-05, "loss": 2.5603, "step": 12700 }, { "epoch": 1.5581253804017043, "grad_norm": 5.1856369972229, "learning_rate": 1.4443049327354261e-05, "loss": 2.5531, "step": 12800 }, { "epoch": 1.5702982349360926, "grad_norm": 5.600665092468262, "learning_rate": 1.4398206278026907e-05, "loss": 2.5226, "step": 12900 }, { "epoch": 1.5824710894704808, "grad_norm": 5.185864448547363, "learning_rate": 1.4353363228699552e-05, "loss": 2.5585, "step": 13000 }, { "epoch": 1.5824710894704808, "eval_loss": 2.5597262382507324, "eval_runtime": 6.9067, "eval_samples_per_second": 144.787, "eval_steps_per_second": 36.197, "step": 13000 }, { "epoch": 1.5946439440048692, "grad_norm": 5.945424556732178, "learning_rate": 1.4308520179372198e-05, "loss": 2.5447, "step": 13100 }, { "epoch": 1.6068167985392574, "grad_norm": 4.447841167449951, "learning_rate": 1.4263677130044844e-05, "loss": 2.5638, "step": 13200 }, { "epoch": 1.6189896530736458, "grad_norm": 4.947375297546387, "learning_rate": 1.421883408071749e-05, "loss": 2.5245, "step": 13300 }, { "epoch": 1.631162507608034, "grad_norm": 5.11275053024292, "learning_rate": 1.4173991031390135e-05, "loss": 2.504, "step": 13400 }, { "epoch": 1.6433353621424223, "grad_norm": 5.144463539123535, "learning_rate": 1.4129147982062781e-05, "loss": 2.5517, "step": 13500 }, { "epoch": 1.6433353621424223, "eval_loss": 2.5378565788269043, "eval_runtime": 6.8542, "eval_samples_per_second": 145.896, "eval_steps_per_second": 36.474, "step": 13500 }, { "epoch": 1.6555082166768107, "grad_norm": 6.138312816619873, "learning_rate": 1.4084304932735427e-05, "loss": 2.5334, "step": 13600 }, { "epoch": 1.6676810712111991, "grad_norm": 4.641015529632568, "learning_rate": 1.4039461883408072e-05, "loss": 2.5692, "step": 13700 }, { "epoch": 1.6798539257455873, "grad_norm": 5.140405178070068, "learning_rate": 1.3994618834080718e-05, "loss": 2.5462, "step": 13800 }, { "epoch": 1.6920267802799756, "grad_norm": 5.093076705932617, "learning_rate": 1.3949775784753366e-05, "loss": 2.5227, "step": 13900 }, { "epoch": 1.704199634814364, "grad_norm": 5.549164772033691, "learning_rate": 1.3904932735426011e-05, "loss": 2.5469, "step": 14000 }, { "epoch": 1.704199634814364, "eval_loss": 2.5302209854125977, "eval_runtime": 6.8833, "eval_samples_per_second": 145.279, "eval_steps_per_second": 36.32, "step": 14000 }, { "epoch": 1.7163724893487524, "grad_norm": 5.112196922302246, "learning_rate": 1.3860089686098657e-05, "loss": 2.4753, "step": 14100 }, { "epoch": 1.7285453438831406, "grad_norm": 4.9223313331604, "learning_rate": 1.3815246636771303e-05, "loss": 2.5477, "step": 14200 }, { "epoch": 1.7407181984175288, "grad_norm": 5.270020484924316, "learning_rate": 1.3770403587443948e-05, "loss": 2.5141, "step": 14300 }, { "epoch": 1.7528910529519173, "grad_norm": 5.377967357635498, "learning_rate": 1.3725560538116594e-05, "loss": 2.5151, "step": 14400 }, { "epoch": 1.7650639074863055, "grad_norm": 4.732293605804443, "learning_rate": 1.368071748878924e-05, "loss": 2.559, "step": 14500 }, { "epoch": 1.7650639074863055, "eval_loss": 2.5161020755767822, "eval_runtime": 6.8279, "eval_samples_per_second": 146.457, "eval_steps_per_second": 36.614, "step": 14500 }, { "epoch": 1.777236762020694, "grad_norm": 5.2639241218566895, "learning_rate": 1.3635874439461884e-05, "loss": 2.5199, "step": 14600 }, { "epoch": 1.7894096165550821, "grad_norm": 5.222829818725586, "learning_rate": 1.3591479820627804e-05, "loss": 2.5122, "step": 14700 }, { "epoch": 1.8015824710894703, "grad_norm": 5.396998882293701, "learning_rate": 1.354663677130045e-05, "loss": 2.5665, "step": 14800 }, { "epoch": 1.8137553256238588, "grad_norm": 5.598328113555908, "learning_rate": 1.3501793721973096e-05, "loss": 2.5061, "step": 14900 }, { "epoch": 1.8259281801582472, "grad_norm": 4.519299507141113, "learning_rate": 1.3456950672645741e-05, "loss": 2.5173, "step": 15000 }, { "epoch": 1.8259281801582472, "eval_loss": 2.505549430847168, "eval_runtime": 6.8476, "eval_samples_per_second": 146.036, "eval_steps_per_second": 36.509, "step": 15000 }, { "epoch": 1.8381010346926354, "grad_norm": 5.07867431640625, "learning_rate": 1.3412107623318387e-05, "loss": 2.5085, "step": 15100 }, { "epoch": 1.8502738892270236, "grad_norm": 4.80793571472168, "learning_rate": 1.3367264573991033e-05, "loss": 2.5269, "step": 15200 }, { "epoch": 1.862446743761412, "grad_norm": 5.122992992401123, "learning_rate": 1.3322421524663679e-05, "loss": 2.5165, "step": 15300 }, { "epoch": 1.8746195982958005, "grad_norm": 5.070724010467529, "learning_rate": 1.3277578475336324e-05, "loss": 2.4733, "step": 15400 }, { "epoch": 1.8867924528301887, "grad_norm": 4.850822448730469, "learning_rate": 1.3232735426008968e-05, "loss": 2.5045, "step": 15500 }, { "epoch": 1.8867924528301887, "eval_loss": 2.49042010307312, "eval_runtime": 6.9202, "eval_samples_per_second": 144.505, "eval_steps_per_second": 36.126, "step": 15500 }, { "epoch": 1.898965307364577, "grad_norm": 5.182281494140625, "learning_rate": 1.3187892376681614e-05, "loss": 2.4858, "step": 15600 }, { "epoch": 1.9111381618989653, "grad_norm": 4.803709030151367, "learning_rate": 1.314304932735426e-05, "loss": 2.5043, "step": 15700 }, { "epoch": 1.9233110164333538, "grad_norm": 5.211897850036621, "learning_rate": 1.3098206278026905e-05, "loss": 2.4974, "step": 15800 }, { "epoch": 1.935483870967742, "grad_norm": 4.982048988342285, "learning_rate": 1.3053363228699553e-05, "loss": 2.4901, "step": 15900 }, { "epoch": 1.9476567255021302, "grad_norm": 5.34013557434082, "learning_rate": 1.3008520179372199e-05, "loss": 2.4938, "step": 16000 }, { "epoch": 1.9476567255021302, "eval_loss": 2.479241371154785, "eval_runtime": 6.9209, "eval_samples_per_second": 144.49, "eval_steps_per_second": 36.122, "step": 16000 }, { "epoch": 1.9598295800365184, "grad_norm": 4.926109790802002, "learning_rate": 1.2963677130044844e-05, "loss": 2.522, "step": 16100 }, { "epoch": 1.9720024345709068, "grad_norm": 5.252937316894531, "learning_rate": 1.291883408071749e-05, "loss": 2.4979, "step": 16200 }, { "epoch": 1.9841752891052953, "grad_norm": 4.676843166351318, "learning_rate": 1.2873991031390136e-05, "loss": 2.5011, "step": 16300 }, { "epoch": 1.9963481436396835, "grad_norm": 4.4982171058654785, "learning_rate": 1.2829147982062782e-05, "loss": 2.5232, "step": 16400 }, { "epoch": 2.0085209981740717, "grad_norm": 5.115514278411865, "learning_rate": 1.2784304932735427e-05, "loss": 2.4807, "step": 16500 }, { "epoch": 2.0085209981740717, "eval_loss": 2.4553143978118896, "eval_runtime": 6.8911, "eval_samples_per_second": 145.114, "eval_steps_per_second": 36.279, "step": 16500 }, { "epoch": 2.02069385270846, "grad_norm": 5.778520107269287, "learning_rate": 1.2739461883408073e-05, "loss": 2.3637, "step": 16600 }, { "epoch": 2.0328667072428486, "grad_norm": 4.936229705810547, "learning_rate": 1.2694618834080719e-05, "loss": 2.3936, "step": 16700 }, { "epoch": 2.045039561777237, "grad_norm": 6.013847827911377, "learning_rate": 1.2649775784753364e-05, "loss": 2.3953, "step": 16800 }, { "epoch": 2.057212416311625, "grad_norm": 6.078458786010742, "learning_rate": 1.2605381165919283e-05, "loss": 2.3312, "step": 16900 }, { "epoch": 2.069385270846013, "grad_norm": 5.697019100189209, "learning_rate": 1.2560538116591929e-05, "loss": 2.334, "step": 17000 }, { "epoch": 2.069385270846013, "eval_loss": 2.4449574947357178, "eval_runtime": 6.9363, "eval_samples_per_second": 144.169, "eval_steps_per_second": 36.042, "step": 17000 }, { "epoch": 2.081558125380402, "grad_norm": 5.652517795562744, "learning_rate": 1.2515695067264574e-05, "loss": 2.3902, "step": 17100 }, { "epoch": 2.09373097991479, "grad_norm": 6.007380485534668, "learning_rate": 1.247085201793722e-05, "loss": 2.3629, "step": 17200 }, { "epoch": 2.1059038344491783, "grad_norm": 5.070584774017334, "learning_rate": 1.2426008968609866e-05, "loss": 2.3523, "step": 17300 }, { "epoch": 2.1180766889835665, "grad_norm": 5.079153537750244, "learning_rate": 1.2381165919282512e-05, "loss": 2.3429, "step": 17400 }, { "epoch": 2.130249543517955, "grad_norm": 5.278266906738281, "learning_rate": 1.2336322869955157e-05, "loss": 2.2969, "step": 17500 }, { "epoch": 2.130249543517955, "eval_loss": 2.4217474460601807, "eval_runtime": 6.9637, "eval_samples_per_second": 143.601, "eval_steps_per_second": 35.9, "step": 17500 }, { "epoch": 2.1424223980523434, "grad_norm": 5.2419633865356445, "learning_rate": 1.2291479820627803e-05, "loss": 2.3671, "step": 17600 }, { "epoch": 2.1545952525867316, "grad_norm": 5.445255279541016, "learning_rate": 1.2246636771300449e-05, "loss": 2.3834, "step": 17700 }, { "epoch": 2.16676810712112, "grad_norm": 5.891075134277344, "learning_rate": 1.2201793721973095e-05, "loss": 2.36, "step": 17800 }, { "epoch": 2.178940961655508, "grad_norm": 5.8141865730285645, "learning_rate": 1.215695067264574e-05, "loss": 2.3596, "step": 17900 }, { "epoch": 2.1911138161898966, "grad_norm": 5.558561325073242, "learning_rate": 1.2112107623318388e-05, "loss": 2.3926, "step": 18000 }, { "epoch": 2.1911138161898966, "eval_loss": 2.415804624557495, "eval_runtime": 6.8469, "eval_samples_per_second": 146.052, "eval_steps_per_second": 36.513, "step": 18000 }, { "epoch": 2.203286670724285, "grad_norm": 5.968663692474365, "learning_rate": 1.2067264573991033e-05, "loss": 2.3609, "step": 18100 }, { "epoch": 2.215459525258673, "grad_norm": 5.241644382476807, "learning_rate": 1.2022421524663679e-05, "loss": 2.3634, "step": 18200 }, { "epoch": 2.2276323797930613, "grad_norm": 6.328832149505615, "learning_rate": 1.1977578475336325e-05, "loss": 2.3465, "step": 18300 }, { "epoch": 2.23980523432745, "grad_norm": 5.125701904296875, "learning_rate": 1.193273542600897e-05, "loss": 2.3171, "step": 18400 }, { "epoch": 2.251978088861838, "grad_norm": 4.962270259857178, "learning_rate": 1.1887892376681616e-05, "loss": 2.3739, "step": 18500 }, { "epoch": 2.251978088861838, "eval_loss": 2.4065887928009033, "eval_runtime": 6.9359, "eval_samples_per_second": 144.178, "eval_steps_per_second": 36.045, "step": 18500 }, { "epoch": 2.2641509433962264, "grad_norm": 5.895593643188477, "learning_rate": 1.1843049327354262e-05, "loss": 2.3656, "step": 18600 }, { "epoch": 2.2763237979306146, "grad_norm": 6.21762752532959, "learning_rate": 1.1798206278026906e-05, "loss": 2.3575, "step": 18700 }, { "epoch": 2.2884966524650032, "grad_norm": 5.935133934020996, "learning_rate": 1.1753363228699552e-05, "loss": 2.3687, "step": 18800 }, { "epoch": 2.3006695069993914, "grad_norm": 5.431483268737793, "learning_rate": 1.1708520179372198e-05, "loss": 2.3465, "step": 18900 }, { "epoch": 2.3128423615337796, "grad_norm": 6.319828510284424, "learning_rate": 1.1664125560538118e-05, "loss": 2.3659, "step": 19000 }, { "epoch": 2.3128423615337796, "eval_loss": 2.390819787979126, "eval_runtime": 6.9389, "eval_samples_per_second": 144.115, "eval_steps_per_second": 36.029, "step": 19000 }, { "epoch": 2.325015216068168, "grad_norm": 5.955752372741699, "learning_rate": 1.1619282511210763e-05, "loss": 2.3702, "step": 19100 }, { "epoch": 2.337188070602556, "grad_norm": 5.977270603179932, "learning_rate": 1.157443946188341e-05, "loss": 2.3935, "step": 19200 }, { "epoch": 2.3493609251369447, "grad_norm": 5.417830944061279, "learning_rate": 1.1529596412556055e-05, "loss": 2.359, "step": 19300 }, { "epoch": 2.361533779671333, "grad_norm": 5.452037334442139, "learning_rate": 1.14847533632287e-05, "loss": 2.3496, "step": 19400 }, { "epoch": 2.373706634205721, "grad_norm": 4.931158065795898, "learning_rate": 1.1439910313901346e-05, "loss": 2.3483, "step": 19500 }, { "epoch": 2.373706634205721, "eval_loss": 2.3805489540100098, "eval_runtime": 6.803, "eval_samples_per_second": 146.994, "eval_steps_per_second": 36.749, "step": 19500 }, { "epoch": 2.38587948874011, "grad_norm": 5.650387287139893, "learning_rate": 1.1395067264573992e-05, "loss": 2.3644, "step": 19600 }, { "epoch": 2.398052343274498, "grad_norm": 5.70589542388916, "learning_rate": 1.1350224215246636e-05, "loss": 2.3472, "step": 19700 }, { "epoch": 2.410225197808886, "grad_norm": 5.833774566650391, "learning_rate": 1.1305381165919282e-05, "loss": 2.3663, "step": 19800 }, { "epoch": 2.4223980523432744, "grad_norm": 5.079782485961914, "learning_rate": 1.1260538116591928e-05, "loss": 2.3726, "step": 19900 }, { "epoch": 2.4345709068776626, "grad_norm": 5.578153610229492, "learning_rate": 1.1215695067264577e-05, "loss": 2.3432, "step": 20000 }, { "epoch": 2.4345709068776626, "eval_loss": 2.3689472675323486, "eval_runtime": 6.993, "eval_samples_per_second": 143.0, "eval_steps_per_second": 35.75, "step": 20000 }, { "epoch": 2.4467437614120513, "grad_norm": 5.551452159881592, "learning_rate": 1.117085201793722e-05, "loss": 2.3658, "step": 20100 }, { "epoch": 2.4589166159464395, "grad_norm": 5.28959321975708, "learning_rate": 1.1126008968609866e-05, "loss": 2.3526, "step": 20200 }, { "epoch": 2.4710894704808277, "grad_norm": 5.358762741088867, "learning_rate": 1.1081165919282512e-05, "loss": 2.3161, "step": 20300 }, { "epoch": 2.483262325015216, "grad_norm": 5.633576393127441, "learning_rate": 1.1036322869955158e-05, "loss": 2.3778, "step": 20400 }, { "epoch": 2.495435179549604, "grad_norm": 5.258509635925293, "learning_rate": 1.0991479820627804e-05, "loss": 2.3538, "step": 20500 }, { "epoch": 2.495435179549604, "eval_loss": 2.358330488204956, "eval_runtime": 6.9369, "eval_samples_per_second": 144.156, "eval_steps_per_second": 36.039, "step": 20500 }, { "epoch": 2.507608034083993, "grad_norm": 5.632132053375244, "learning_rate": 1.094663677130045e-05, "loss": 2.3514, "step": 20600 }, { "epoch": 2.519780888618381, "grad_norm": 5.449893951416016, "learning_rate": 1.0901793721973095e-05, "loss": 2.3404, "step": 20700 }, { "epoch": 2.531953743152769, "grad_norm": 5.66605281829834, "learning_rate": 1.085695067264574e-05, "loss": 2.3335, "step": 20800 }, { "epoch": 2.544126597687158, "grad_norm": 6.729547500610352, "learning_rate": 1.0812107623318387e-05, "loss": 2.3784, "step": 20900 }, { "epoch": 2.556299452221546, "grad_norm": 5.5277581214904785, "learning_rate": 1.0767713004484305e-05, "loss": 2.3424, "step": 21000 }, { "epoch": 2.556299452221546, "eval_loss": 2.3434271812438965, "eval_runtime": 6.8629, "eval_samples_per_second": 145.712, "eval_steps_per_second": 36.428, "step": 21000 }, { "epoch": 2.5684723067559343, "grad_norm": 5.892464637756348, "learning_rate": 1.072286995515695e-05, "loss": 2.3577, "step": 21100 }, { "epoch": 2.5806451612903225, "grad_norm": 5.313469409942627, "learning_rate": 1.0678026905829597e-05, "loss": 2.3489, "step": 21200 }, { "epoch": 2.5928180158247107, "grad_norm": 5.569064140319824, "learning_rate": 1.0633183856502242e-05, "loss": 2.3828, "step": 21300 }, { "epoch": 2.6049908703590994, "grad_norm": 6.133281707763672, "learning_rate": 1.0588340807174888e-05, "loss": 2.3203, "step": 21400 }, { "epoch": 2.6171637248934876, "grad_norm": 5.569573402404785, "learning_rate": 1.0543497757847534e-05, "loss": 2.3508, "step": 21500 }, { "epoch": 2.6171637248934876, "eval_loss": 2.3320422172546387, "eval_runtime": 6.8431, "eval_samples_per_second": 146.132, "eval_steps_per_second": 36.533, "step": 21500 }, { "epoch": 2.629336579427876, "grad_norm": 5.560952186584473, "learning_rate": 1.049865470852018e-05, "loss": 2.3532, "step": 21600 }, { "epoch": 2.641509433962264, "grad_norm": 5.652987957000732, "learning_rate": 1.0453811659192825e-05, "loss": 2.3233, "step": 21700 }, { "epoch": 2.653682288496652, "grad_norm": 5.666792869567871, "learning_rate": 1.0408968609865471e-05, "loss": 2.353, "step": 21800 }, { "epoch": 2.665855143031041, "grad_norm": 5.652164936065674, "learning_rate": 1.0364125560538117e-05, "loss": 2.3483, "step": 21900 }, { "epoch": 2.678027997565429, "grad_norm": 5.158956527709961, "learning_rate": 1.0319282511210764e-05, "loss": 2.3344, "step": 22000 }, { "epoch": 2.678027997565429, "eval_loss": 2.3204360008239746, "eval_runtime": 6.8964, "eval_samples_per_second": 145.003, "eval_steps_per_second": 36.251, "step": 22000 }, { "epoch": 2.6902008520998173, "grad_norm": 4.993370056152344, "learning_rate": 1.027443946188341e-05, "loss": 2.3185, "step": 22100 }, { "epoch": 2.702373706634206, "grad_norm": 5.251499652862549, "learning_rate": 1.0229596412556056e-05, "loss": 2.3463, "step": 22200 }, { "epoch": 2.714546561168594, "grad_norm": 5.155273914337158, "learning_rate": 1.0184753363228701e-05, "loss": 2.3299, "step": 22300 }, { "epoch": 2.7267194157029824, "grad_norm": 4.445164680480957, "learning_rate": 1.0139910313901347e-05, "loss": 2.3368, "step": 22400 }, { "epoch": 2.7388922702373706, "grad_norm": 5.968411445617676, "learning_rate": 1.0095067264573993e-05, "loss": 2.321, "step": 22500 }, { "epoch": 2.7388922702373706, "eval_loss": 2.3084633350372314, "eval_runtime": 6.9774, "eval_samples_per_second": 143.32, "eval_steps_per_second": 35.83, "step": 22500 }, { "epoch": 2.751065124771759, "grad_norm": 5.2266364097595215, "learning_rate": 1.0050224215246638e-05, "loss": 2.3387, "step": 22600 }, { "epoch": 2.7632379793061475, "grad_norm": 5.649938583374023, "learning_rate": 1.0005381165919284e-05, "loss": 2.3388, "step": 22700 }, { "epoch": 2.7754108338405357, "grad_norm": 5.603872299194336, "learning_rate": 9.96053811659193e-06, "loss": 2.3331, "step": 22800 }, { "epoch": 2.787583688374924, "grad_norm": 5.831801891326904, "learning_rate": 9.915695067264574e-06, "loss": 2.3509, "step": 22900 }, { "epoch": 2.799756542909312, "grad_norm": 5.071148872375488, "learning_rate": 9.871300448430494e-06, "loss": 2.3296, "step": 23000 }, { "epoch": 2.799756542909312, "eval_loss": 2.298048257827759, "eval_runtime": 6.8909, "eval_samples_per_second": 145.119, "eval_steps_per_second": 36.28, "step": 23000 }, { "epoch": 2.8119293974437003, "grad_norm": 5.613708972930908, "learning_rate": 9.82645739910314e-06, "loss": 2.3458, "step": 23100 }, { "epoch": 2.824102251978089, "grad_norm": 6.964206218719482, "learning_rate": 9.781614349775786e-06, "loss": 2.3523, "step": 23200 }, { "epoch": 2.836275106512477, "grad_norm": 6.069615364074707, "learning_rate": 9.737219730941706e-06, "loss": 2.3364, "step": 23300 }, { "epoch": 2.8484479610468654, "grad_norm": 4.563328266143799, "learning_rate": 9.69237668161435e-06, "loss": 2.3164, "step": 23400 }, { "epoch": 2.860620815581254, "grad_norm": 5.069984436035156, "learning_rate": 9.647533632286995e-06, "loss": 2.3347, "step": 23500 }, { "epoch": 2.860620815581254, "eval_loss": 2.2902982234954834, "eval_runtime": 6.9027, "eval_samples_per_second": 144.87, "eval_steps_per_second": 36.218, "step": 23500 }, { "epoch": 2.8727936701156422, "grad_norm": 5.443928241729736, "learning_rate": 9.602690582959641e-06, "loss": 2.3211, "step": 23600 }, { "epoch": 2.8849665246500305, "grad_norm": 5.5851664543151855, "learning_rate": 9.557847533632287e-06, "loss": 2.3469, "step": 23700 }, { "epoch": 2.8971393791844187, "grad_norm": 5.386264324188232, "learning_rate": 9.513004484304934e-06, "loss": 2.3303, "step": 23800 }, { "epoch": 2.909312233718807, "grad_norm": 5.505928993225098, "learning_rate": 9.46816143497758e-06, "loss": 2.3396, "step": 23900 }, { "epoch": 2.9214850882531955, "grad_norm": 5.181743621826172, "learning_rate": 9.423318385650226e-06, "loss": 2.3214, "step": 24000 }, { "epoch": 2.9214850882531955, "eval_loss": 2.28114652633667, "eval_runtime": 6.8437, "eval_samples_per_second": 146.121, "eval_steps_per_second": 36.53, "step": 24000 }, { "epoch": 2.9336579427875837, "grad_norm": 6.292041301727295, "learning_rate": 9.378475336322872e-06, "loss": 2.3341, "step": 24100 }, { "epoch": 2.945830797321972, "grad_norm": 5.232330322265625, "learning_rate": 9.333632286995517e-06, "loss": 2.2984, "step": 24200 }, { "epoch": 2.95800365185636, "grad_norm": 5.351822376251221, "learning_rate": 9.288789237668161e-06, "loss": 2.326, "step": 24300 }, { "epoch": 2.9701765063907484, "grad_norm": 5.880828380584717, "learning_rate": 9.243946188340807e-06, "loss": 2.3399, "step": 24400 }, { "epoch": 2.982349360925137, "grad_norm": 5.407314777374268, "learning_rate": 9.199103139013453e-06, "loss": 2.3007, "step": 24500 }, { "epoch": 2.982349360925137, "eval_loss": 2.273526906967163, "eval_runtime": 6.9358, "eval_samples_per_second": 144.179, "eval_steps_per_second": 36.045, "step": 24500 }, { "epoch": 2.9945222154595252, "grad_norm": 5.49412727355957, "learning_rate": 9.154260089686099e-06, "loss": 2.3325, "step": 24600 }, { "epoch": 3.0066950699939134, "grad_norm": 5.382359981536865, "learning_rate": 9.109417040358746e-06, "loss": 2.2779, "step": 24700 }, { "epoch": 3.018867924528302, "grad_norm": 6.563231945037842, "learning_rate": 9.064573991031392e-06, "loss": 2.2142, "step": 24800 }, { "epoch": 3.0310407790626903, "grad_norm": 6.05570650100708, "learning_rate": 9.019730941704037e-06, "loss": 2.1952, "step": 24900 }, { "epoch": 3.0432136335970785, "grad_norm": 5.2819366455078125, "learning_rate": 8.974887892376683e-06, "loss": 2.2244, "step": 25000 }, { "epoch": 3.0432136335970785, "eval_loss": 2.253713846206665, "eval_runtime": 6.8869, "eval_samples_per_second": 145.202, "eval_steps_per_second": 36.301, "step": 25000 }, { "epoch": 3.0553864881314667, "grad_norm": 5.801946640014648, "learning_rate": 8.930044843049329e-06, "loss": 2.2423, "step": 25100 }, { "epoch": 3.067559342665855, "grad_norm": 5.829814910888672, "learning_rate": 8.885201793721973e-06, "loss": 2.2372, "step": 25200 }, { "epoch": 3.0797321972002436, "grad_norm": 5.983118534088135, "learning_rate": 8.840358744394619e-06, "loss": 2.2363, "step": 25300 }, { "epoch": 3.091905051734632, "grad_norm": 5.694368839263916, "learning_rate": 8.795515695067264e-06, "loss": 2.1785, "step": 25400 }, { "epoch": 3.10407790626902, "grad_norm": 5.976083755493164, "learning_rate": 8.75067264573991e-06, "loss": 2.2061, "step": 25500 }, { "epoch": 3.10407790626902, "eval_loss": 2.2468533515930176, "eval_runtime": 6.9894, "eval_samples_per_second": 143.075, "eval_steps_per_second": 35.769, "step": 25500 }, { "epoch": 3.1162507608034082, "grad_norm": 5.972872734069824, "learning_rate": 8.705829596412557e-06, "loss": 2.2269, "step": 25600 }, { "epoch": 3.128423615337797, "grad_norm": 5.245333671569824, "learning_rate": 8.660986547085203e-06, "loss": 2.2519, "step": 25700 }, { "epoch": 3.140596469872185, "grad_norm": 6.581233501434326, "learning_rate": 8.616143497757849e-06, "loss": 2.2181, "step": 25800 }, { "epoch": 3.1527693244065733, "grad_norm": 6.18913459777832, "learning_rate": 8.571300448430495e-06, "loss": 2.1892, "step": 25900 }, { "epoch": 3.1649421789409615, "grad_norm": 5.771265983581543, "learning_rate": 8.52645739910314e-06, "loss": 2.1789, "step": 26000 }, { "epoch": 3.1649421789409615, "eval_loss": 2.2344589233398438, "eval_runtime": 6.8267, "eval_samples_per_second": 146.484, "eval_steps_per_second": 36.621, "step": 26000 }, { "epoch": 3.17711503347535, "grad_norm": 6.273107528686523, "learning_rate": 8.481614349775784e-06, "loss": 2.2029, "step": 26100 }, { "epoch": 3.1892878880097384, "grad_norm": 6.768197059631348, "learning_rate": 8.43677130044843e-06, "loss": 2.2042, "step": 26200 }, { "epoch": 3.2014607425441266, "grad_norm": 7.103708267211914, "learning_rate": 8.391928251121076e-06, "loss": 2.2142, "step": 26300 }, { "epoch": 3.213633597078515, "grad_norm": 6.05976676940918, "learning_rate": 8.347085201793723e-06, "loss": 2.1747, "step": 26400 }, { "epoch": 3.225806451612903, "grad_norm": 5.711021900177002, "learning_rate": 8.302242152466369e-06, "loss": 2.2039, "step": 26500 }, { "epoch": 3.225806451612903, "eval_loss": 2.2293026447296143, "eval_runtime": 6.8771, "eval_samples_per_second": 145.41, "eval_steps_per_second": 36.352, "step": 26500 }, { "epoch": 3.2379793061472917, "grad_norm": 5.777741432189941, "learning_rate": 8.257399103139015e-06, "loss": 2.2259, "step": 26600 }, { "epoch": 3.25015216068168, "grad_norm": 5.676499843597412, "learning_rate": 8.21255605381166e-06, "loss": 2.1918, "step": 26700 }, { "epoch": 3.262325015216068, "grad_norm": 6.470264911651611, "learning_rate": 8.167713004484306e-06, "loss": 2.212, "step": 26800 }, { "epoch": 3.2744978697504563, "grad_norm": 6.308848857879639, "learning_rate": 8.122869955156952e-06, "loss": 2.2138, "step": 26900 }, { "epoch": 3.286670724284845, "grad_norm": 5.39501428604126, "learning_rate": 8.078026905829596e-06, "loss": 2.248, "step": 27000 }, { "epoch": 3.286670724284845, "eval_loss": 2.2192747592926025, "eval_runtime": 6.9636, "eval_samples_per_second": 143.603, "eval_steps_per_second": 35.901, "step": 27000 }, { "epoch": 3.298843578819233, "grad_norm": 5.875838756561279, "learning_rate": 8.033183856502242e-06, "loss": 2.2131, "step": 27100 }, { "epoch": 3.3110164333536214, "grad_norm": 5.159265518188477, "learning_rate": 7.988340807174887e-06, "loss": 2.2037, "step": 27200 }, { "epoch": 3.3231892878880096, "grad_norm": 5.619683265686035, "learning_rate": 7.943946188340808e-06, "loss": 2.1818, "step": 27300 }, { "epoch": 3.3353621424223983, "grad_norm": 7.503751277923584, "learning_rate": 7.899103139013453e-06, "loss": 2.2087, "step": 27400 }, { "epoch": 3.3475349969567865, "grad_norm": 5.3004937171936035, "learning_rate": 7.854260089686099e-06, "loss": 2.2151, "step": 27500 }, { "epoch": 3.3475349969567865, "eval_loss": 2.209369659423828, "eval_runtime": 6.9186, "eval_samples_per_second": 144.537, "eval_steps_per_second": 36.134, "step": 27500 }, { "epoch": 3.3597078514911747, "grad_norm": 6.6273193359375, "learning_rate": 7.809417040358745e-06, "loss": 2.2208, "step": 27600 }, { "epoch": 3.371880706025563, "grad_norm": 6.1234588623046875, "learning_rate": 7.76457399103139e-06, "loss": 2.1957, "step": 27700 }, { "epoch": 3.384053560559951, "grad_norm": 6.082681655883789, "learning_rate": 7.719730941704036e-06, "loss": 2.2202, "step": 27800 }, { "epoch": 3.3962264150943398, "grad_norm": 6.618956089019775, "learning_rate": 7.674887892376682e-06, "loss": 2.2045, "step": 27900 }, { "epoch": 3.408399269628728, "grad_norm": 5.74383544921875, "learning_rate": 7.630044843049328e-06, "loss": 2.2308, "step": 28000 }, { "epoch": 3.408399269628728, "eval_loss": 2.206360340118408, "eval_runtime": 6.9078, "eval_samples_per_second": 144.763, "eval_steps_per_second": 36.191, "step": 28000 }, { "epoch": 3.420572124163116, "grad_norm": 6.5505690574646, "learning_rate": 7.5852017937219735e-06, "loss": 2.2036, "step": 28100 }, { "epoch": 3.4327449786975044, "grad_norm": 5.887704372406006, "learning_rate": 7.540358744394619e-06, "loss": 2.1714, "step": 28200 }, { "epoch": 3.444917833231893, "grad_norm": 6.853738784790039, "learning_rate": 7.495515695067265e-06, "loss": 2.2269, "step": 28300 }, { "epoch": 3.4570906877662813, "grad_norm": 5.702883243560791, "learning_rate": 7.4506726457399115e-06, "loss": 2.192, "step": 28400 }, { "epoch": 3.4692635423006695, "grad_norm": 6.062043190002441, "learning_rate": 7.405829596412557e-06, "loss": 2.2095, "step": 28500 }, { "epoch": 3.4692635423006695, "eval_loss": 2.1974008083343506, "eval_runtime": 6.9256, "eval_samples_per_second": 144.392, "eval_steps_per_second": 36.098, "step": 28500 }, { "epoch": 3.4814363968350577, "grad_norm": 5.228243827819824, "learning_rate": 7.360986547085203e-06, "loss": 2.221, "step": 28600 }, { "epoch": 3.4936092513694463, "grad_norm": 5.8091607093811035, "learning_rate": 7.316143497757848e-06, "loss": 2.2062, "step": 28700 }, { "epoch": 3.5057821059038345, "grad_norm": 4.786416053771973, "learning_rate": 7.2713004484304936e-06, "loss": 2.1716, "step": 28800 }, { "epoch": 3.5179549604382228, "grad_norm": 6.92462158203125, "learning_rate": 7.226457399103139e-06, "loss": 2.211, "step": 28900 }, { "epoch": 3.530127814972611, "grad_norm": 7.192811489105225, "learning_rate": 7.181614349775785e-06, "loss": 2.2085, "step": 29000 }, { "epoch": 3.530127814972611, "eval_loss": 2.185516595840454, "eval_runtime": 6.8385, "eval_samples_per_second": 146.231, "eval_steps_per_second": 36.558, "step": 29000 }, { "epoch": 3.542300669506999, "grad_norm": 5.579026222229004, "learning_rate": 7.136771300448431e-06, "loss": 2.1974, "step": 29100 }, { "epoch": 3.554473524041388, "grad_norm": 6.277022838592529, "learning_rate": 7.0919282511210765e-06, "loss": 2.1982, "step": 29200 }, { "epoch": 3.566646378575776, "grad_norm": 5.486943244934082, "learning_rate": 7.047533632286996e-06, "loss": 2.1877, "step": 29300 }, { "epoch": 3.5788192331101643, "grad_norm": 6.431853771209717, "learning_rate": 7.0026905829596416e-06, "loss": 2.2109, "step": 29400 }, { "epoch": 3.590992087644553, "grad_norm": 6.601170539855957, "learning_rate": 6.957847533632287e-06, "loss": 2.2122, "step": 29500 }, { "epoch": 3.590992087644553, "eval_loss": 2.1791625022888184, "eval_runtime": 6.9136, "eval_samples_per_second": 144.642, "eval_steps_per_second": 36.161, "step": 29500 }, { "epoch": 3.603164942178941, "grad_norm": 5.159702301025391, "learning_rate": 6.913004484304934e-06, "loss": 2.2246, "step": 29600 }, { "epoch": 3.6153377967133293, "grad_norm": 6.260033130645752, "learning_rate": 6.86816143497758e-06, "loss": 2.2044, "step": 29700 }, { "epoch": 3.6275106512477175, "grad_norm": 5.428004741668701, "learning_rate": 6.823318385650225e-06, "loss": 2.1735, "step": 29800 }, { "epoch": 3.6396835057821058, "grad_norm": 5.895395278930664, "learning_rate": 6.77847533632287e-06, "loss": 2.2027, "step": 29900 }, { "epoch": 3.6518563603164944, "grad_norm": 5.690395355224609, "learning_rate": 6.733632286995516e-06, "loss": 2.2145, "step": 30000 }, { "epoch": 3.6518563603164944, "eval_loss": 2.177266836166382, "eval_runtime": 7.0906, "eval_samples_per_second": 141.032, "eval_steps_per_second": 35.258, "step": 30000 }, { "epoch": 3.6640292148508826, "grad_norm": 5.669330596923828, "learning_rate": 6.688789237668162e-06, "loss": 2.1954, "step": 30100 }, { "epoch": 3.676202069385271, "grad_norm": 6.493986129760742, "learning_rate": 6.643946188340807e-06, "loss": 2.2011, "step": 30200 }, { "epoch": 3.688374923919659, "grad_norm": 7.738183975219727, "learning_rate": 6.599103139013453e-06, "loss": 2.2347, "step": 30300 }, { "epoch": 3.7005477784540473, "grad_norm": 6.565354347229004, "learning_rate": 6.5542600896861e-06, "loss": 2.1945, "step": 30400 }, { "epoch": 3.712720632988436, "grad_norm": 6.189778804779053, "learning_rate": 6.509417040358745e-06, "loss": 2.2141, "step": 30500 }, { "epoch": 3.712720632988436, "eval_loss": 2.168225049972534, "eval_runtime": 6.9549, "eval_samples_per_second": 143.785, "eval_steps_per_second": 35.946, "step": 30500 }, { "epoch": 3.724893487522824, "grad_norm": 5.11403226852417, "learning_rate": 6.464573991031391e-06, "loss": 2.2121, "step": 30600 }, { "epoch": 3.7370663420572123, "grad_norm": 6.672878742218018, "learning_rate": 6.419730941704037e-06, "loss": 2.181, "step": 30700 }, { "epoch": 3.749239196591601, "grad_norm": 5.224799156188965, "learning_rate": 6.374887892376682e-06, "loss": 2.1807, "step": 30800 }, { "epoch": 3.761412051125989, "grad_norm": 6.442698955535889, "learning_rate": 6.330493273542602e-06, "loss": 2.2021, "step": 30900 }, { "epoch": 3.7735849056603774, "grad_norm": 6.708118438720703, "learning_rate": 6.285650224215248e-06, "loss": 2.182, "step": 31000 }, { "epoch": 3.7735849056603774, "eval_loss": 2.1596100330352783, "eval_runtime": 6.872, "eval_samples_per_second": 145.518, "eval_steps_per_second": 36.38, "step": 31000 }, { "epoch": 3.7857577601947656, "grad_norm": 6.288793087005615, "learning_rate": 6.2408071748878926e-06, "loss": 2.1886, "step": 31100 }, { "epoch": 3.797930614729154, "grad_norm": 6.112220287322998, "learning_rate": 6.195964125560538e-06, "loss": 2.2107, "step": 31200 }, { "epoch": 3.8101034692635425, "grad_norm": 6.044913291931152, "learning_rate": 6.151121076233184e-06, "loss": 2.2246, "step": 31300 }, { "epoch": 3.8222763237979307, "grad_norm": 6.079142093658447, "learning_rate": 6.10627802690583e-06, "loss": 2.2187, "step": 31400 }, { "epoch": 3.834449178332319, "grad_norm": 5.865757942199707, "learning_rate": 6.0614349775784755e-06, "loss": 2.2069, "step": 31500 }, { "epoch": 3.834449178332319, "eval_loss": 2.156599760055542, "eval_runtime": 6.8965, "eval_samples_per_second": 145.001, "eval_steps_per_second": 36.25, "step": 31500 }, { "epoch": 3.846622032866707, "grad_norm": 6.289271354675293, "learning_rate": 6.016591928251122e-06, "loss": 2.2349, "step": 31600 }, { "epoch": 3.8587948874010953, "grad_norm": 6.607455730438232, "learning_rate": 5.971748878923768e-06, "loss": 2.1849, "step": 31700 }, { "epoch": 3.870967741935484, "grad_norm": 6.193937301635742, "learning_rate": 5.9269058295964135e-06, "loss": 2.1901, "step": 31800 }, { "epoch": 3.883140596469872, "grad_norm": 5.6171650886535645, "learning_rate": 5.882062780269059e-06, "loss": 2.1968, "step": 31900 }, { "epoch": 3.8953134510042604, "grad_norm": 7.239607334136963, "learning_rate": 5.837219730941704e-06, "loss": 2.1984, "step": 32000 }, { "epoch": 3.8953134510042604, "eval_loss": 2.1437973976135254, "eval_runtime": 6.9069, "eval_samples_per_second": 144.782, "eval_steps_per_second": 36.195, "step": 32000 }, { "epoch": 3.907486305538649, "grad_norm": 6.314813613891602, "learning_rate": 5.79237668161435e-06, "loss": 2.1706, "step": 32100 }, { "epoch": 3.9196591600730373, "grad_norm": 5.416664123535156, "learning_rate": 5.7475336322869956e-06, "loss": 2.1852, "step": 32200 }, { "epoch": 3.9318320146074255, "grad_norm": 6.1277594566345215, "learning_rate": 5.702690582959641e-06, "loss": 2.2202, "step": 32300 }, { "epoch": 3.9440048691418137, "grad_norm": 6.0932440757751465, "learning_rate": 5.657847533632288e-06, "loss": 2.1591, "step": 32400 }, { "epoch": 3.956177723676202, "grad_norm": 6.029341697692871, "learning_rate": 5.613004484304934e-06, "loss": 2.1805, "step": 32500 }, { "epoch": 3.956177723676202, "eval_loss": 2.138620138168335, "eval_runtime": 6.9091, "eval_samples_per_second": 144.737, "eval_steps_per_second": 36.184, "step": 32500 }, { "epoch": 3.9683505782105906, "grad_norm": 6.374738693237305, "learning_rate": 5.568161434977579e-06, "loss": 2.168, "step": 32600 }, { "epoch": 3.9805234327449788, "grad_norm": 6.206404209136963, "learning_rate": 5.523318385650225e-06, "loss": 2.2168, "step": 32700 }, { "epoch": 3.992696287279367, "grad_norm": 6.701908588409424, "learning_rate": 5.478475336322871e-06, "loss": 2.1796, "step": 32800 }, { "epoch": 4.004869141813756, "grad_norm": 6.456433296203613, "learning_rate": 5.433632286995516e-06, "loss": 2.1461, "step": 32900 }, { "epoch": 4.017041996348143, "grad_norm": 6.578303337097168, "learning_rate": 5.388789237668161e-06, "loss": 2.1061, "step": 33000 }, { "epoch": 4.017041996348143, "eval_loss": 2.123652458190918, "eval_runtime": 6.8771, "eval_samples_per_second": 145.409, "eval_steps_per_second": 36.352, "step": 33000 }, { "epoch": 4.029214850882532, "grad_norm": 5.736875057220459, "learning_rate": 5.343946188340807e-06, "loss": 2.098, "step": 33100 }, { "epoch": 4.04138770541692, "grad_norm": 6.322964191436768, "learning_rate": 5.299103139013453e-06, "loss": 2.1334, "step": 33200 }, { "epoch": 4.0535605599513085, "grad_norm": 7.002594470977783, "learning_rate": 5.2542600896860994e-06, "loss": 2.1186, "step": 33300 }, { "epoch": 4.065733414485697, "grad_norm": 6.592886924743652, "learning_rate": 5.209417040358745e-06, "loss": 2.1218, "step": 33400 }, { "epoch": 4.077906269020085, "grad_norm": 6.610073566436768, "learning_rate": 5.164573991031391e-06, "loss": 2.1112, "step": 33500 }, { "epoch": 4.077906269020085, "eval_loss": 2.115506172180176, "eval_runtime": 6.8933, "eval_samples_per_second": 145.068, "eval_steps_per_second": 36.267, "step": 33500 }, { "epoch": 4.090079123554474, "grad_norm": 6.308100700378418, "learning_rate": 5.119730941704037e-06, "loss": 2.0984, "step": 33600 }, { "epoch": 4.102251978088862, "grad_norm": 5.7667083740234375, "learning_rate": 5.074887892376682e-06, "loss": 2.1189, "step": 33700 }, { "epoch": 4.11442483262325, "grad_norm": 6.554234504699707, "learning_rate": 5.030044843049327e-06, "loss": 2.1006, "step": 33800 }, { "epoch": 4.126597687157639, "grad_norm": 6.494872570037842, "learning_rate": 4.985201793721974e-06, "loss": 2.1258, "step": 33900 }, { "epoch": 4.138770541692026, "grad_norm": 6.796899318695068, "learning_rate": 4.940807174887893e-06, "loss": 2.1335, "step": 34000 }, { "epoch": 4.138770541692026, "eval_loss": 2.1111514568328857, "eval_runtime": 6.8774, "eval_samples_per_second": 145.404, "eval_steps_per_second": 36.351, "step": 34000 }, { "epoch": 4.150943396226415, "grad_norm": 5.832895755767822, "learning_rate": 4.895964125560538e-06, "loss": 2.0956, "step": 34100 }, { "epoch": 4.163116250760804, "grad_norm": 5.17689847946167, "learning_rate": 4.851121076233185e-06, "loss": 2.1503, "step": 34200 }, { "epoch": 4.1752891052951915, "grad_norm": 6.65399694442749, "learning_rate": 4.80627802690583e-06, "loss": 2.1244, "step": 34300 }, { "epoch": 4.18746195982958, "grad_norm": 6.744587421417236, "learning_rate": 4.761434977578476e-06, "loss": 2.1237, "step": 34400 }, { "epoch": 4.199634814363968, "grad_norm": 6.663182258605957, "learning_rate": 4.716591928251121e-06, "loss": 2.1198, "step": 34500 }, { "epoch": 4.199634814363968, "eval_loss": 2.1056010723114014, "eval_runtime": 6.9075, "eval_samples_per_second": 144.771, "eval_steps_per_second": 36.193, "step": 34500 }, { "epoch": 4.211807668898357, "grad_norm": 6.046566009521484, "learning_rate": 4.671748878923767e-06, "loss": 2.0746, "step": 34600 }, { "epoch": 4.223980523432745, "grad_norm": 6.08657169342041, "learning_rate": 4.626905829596413e-06, "loss": 2.1154, "step": 34700 }, { "epoch": 4.236153377967133, "grad_norm": 6.235377788543701, "learning_rate": 4.582062780269059e-06, "loss": 2.1013, "step": 34800 }, { "epoch": 4.248326232501522, "grad_norm": 5.864556312561035, "learning_rate": 4.537219730941705e-06, "loss": 2.1293, "step": 34900 }, { "epoch": 4.26049908703591, "grad_norm": 6.5032124519348145, "learning_rate": 4.49237668161435e-06, "loss": 2.0909, "step": 35000 }, { "epoch": 4.26049908703591, "eval_loss": 2.1002509593963623, "eval_runtime": 6.9552, "eval_samples_per_second": 143.777, "eval_steps_per_second": 35.944, "step": 35000 }, { "epoch": 4.272671941570298, "grad_norm": 5.491804599761963, "learning_rate": 4.447533632286996e-06, "loss": 2.1075, "step": 35100 }, { "epoch": 4.284844796104687, "grad_norm": 6.165935516357422, "learning_rate": 4.402690582959642e-06, "loss": 2.1172, "step": 35200 }, { "epoch": 4.2970176506390745, "grad_norm": 6.2660369873046875, "learning_rate": 4.357847533632288e-06, "loss": 2.1234, "step": 35300 }, { "epoch": 4.309190505173463, "grad_norm": 6.266602516174316, "learning_rate": 4.3130044843049325e-06, "loss": 2.1023, "step": 35400 }, { "epoch": 4.321363359707852, "grad_norm": 6.377227306365967, "learning_rate": 4.268161434977579e-06, "loss": 2.095, "step": 35500 }, { "epoch": 4.321363359707852, "eval_loss": 2.096508264541626, "eval_runtime": 6.896, "eval_samples_per_second": 145.011, "eval_steps_per_second": 36.253, "step": 35500 }, { "epoch": 4.33353621424224, "grad_norm": 5.756918907165527, "learning_rate": 4.223318385650225e-06, "loss": 2.1258, "step": 35600 }, { "epoch": 4.345709068776628, "grad_norm": 6.3634934425354, "learning_rate": 4.1784753363228705e-06, "loss": 2.1326, "step": 35700 }, { "epoch": 4.357881923311016, "grad_norm": 6.081814765930176, "learning_rate": 4.133632286995516e-06, "loss": 2.0923, "step": 35800 }, { "epoch": 4.370054777845405, "grad_norm": 5.829545021057129, "learning_rate": 4.088789237668161e-06, "loss": 2.1001, "step": 35900 }, { "epoch": 4.382227632379793, "grad_norm": 7.019509315490723, "learning_rate": 4.043946188340808e-06, "loss": 2.0947, "step": 36000 }, { "epoch": 4.382227632379793, "eval_loss": 2.0914690494537354, "eval_runtime": 6.9162, "eval_samples_per_second": 144.589, "eval_steps_per_second": 36.147, "step": 36000 }, { "epoch": 4.394400486914181, "grad_norm": 7.135252952575684, "learning_rate": 3.9991031390134534e-06, "loss": 2.111, "step": 36100 }, { "epoch": 4.40657334144857, "grad_norm": 5.3956522941589355, "learning_rate": 3.954260089686099e-06, "loss": 2.1072, "step": 36200 }, { "epoch": 4.418746195982958, "grad_norm": 5.853066921234131, "learning_rate": 3.909417040358744e-06, "loss": 2.1327, "step": 36300 }, { "epoch": 4.430919050517346, "grad_norm": 6.294539928436279, "learning_rate": 3.864573991031391e-06, "loss": 2.0886, "step": 36400 }, { "epoch": 4.443091905051735, "grad_norm": 7.183646202087402, "learning_rate": 3.819730941704036e-06, "loss": 2.139, "step": 36500 }, { "epoch": 4.443091905051735, "eval_loss": 2.0876991748809814, "eval_runtime": 6.8527, "eval_samples_per_second": 145.928, "eval_steps_per_second": 36.482, "step": 36500 }, { "epoch": 4.455264759586123, "grad_norm": 6.069007396697998, "learning_rate": 3.7748878923766817e-06, "loss": 2.1076, "step": 36600 }, { "epoch": 4.467437614120511, "grad_norm": 6.092281341552734, "learning_rate": 3.7300448430493274e-06, "loss": 2.1274, "step": 36700 }, { "epoch": 4.4796104686549, "grad_norm": 6.095892429351807, "learning_rate": 3.6852017937219735e-06, "loss": 2.1202, "step": 36800 }, { "epoch": 4.491783323189288, "grad_norm": 6.349238395690918, "learning_rate": 3.6403587443946193e-06, "loss": 2.1192, "step": 36900 }, { "epoch": 4.503956177723676, "grad_norm": 6.508525848388672, "learning_rate": 3.595515695067265e-06, "loss": 2.106, "step": 37000 }, { "epoch": 4.503956177723676, "eval_loss": 2.0852510929107666, "eval_runtime": 6.9159, "eval_samples_per_second": 144.595, "eval_steps_per_second": 36.149, "step": 37000 }, { "epoch": 4.516129032258064, "grad_norm": 6.2998046875, "learning_rate": 3.5506726457399103e-06, "loss": 2.1264, "step": 37100 }, { "epoch": 4.528301886792453, "grad_norm": 6.988924503326416, "learning_rate": 3.5058295964125565e-06, "loss": 2.0855, "step": 37200 }, { "epoch": 4.540474741326841, "grad_norm": 5.999715328216553, "learning_rate": 3.460986547085202e-06, "loss": 2.1288, "step": 37300 }, { "epoch": 4.552647595861229, "grad_norm": 5.390603542327881, "learning_rate": 3.416143497757848e-06, "loss": 2.1119, "step": 37400 }, { "epoch": 4.564820450395618, "grad_norm": 5.443009853363037, "learning_rate": 3.3713004484304932e-06, "loss": 2.1137, "step": 37500 }, { "epoch": 4.564820450395618, "eval_loss": 2.0807323455810547, "eval_runtime": 6.868, "eval_samples_per_second": 145.603, "eval_steps_per_second": 36.401, "step": 37500 }, { "epoch": 4.5769933049300064, "grad_norm": 6.028597831726074, "learning_rate": 3.326457399103139e-06, "loss": 2.1233, "step": 37600 }, { "epoch": 4.589166159464394, "grad_norm": 6.3508992195129395, "learning_rate": 3.281614349775785e-06, "loss": 2.0785, "step": 37700 }, { "epoch": 4.601339013998783, "grad_norm": 6.304683685302734, "learning_rate": 3.237219730941704e-06, "loss": 2.1058, "step": 37800 }, { "epoch": 4.6135118685331715, "grad_norm": 5.774105548858643, "learning_rate": 3.1923766816143497e-06, "loss": 2.1363, "step": 37900 }, { "epoch": 4.625684723067559, "grad_norm": 6.000542163848877, "learning_rate": 3.147533632286996e-06, "loss": 2.1097, "step": 38000 }, { "epoch": 4.625684723067559, "eval_loss": 2.0734775066375732, "eval_runtime": 6.9601, "eval_samples_per_second": 143.677, "eval_steps_per_second": 35.919, "step": 38000 }, { "epoch": 4.637857577601948, "grad_norm": 6.037074565887451, "learning_rate": 3.1026905829596416e-06, "loss": 2.0839, "step": 38100 }, { "epoch": 4.650030432136336, "grad_norm": 6.941400051116943, "learning_rate": 3.0578475336322874e-06, "loss": 2.0961, "step": 38200 }, { "epoch": 4.662203286670724, "grad_norm": 6.625183582305908, "learning_rate": 3.0130044843049327e-06, "loss": 2.1532, "step": 38300 }, { "epoch": 4.674376141205112, "grad_norm": 5.852269649505615, "learning_rate": 2.968161434977579e-06, "loss": 2.1085, "step": 38400 }, { "epoch": 4.686548995739501, "grad_norm": 5.130061626434326, "learning_rate": 2.9233183856502245e-06, "loss": 2.1028, "step": 38500 }, { "epoch": 4.686548995739501, "eval_loss": 2.070453643798828, "eval_runtime": 6.9577, "eval_samples_per_second": 143.725, "eval_steps_per_second": 35.931, "step": 38500 }, { "epoch": 4.6987218502738894, "grad_norm": 6.478227615356445, "learning_rate": 2.8784753363228703e-06, "loss": 2.0895, "step": 38600 }, { "epoch": 4.710894704808277, "grad_norm": 6.043088436126709, "learning_rate": 2.8336322869955156e-06, "loss": 2.1032, "step": 38700 }, { "epoch": 4.723067559342666, "grad_norm": 6.732186317443848, "learning_rate": 2.7887892376681617e-06, "loss": 2.0838, "step": 38800 }, { "epoch": 4.7352404138770545, "grad_norm": 6.393290996551514, "learning_rate": 2.7439461883408075e-06, "loss": 2.1058, "step": 38900 }, { "epoch": 4.747413268411442, "grad_norm": 6.3943705558776855, "learning_rate": 2.699103139013453e-06, "loss": 2.1142, "step": 39000 }, { "epoch": 4.747413268411442, "eval_loss": 2.0703060626983643, "eval_runtime": 7.0835, "eval_samples_per_second": 141.172, "eval_steps_per_second": 35.293, "step": 39000 }, { "epoch": 4.759586122945831, "grad_norm": 5.651825428009033, "learning_rate": 2.654260089686099e-06, "loss": 2.1099, "step": 39100 }, { "epoch": 4.77175897748022, "grad_norm": 5.763203144073486, "learning_rate": 2.609417040358745e-06, "loss": 2.1199, "step": 39200 }, { "epoch": 4.783931832014607, "grad_norm": 6.404742240905762, "learning_rate": 2.5645739910313904e-06, "loss": 2.1065, "step": 39300 }, { "epoch": 4.796104686548996, "grad_norm": 6.63946533203125, "learning_rate": 2.519730941704036e-06, "loss": 2.0982, "step": 39400 }, { "epoch": 4.808277541083384, "grad_norm": 6.3910675048828125, "learning_rate": 2.474887892376682e-06, "loss": 2.0945, "step": 39500 }, { "epoch": 4.808277541083384, "eval_loss": 2.066244602203369, "eval_runtime": 7.0266, "eval_samples_per_second": 142.317, "eval_steps_per_second": 35.579, "step": 39500 }, { "epoch": 4.820450395617772, "grad_norm": 6.50945520401001, "learning_rate": 2.4300448430493276e-06, "loss": 2.0619, "step": 39600 }, { "epoch": 4.83262325015216, "grad_norm": 6.681357383728027, "learning_rate": 2.3852017937219733e-06, "loss": 2.097, "step": 39700 }, { "epoch": 4.844796104686549, "grad_norm": 5.813176155090332, "learning_rate": 2.340358744394619e-06, "loss": 2.1173, "step": 39800 }, { "epoch": 4.8569689592209375, "grad_norm": 6.834031581878662, "learning_rate": 2.2955156950672647e-06, "loss": 2.0721, "step": 39900 }, { "epoch": 4.869141813755325, "grad_norm": 5.929574966430664, "learning_rate": 2.2506726457399105e-06, "loss": 2.1325, "step": 40000 }, { "epoch": 4.869141813755325, "eval_loss": 2.064162254333496, "eval_runtime": 7.021, "eval_samples_per_second": 142.43, "eval_steps_per_second": 35.608, "step": 40000 }, { "epoch": 4.881314668289714, "grad_norm": 6.4569830894470215, "learning_rate": 2.205829596412556e-06, "loss": 2.1224, "step": 40100 }, { "epoch": 4.893487522824103, "grad_norm": 6.773449897766113, "learning_rate": 2.160986547085202e-06, "loss": 2.1037, "step": 40200 }, { "epoch": 4.90566037735849, "grad_norm": 6.341082572937012, "learning_rate": 2.1165919282511213e-06, "loss": 2.0936, "step": 40300 }, { "epoch": 4.917833231892879, "grad_norm": 6.29095983505249, "learning_rate": 2.071748878923767e-06, "loss": 2.141, "step": 40400 }, { "epoch": 4.930006086427268, "grad_norm": 7.924270153045654, "learning_rate": 2.0269058295964127e-06, "loss": 2.0937, "step": 40500 }, { "epoch": 4.930006086427268, "eval_loss": 2.0617458820343018, "eval_runtime": 6.9841, "eval_samples_per_second": 143.183, "eval_steps_per_second": 35.796, "step": 40500 }, { "epoch": 4.942178940961655, "grad_norm": 7.386099338531494, "learning_rate": 1.9820627802690585e-06, "loss": 2.1187, "step": 40600 }, { "epoch": 4.954351795496044, "grad_norm": 6.6330413818359375, "learning_rate": 1.937219730941704e-06, "loss": 2.0891, "step": 40700 }, { "epoch": 4.966524650030432, "grad_norm": 5.590965747833252, "learning_rate": 1.89237668161435e-06, "loss": 2.0809, "step": 40800 }, { "epoch": 4.9786975045648205, "grad_norm": 6.468350410461426, "learning_rate": 1.8475336322869959e-06, "loss": 2.0654, "step": 40900 }, { "epoch": 4.990870359099208, "grad_norm": 6.724806308746338, "learning_rate": 1.8026905829596414e-06, "loss": 2.0938, "step": 41000 }, { "epoch": 4.990870359099208, "eval_loss": 2.057617664337158, "eval_runtime": 7.1367, "eval_samples_per_second": 140.121, "eval_steps_per_second": 35.03, "step": 41000 }, { "epoch": 5.003043213633597, "grad_norm": 5.549363136291504, "learning_rate": 1.7578475336322873e-06, "loss": 2.0672, "step": 41100 }, { "epoch": 5.015216068167986, "grad_norm": 6.4161152839660645, "learning_rate": 1.7130044843049328e-06, "loss": 2.0589, "step": 41200 }, { "epoch": 5.027388922702373, "grad_norm": 6.318953514099121, "learning_rate": 1.6681614349775786e-06, "loss": 2.0643, "step": 41300 }, { "epoch": 5.039561777236762, "grad_norm": 7.292160987854004, "learning_rate": 1.6233183856502243e-06, "loss": 2.0718, "step": 41400 }, { "epoch": 5.051734631771151, "grad_norm": 6.140988349914551, "learning_rate": 1.57847533632287e-06, "loss": 2.0437, "step": 41500 }, { "epoch": 5.051734631771151, "eval_loss": 2.051799774169922, "eval_runtime": 7.0596, "eval_samples_per_second": 141.651, "eval_steps_per_second": 35.413, "step": 41500 }, { "epoch": 5.063907486305538, "grad_norm": 6.15008020401001, "learning_rate": 1.533632286995516e-06, "loss": 2.0561, "step": 41600 }, { "epoch": 5.076080340839927, "grad_norm": 6.889511585235596, "learning_rate": 1.4887892376681615e-06, "loss": 2.0729, "step": 41700 }, { "epoch": 5.088253195374315, "grad_norm": 5.815738201141357, "learning_rate": 1.4439461883408074e-06, "loss": 2.0413, "step": 41800 }, { "epoch": 5.1004260499087035, "grad_norm": 5.965245723724365, "learning_rate": 1.399103139013453e-06, "loss": 2.0407, "step": 41900 }, { "epoch": 5.112598904443092, "grad_norm": 7.188913345336914, "learning_rate": 1.3542600896860989e-06, "loss": 2.0781, "step": 42000 }, { "epoch": 5.112598904443092, "eval_loss": 2.0501816272735596, "eval_runtime": 6.9017, "eval_samples_per_second": 144.892, "eval_steps_per_second": 36.223, "step": 42000 }, { "epoch": 5.12477175897748, "grad_norm": 7.101166725158691, "learning_rate": 1.3094170403587444e-06, "loss": 2.0348, "step": 42100 }, { "epoch": 5.136944613511869, "grad_norm": 5.820453643798828, "learning_rate": 1.2645739910313903e-06, "loss": 2.0497, "step": 42200 }, { "epoch": 5.149117468046257, "grad_norm": 5.811570167541504, "learning_rate": 1.2197309417040358e-06, "loss": 2.058, "step": 42300 }, { "epoch": 5.161290322580645, "grad_norm": 6.54494047164917, "learning_rate": 1.1748878923766818e-06, "loss": 2.0833, "step": 42400 }, { "epoch": 5.173463177115034, "grad_norm": 6.547015190124512, "learning_rate": 1.1300448430493275e-06, "loss": 2.0671, "step": 42500 }, { "epoch": 5.173463177115034, "eval_loss": 2.049518346786499, "eval_runtime": 7.0155, "eval_samples_per_second": 142.542, "eval_steps_per_second": 35.636, "step": 42500 }, { "epoch": 5.185636031649421, "grad_norm": 6.079003810882568, "learning_rate": 1.0852017937219732e-06, "loss": 2.0732, "step": 42600 }, { "epoch": 5.19780888618381, "grad_norm": 6.835382461547852, "learning_rate": 1.040358744394619e-06, "loss": 2.0516, "step": 42700 }, { "epoch": 5.209981740718199, "grad_norm": 6.055761814117432, "learning_rate": 9.955156950672647e-07, "loss": 2.0639, "step": 42800 }, { "epoch": 5.2221545952525865, "grad_norm": 6.516651630401611, "learning_rate": 9.511210762331839e-07, "loss": 2.0597, "step": 42900 }, { "epoch": 5.234327449786975, "grad_norm": 5.874512195587158, "learning_rate": 9.062780269058297e-07, "loss": 2.073, "step": 43000 }, { "epoch": 5.234327449786975, "eval_loss": 2.0482187271118164, "eval_runtime": 6.8896, "eval_samples_per_second": 145.145, "eval_steps_per_second": 36.286, "step": 43000 }, { "epoch": 5.246500304321363, "grad_norm": 6.2515459060668945, "learning_rate": 8.614349775784754e-07, "loss": 2.0594, "step": 43100 }, { "epoch": 5.258673158855752, "grad_norm": 6.7219438552856445, "learning_rate": 8.165919282511211e-07, "loss": 2.0138, "step": 43200 }, { "epoch": 5.27084601339014, "grad_norm": 6.588565349578857, "learning_rate": 7.71748878923767e-07, "loss": 2.089, "step": 43300 }, { "epoch": 5.283018867924528, "grad_norm": 6.52641487121582, "learning_rate": 7.269058295964127e-07, "loss": 2.0274, "step": 43400 }, { "epoch": 5.295191722458917, "grad_norm": 7.77009391784668, "learning_rate": 6.820627802690584e-07, "loss": 2.0412, "step": 43500 }, { "epoch": 5.295191722458917, "eval_loss": 2.0471861362457275, "eval_runtime": 6.8473, "eval_samples_per_second": 146.043, "eval_steps_per_second": 36.511, "step": 43500 }, { "epoch": 5.307364576993305, "grad_norm": 6.563704490661621, "learning_rate": 6.372197309417041e-07, "loss": 2.0538, "step": 43600 }, { "epoch": 5.319537431527693, "grad_norm": 5.842877388000488, "learning_rate": 5.923766816143499e-07, "loss": 2.0378, "step": 43700 }, { "epoch": 5.331710286062082, "grad_norm": 5.96117639541626, "learning_rate": 5.475336322869956e-07, "loss": 2.0702, "step": 43800 }, { "epoch": 5.3438831405964695, "grad_norm": 6.195252895355225, "learning_rate": 5.026905829596413e-07, "loss": 2.0519, "step": 43900 }, { "epoch": 5.356055995130858, "grad_norm": 6.967134475708008, "learning_rate": 4.5784753363228705e-07, "loss": 2.0747, "step": 44000 }, { "epoch": 5.356055995130858, "eval_loss": 2.046496629714966, "eval_runtime": 6.9289, "eval_samples_per_second": 144.322, "eval_steps_per_second": 36.081, "step": 44000 }, { "epoch": 5.368228849665247, "grad_norm": 5.7902984619140625, "learning_rate": 4.130044843049328e-07, "loss": 2.0551, "step": 44100 }, { "epoch": 5.380401704199635, "grad_norm": 6.01054048538208, "learning_rate": 3.6816143497757846e-07, "loss": 2.0569, "step": 44200 }, { "epoch": 5.392574558734023, "grad_norm": 6.690357685089111, "learning_rate": 3.2331838565022424e-07, "loss": 2.08, "step": 44300 }, { "epoch": 5.404747413268412, "grad_norm": 5.836359024047852, "learning_rate": 2.7847533632286997e-07, "loss": 2.0405, "step": 44400 }, { "epoch": 5.4169202678028, "grad_norm": 6.3250298500061035, "learning_rate": 2.3363228699551572e-07, "loss": 2.0717, "step": 44500 }, { "epoch": 5.4169202678028, "eval_loss": 2.04555606842041, "eval_runtime": 6.9014, "eval_samples_per_second": 144.899, "eval_steps_per_second": 36.225, "step": 44500 }, { "epoch": 5.429093122337188, "grad_norm": 6.5666890144348145, "learning_rate": 1.8878923766816145e-07, "loss": 2.06, "step": 44600 }, { "epoch": 5.441265976871576, "grad_norm": 7.2658843994140625, "learning_rate": 1.4394618834080718e-07, "loss": 2.0556, "step": 44700 }, { "epoch": 5.453438831405965, "grad_norm": 6.671789646148682, "learning_rate": 9.910313901345293e-08, "loss": 2.0642, "step": 44800 }, { "epoch": 5.465611685940353, "grad_norm": 5.944987773895264, "learning_rate": 5.426008968609866e-08, "loss": 2.0414, "step": 44900 }, { "epoch": 5.477784540474741, "grad_norm": 6.427646636962891, "learning_rate": 9.417040358744396e-09, "loss": 2.0667, "step": 45000 }, { "epoch": 5.477784540474741, "eval_loss": 2.0452468395233154, "eval_runtime": 7.0229, "eval_samples_per_second": 142.391, "eval_steps_per_second": 35.598, "step": 45000 }, { "epoch": 5.48995739500913, "grad_norm": 6.559889793395996, "learning_rate": 5.007718120805369e-06, "loss": 2.0447, "step": 45100 }, { "epoch": 5.502130249543518, "grad_norm": 6.235354423522949, "learning_rate": 4.974161073825503e-06, "loss": 2.0806, "step": 45200 }, { "epoch": 5.514303104077906, "grad_norm": 7.230030536651611, "learning_rate": 4.940604026845638e-06, "loss": 2.0696, "step": 45300 }, { "epoch": 5.526475958612295, "grad_norm": 5.613503456115723, "learning_rate": 4.907046979865772e-06, "loss": 2.0662, "step": 45400 }, { "epoch": 5.538648813146683, "grad_norm": 5.988820552825928, "learning_rate": 4.873489932885906e-06, "loss": 2.0551, "step": 45500 }, { "epoch": 5.538648813146683, "eval_loss": 2.0472412109375, "eval_runtime": 6.7805, "eval_samples_per_second": 147.481, "eval_steps_per_second": 36.87, "step": 45500 }, { "epoch": 5.550821667681071, "grad_norm": 6.566047191619873, "learning_rate": 4.8399328859060404e-06, "loss": 2.0652, "step": 45600 }, { "epoch": 5.56299452221546, "grad_norm": 6.979294300079346, "learning_rate": 4.806375838926175e-06, "loss": 2.0409, "step": 45700 }, { "epoch": 5.575167376749848, "grad_norm": 6.474365234375, "learning_rate": 4.772818791946309e-06, "loss": 2.0562, "step": 45800 }, { "epoch": 5.587340231284236, "grad_norm": 6.100124835968018, "learning_rate": 4.739261744966443e-06, "loss": 2.0448, "step": 45900 }, { "epoch": 5.599513085818624, "grad_norm": 6.383643627166748, "learning_rate": 4.706040268456376e-06, "loss": 2.0798, "step": 46000 }, { "epoch": 5.599513085818624, "eval_loss": 2.042715311050415, "eval_runtime": 6.7981, "eval_samples_per_second": 147.1, "eval_steps_per_second": 36.775, "step": 46000 }, { "epoch": 5.611685940353013, "grad_norm": 6.848605632781982, "learning_rate": 4.67248322147651e-06, "loss": 2.0615, "step": 46100 }, { "epoch": 5.6238587948874015, "grad_norm": 6.921677589416504, "learning_rate": 4.638926174496644e-06, "loss": 2.0888, "step": 46200 }, { "epoch": 5.636031649421789, "grad_norm": 6.901805400848389, "learning_rate": 4.6053691275167785e-06, "loss": 2.0552, "step": 46300 }, { "epoch": 5.648204503956178, "grad_norm": 6.497274398803711, "learning_rate": 4.571812080536913e-06, "loss": 2.0596, "step": 46400 }, { "epoch": 5.660377358490566, "grad_norm": 6.1705803871154785, "learning_rate": 4.538255033557047e-06, "loss": 2.0352, "step": 46500 }, { "epoch": 5.660377358490566, "eval_loss": 2.0392038822174072, "eval_runtime": 6.8175, "eval_samples_per_second": 146.681, "eval_steps_per_second": 36.67, "step": 46500 }, { "epoch": 5.672550213024954, "grad_norm": 6.3149847984313965, "learning_rate": 4.504697986577181e-06, "loss": 2.0782, "step": 46600 }, { "epoch": 5.684723067559343, "grad_norm": 5.7811760902404785, "learning_rate": 4.471140939597316e-06, "loss": 2.0745, "step": 46700 }, { "epoch": 5.696895922093731, "grad_norm": 6.381850719451904, "learning_rate": 4.43758389261745e-06, "loss": 2.0967, "step": 46800 }, { "epoch": 5.709068776628119, "grad_norm": 6.7904534339904785, "learning_rate": 4.404026845637584e-06, "loss": 2.048, "step": 46900 }, { "epoch": 5.721241631162508, "grad_norm": 6.390072822570801, "learning_rate": 4.370469798657718e-06, "loss": 2.0458, "step": 47000 }, { "epoch": 5.721241631162508, "eval_loss": 2.0331013202667236, "eval_runtime": 6.7556, "eval_samples_per_second": 148.026, "eval_steps_per_second": 37.006, "step": 47000 }, { "epoch": 5.733414485696896, "grad_norm": 6.4294514656066895, "learning_rate": 4.336912751677853e-06, "loss": 2.0555, "step": 47100 }, { "epoch": 5.7455873402312845, "grad_norm": 7.039945602416992, "learning_rate": 4.303355704697987e-06, "loss": 2.0973, "step": 47200 }, { "epoch": 5.757760194765672, "grad_norm": 6.919515132904053, "learning_rate": 4.269798657718121e-06, "loss": 2.0572, "step": 47300 }, { "epoch": 5.769933049300061, "grad_norm": 6.846578598022461, "learning_rate": 4.2362416107382554e-06, "loss": 2.0703, "step": 47400 }, { "epoch": 5.78210590383445, "grad_norm": 6.899037837982178, "learning_rate": 4.20268456375839e-06, "loss": 2.0382, "step": 47500 }, { "epoch": 5.78210590383445, "eval_loss": 2.0307412147521973, "eval_runtime": 6.8182, "eval_samples_per_second": 146.666, "eval_steps_per_second": 36.667, "step": 47500 }, { "epoch": 5.794278758368837, "grad_norm": 5.726818084716797, "learning_rate": 4.169127516778524e-06, "loss": 2.0595, "step": 47600 }, { "epoch": 5.806451612903226, "grad_norm": 7.426904201507568, "learning_rate": 4.135570469798658e-06, "loss": 2.0605, "step": 47700 }, { "epoch": 5.818624467437614, "grad_norm": 6.416141986846924, "learning_rate": 4.1020134228187925e-06, "loss": 2.071, "step": 47800 }, { "epoch": 5.830797321972002, "grad_norm": 6.170881748199463, "learning_rate": 4.068456375838927e-06, "loss": 2.0601, "step": 47900 }, { "epoch": 5.842970176506391, "grad_norm": 5.913904666900635, "learning_rate": 4.034899328859061e-06, "loss": 2.0663, "step": 48000 }, { "epoch": 5.842970176506391, "eval_loss": 2.0260586738586426, "eval_runtime": 6.7968, "eval_samples_per_second": 147.127, "eval_steps_per_second": 36.782, "step": 48000 }, { "epoch": 5.855143031040779, "grad_norm": 6.9575090408325195, "learning_rate": 4.0013422818791944e-06, "loss": 2.0487, "step": 48100 }, { "epoch": 5.8673158855751675, "grad_norm": 7.018653392791748, "learning_rate": 3.967785234899329e-06, "loss": 2.0836, "step": 48200 }, { "epoch": 5.879488740109556, "grad_norm": 6.9810285568237305, "learning_rate": 3.934228187919463e-06, "loss": 2.0645, "step": 48300 }, { "epoch": 5.891661594643944, "grad_norm": 5.732436656951904, "learning_rate": 3.900671140939597e-06, "loss": 2.0682, "step": 48400 }, { "epoch": 5.9038344491783326, "grad_norm": 6.543402671813965, "learning_rate": 3.8671140939597315e-06, "loss": 2.0797, "step": 48500 }, { "epoch": 5.9038344491783326, "eval_loss": 2.0245697498321533, "eval_runtime": 6.7746, "eval_samples_per_second": 147.609, "eval_steps_per_second": 36.902, "step": 48500 }, { "epoch": 5.91600730371272, "grad_norm": 6.355215072631836, "learning_rate": 3.833557046979866e-06, "loss": 2.0386, "step": 48600 }, { "epoch": 5.928180158247109, "grad_norm": 5.7379889488220215, "learning_rate": 3.8000000000000005e-06, "loss": 2.0498, "step": 48700 }, { "epoch": 5.940353012781498, "grad_norm": 5.857077121734619, "learning_rate": 3.7664429530201347e-06, "loss": 2.061, "step": 48800 }, { "epoch": 5.952525867315885, "grad_norm": 7.078189373016357, "learning_rate": 3.732885906040269e-06, "loss": 2.0569, "step": 48900 }, { "epoch": 5.964698721850274, "grad_norm": 6.31903600692749, "learning_rate": 3.6993288590604033e-06, "loss": 2.0755, "step": 49000 }, { "epoch": 5.964698721850274, "eval_loss": 2.0195415019989014, "eval_runtime": 6.7847, "eval_samples_per_second": 147.392, "eval_steps_per_second": 36.848, "step": 49000 }, { "epoch": 5.976871576384662, "grad_norm": 6.295201778411865, "learning_rate": 3.6657718120805375e-06, "loss": 2.0393, "step": 49100 }, { "epoch": 5.9890444309190505, "grad_norm": 5.829520225524902, "learning_rate": 3.6322147651006714e-06, "loss": 2.0839, "step": 49200 }, { "epoch": 6.001217285453439, "grad_norm": 6.653756141662598, "learning_rate": 3.5986577181208056e-06, "loss": 2.0581, "step": 49300 }, { "epoch": 6.013390139987827, "grad_norm": 6.303423881530762, "learning_rate": 3.56510067114094e-06, "loss": 2.0524, "step": 49400 }, { "epoch": 6.0255629945222156, "grad_norm": 6.783233642578125, "learning_rate": 3.531543624161074e-06, "loss": 2.0284, "step": 49500 }, { "epoch": 6.0255629945222156, "eval_loss": 2.013944387435913, "eval_runtime": 6.7878, "eval_samples_per_second": 147.323, "eval_steps_per_second": 36.831, "step": 49500 }, { "epoch": 6.037735849056604, "grad_norm": 5.857462406158447, "learning_rate": 3.4979865771812084e-06, "loss": 2.0229, "step": 49600 }, { "epoch": 6.049908703590992, "grad_norm": 6.777635097503662, "learning_rate": 3.4644295302013427e-06, "loss": 1.9982, "step": 49700 }, { "epoch": 6.062081558125381, "grad_norm": 7.1341328620910645, "learning_rate": 3.430872483221477e-06, "loss": 2.0211, "step": 49800 }, { "epoch": 6.074254412659768, "grad_norm": 6.320338249206543, "learning_rate": 3.3973154362416112e-06, "loss": 2.0137, "step": 49900 }, { "epoch": 6.086427267194157, "grad_norm": 6.523722171783447, "learning_rate": 3.3640939597315437e-06, "loss": 2.0073, "step": 50000 }, { "epoch": 6.086427267194157, "eval_loss": 2.0087661743164062, "eval_runtime": 6.731, "eval_samples_per_second": 148.565, "eval_steps_per_second": 37.141, "step": 50000 }, { "epoch": 6.098600121728546, "grad_norm": 6.2543559074401855, "learning_rate": 3.330536912751678e-06, "loss": 2.05, "step": 50100 }, { "epoch": 6.1107729762629335, "grad_norm": 6.838403701782227, "learning_rate": 3.2969798657718123e-06, "loss": 2.0041, "step": 50200 }, { "epoch": 6.122945830797322, "grad_norm": 6.734765529632568, "learning_rate": 3.2634228187919465e-06, "loss": 2.0144, "step": 50300 }, { "epoch": 6.13511868533171, "grad_norm": 7.506516933441162, "learning_rate": 3.2298657718120808e-06, "loss": 2.0238, "step": 50400 }, { "epoch": 6.1472915398660986, "grad_norm": 7.153513431549072, "learning_rate": 3.196308724832215e-06, "loss": 2.0032, "step": 50500 }, { "epoch": 6.1472915398660986, "eval_loss": 2.0054242610931396, "eval_runtime": 6.8452, "eval_samples_per_second": 146.089, "eval_steps_per_second": 36.522, "step": 50500 }, { "epoch": 6.159464394400487, "grad_norm": 5.951141834259033, "learning_rate": 3.1627516778523493e-06, "loss": 2.0768, "step": 50600 }, { "epoch": 6.171637248934875, "grad_norm": 6.877615928649902, "learning_rate": 3.1291946308724836e-06, "loss": 2.0123, "step": 50700 }, { "epoch": 6.183810103469264, "grad_norm": 6.209372520446777, "learning_rate": 3.095637583892618e-06, "loss": 2.0153, "step": 50800 }, { "epoch": 6.195982958003652, "grad_norm": 6.799842834472656, "learning_rate": 3.062080536912752e-06, "loss": 1.9955, "step": 50900 }, { "epoch": 6.20815581253804, "grad_norm": 6.479254722595215, "learning_rate": 3.0285234899328864e-06, "loss": 2.0315, "step": 51000 }, { "epoch": 6.20815581253804, "eval_loss": 2.0038652420043945, "eval_runtime": 6.8012, "eval_samples_per_second": 147.033, "eval_steps_per_second": 36.758, "step": 51000 }, { "epoch": 6.220328667072429, "grad_norm": 6.269389629364014, "learning_rate": 2.9949664429530206e-06, "loss": 1.9839, "step": 51100 }, { "epoch": 6.2325015216068165, "grad_norm": 7.240963935852051, "learning_rate": 2.9614093959731545e-06, "loss": 2.0155, "step": 51200 }, { "epoch": 6.244674376141205, "grad_norm": 5.774966716766357, "learning_rate": 2.9278523489932887e-06, "loss": 2.0198, "step": 51300 }, { "epoch": 6.256847230675594, "grad_norm": 6.272314071655273, "learning_rate": 2.894295302013423e-06, "loss": 2.0554, "step": 51400 }, { "epoch": 6.2690200852099816, "grad_norm": 9.089746475219727, "learning_rate": 2.8607382550335573e-06, "loss": 2.0408, "step": 51500 }, { "epoch": 6.2690200852099816, "eval_loss": 2.000591278076172, "eval_runtime": 6.7821, "eval_samples_per_second": 147.447, "eval_steps_per_second": 36.862, "step": 51500 }, { "epoch": 6.28119293974437, "grad_norm": 6.007697105407715, "learning_rate": 2.8271812080536915e-06, "loss": 2.0251, "step": 51600 }, { "epoch": 6.293365794278758, "grad_norm": 7.7493791580200195, "learning_rate": 2.793624161073826e-06, "loss": 2.0447, "step": 51700 }, { "epoch": 6.305538648813147, "grad_norm": 7.068716526031494, "learning_rate": 2.76006711409396e-06, "loss": 2.0377, "step": 51800 }, { "epoch": 6.317711503347535, "grad_norm": 6.732091903686523, "learning_rate": 2.7265100671140943e-06, "loss": 2.0131, "step": 51900 }, { "epoch": 6.329884357881923, "grad_norm": 6.7231125831604, "learning_rate": 2.693288590604027e-06, "loss": 2.0385, "step": 52000 }, { "epoch": 6.329884357881923, "eval_loss": 1.9973669052124023, "eval_runtime": 6.7984, "eval_samples_per_second": 147.093, "eval_steps_per_second": 36.773, "step": 52000 }, { "epoch": 6.342057212416312, "grad_norm": 6.017531394958496, "learning_rate": 2.659731543624161e-06, "loss": 2.0407, "step": 52100 }, { "epoch": 6.3542300669507, "grad_norm": 5.93875789642334, "learning_rate": 2.6261744966442954e-06, "loss": 2.0368, "step": 52200 }, { "epoch": 6.366402921485088, "grad_norm": 6.382920265197754, "learning_rate": 2.5926174496644296e-06, "loss": 2.036, "step": 52300 }, { "epoch": 6.378575776019477, "grad_norm": 6.723759651184082, "learning_rate": 2.559060402684564e-06, "loss": 1.9914, "step": 52400 }, { "epoch": 6.3907486305538646, "grad_norm": 8.295475959777832, "learning_rate": 2.525503355704698e-06, "loss": 2.0401, "step": 52500 }, { "epoch": 6.3907486305538646, "eval_loss": 1.9946650266647339, "eval_runtime": 6.8495, "eval_samples_per_second": 145.995, "eval_steps_per_second": 36.499, "step": 52500 }, { "epoch": 6.402921485088253, "grad_norm": 6.045047283172607, "learning_rate": 2.4919463087248324e-06, "loss": 2.0287, "step": 52600 }, { "epoch": 6.415094339622642, "grad_norm": 7.3694000244140625, "learning_rate": 2.4583892617449667e-06, "loss": 2.0318, "step": 52700 }, { "epoch": 6.42726719415703, "grad_norm": 6.970037460327148, "learning_rate": 2.424832214765101e-06, "loss": 2.0352, "step": 52800 }, { "epoch": 6.439440048691418, "grad_norm": 7.87092399597168, "learning_rate": 2.391275167785235e-06, "loss": 2.0522, "step": 52900 }, { "epoch": 6.451612903225806, "grad_norm": 6.341009616851807, "learning_rate": 2.357718120805369e-06, "loss": 2.0717, "step": 53000 }, { "epoch": 6.451612903225806, "eval_loss": 1.9915155172348022, "eval_runtime": 6.8145, "eval_samples_per_second": 146.746, "eval_steps_per_second": 36.686, "step": 53000 }, { "epoch": 6.463785757760195, "grad_norm": 7.210479736328125, "learning_rate": 2.3241610738255038e-06, "loss": 2.0154, "step": 53100 }, { "epoch": 6.475958612294583, "grad_norm": 8.30247688293457, "learning_rate": 2.290604026845638e-06, "loss": 2.0242, "step": 53200 }, { "epoch": 6.488131466828971, "grad_norm": 5.9992570877075195, "learning_rate": 2.2573825503355705e-06, "loss": 2.0372, "step": 53300 }, { "epoch": 6.50030432136336, "grad_norm": 6.450936317443848, "learning_rate": 2.2238255033557048e-06, "loss": 2.0267, "step": 53400 }, { "epoch": 6.512477175897748, "grad_norm": 6.037837982177734, "learning_rate": 2.190268456375839e-06, "loss": 2.0178, "step": 53500 }, { "epoch": 6.512477175897748, "eval_loss": 1.9894185066223145, "eval_runtime": 6.8572, "eval_samples_per_second": 145.831, "eval_steps_per_second": 36.458, "step": 53500 }, { "epoch": 6.524650030432136, "grad_norm": 6.875925064086914, "learning_rate": 2.1567114093959733e-06, "loss": 2.0354, "step": 53600 }, { "epoch": 6.536822884966525, "grad_norm": 6.961463451385498, "learning_rate": 2.1231543624161076e-06, "loss": 2.06, "step": 53700 }, { "epoch": 6.548995739500913, "grad_norm": 5.773210525512695, "learning_rate": 2.089597315436242e-06, "loss": 2.0167, "step": 53800 }, { "epoch": 6.561168594035301, "grad_norm": 6.747873783111572, "learning_rate": 2.056040268456376e-06, "loss": 1.9882, "step": 53900 }, { "epoch": 6.57334144856969, "grad_norm": 6.432974338531494, "learning_rate": 2.0224832214765104e-06, "loss": 2.0029, "step": 54000 }, { "epoch": 6.57334144856969, "eval_loss": 1.9841845035552979, "eval_runtime": 6.8372, "eval_samples_per_second": 146.258, "eval_steps_per_second": 36.564, "step": 54000 }, { "epoch": 6.585514303104078, "grad_norm": 6.159907341003418, "learning_rate": 1.9889261744966446e-06, "loss": 2.0454, "step": 54100 }, { "epoch": 6.597687157638466, "grad_norm": 7.004731178283691, "learning_rate": 1.955369127516779e-06, "loss": 2.0011, "step": 54200 }, { "epoch": 6.609860012172854, "grad_norm": 7.388941764831543, "learning_rate": 1.9218120805369127e-06, "loss": 2.0446, "step": 54300 }, { "epoch": 6.622032866707243, "grad_norm": 7.399050235748291, "learning_rate": 1.888255033557047e-06, "loss": 2.0265, "step": 54400 }, { "epoch": 6.634205721241631, "grad_norm": 6.445584297180176, "learning_rate": 1.8546979865771813e-06, "loss": 2.0124, "step": 54500 }, { "epoch": 6.634205721241631, "eval_loss": 1.9837737083435059, "eval_runtime": 6.903, "eval_samples_per_second": 144.864, "eval_steps_per_second": 36.216, "step": 54500 }, { "epoch": 6.646378575776019, "grad_norm": 6.1334967613220215, "learning_rate": 1.8211409395973155e-06, "loss": 2.0495, "step": 54600 }, { "epoch": 6.658551430310408, "grad_norm": 6.132894992828369, "learning_rate": 1.7875838926174498e-06, "loss": 2.0308, "step": 54700 }, { "epoch": 6.6707242848447965, "grad_norm": 7.038134574890137, "learning_rate": 1.7540268456375839e-06, "loss": 2.0532, "step": 54800 }, { "epoch": 6.682897139379184, "grad_norm": 6.755254745483398, "learning_rate": 1.7204697986577181e-06, "loss": 2.0178, "step": 54900 }, { "epoch": 6.695069993913573, "grad_norm": 6.841146945953369, "learning_rate": 1.6869127516778524e-06, "loss": 2.0442, "step": 55000 }, { "epoch": 6.695069993913573, "eval_loss": 1.9824799299240112, "eval_runtime": 6.8019, "eval_samples_per_second": 147.018, "eval_steps_per_second": 36.755, "step": 55000 }, { "epoch": 6.707242848447961, "grad_norm": 6.4666547775268555, "learning_rate": 1.6533557046979867e-06, "loss": 2.0205, "step": 55100 }, { "epoch": 6.719415702982349, "grad_norm": 7.502538204193115, "learning_rate": 1.619798657718121e-06, "loss": 2.0261, "step": 55200 }, { "epoch": 6.731588557516738, "grad_norm": 7.378790378570557, "learning_rate": 1.5862416107382552e-06, "loss": 2.0288, "step": 55300 }, { "epoch": 6.743761412051126, "grad_norm": 7.264867305755615, "learning_rate": 1.5526845637583892e-06, "loss": 2.0187, "step": 55400 }, { "epoch": 6.755934266585514, "grad_norm": 7.020994663238525, "learning_rate": 1.5191275167785235e-06, "loss": 2.038, "step": 55500 }, { "epoch": 6.755934266585514, "eval_loss": 1.9808002710342407, "eval_runtime": 6.8401, "eval_samples_per_second": 146.197, "eval_steps_per_second": 36.549, "step": 55500 }, { "epoch": 6.768107121119902, "grad_norm": 6.773026943206787, "learning_rate": 1.4855704697986578e-06, "loss": 2.0144, "step": 55600 }, { "epoch": 6.780279975654291, "grad_norm": 5.357457160949707, "learning_rate": 1.452013422818792e-06, "loss": 2.0353, "step": 55700 }, { "epoch": 6.7924528301886795, "grad_norm": 6.2290873527526855, "learning_rate": 1.4184563758389263e-06, "loss": 2.0328, "step": 55800 }, { "epoch": 6.804625684723067, "grad_norm": 6.145375728607178, "learning_rate": 1.3848993288590606e-06, "loss": 2.0438, "step": 55900 }, { "epoch": 6.816798539257456, "grad_norm": 6.537805080413818, "learning_rate": 1.3513422818791946e-06, "loss": 2.0634, "step": 56000 }, { "epoch": 6.816798539257456, "eval_loss": 1.9801044464111328, "eval_runtime": 6.8399, "eval_samples_per_second": 146.202, "eval_steps_per_second": 36.55, "step": 56000 } ], "logging_steps": 100, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.080261694721884e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }